git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11230 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2014-01-13 15:51:54 +00:00 · 2014-01-13 15:51:54 +00:00 · bbffbe06a2
parent 427e90c151
commit bbffbe06a2
10 changed files with 438 additions and 193 deletions
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@ -64,7 +64,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
       $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
       $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
       $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
-       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
+       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
 CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
       $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@ -109,7 +110,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
       $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
       $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
       $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
-       $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h
+       $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
       $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h
 all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@ -644,6 +646,18 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(O
 # Compile lal_lj_coul_msm_ext.cpp into its object file; the object is rebuilt
 # whenever $(ALL_H) or one of the headers listed here changes.
 $(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) \
                                   lal_lj_coul_msm.h \
                                   lal_lj_coul_msm_ext.cpp \
                                   lal_base_charge.h
 	$(CUDR) -I$(OBJ_DIR) -c lal_lj_coul_msm_ext.cpp -o $@
 # Build the lj_gromacs cubin from lal_lj_gromacs.cu with NV_KERNEL defined.
 # $< is the first prerequisite, i.e. the .cu source file.
 $(OBJ_DIR)/lj_gromacs.cubin: lal_lj_gromacs.cu \
                              lal_precision.h \
                              lal_preprocessor.h
 	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
 # Generate lj_gromacs_cubin.h from the compiled cubin by running $(BIN2C)
 # with "-c -n lj_gromacs" and capturing its stdout.
 # The cubin is listed once (the duplicate prerequisite was redundant).
 # Output is staged in $@.tmp and renamed only on success, so a failing
 # $(BIN2C) cannot leave a truncated header that make treats as up to date.
 $(OBJ_DIR)/lj_gromacs_cubin.h: $(OBJ_DIR)/lj_gromacs.cubin
 	$(BIN2C) -c -n lj_gromacs $< > $@.tmp && mv -f $@.tmp $@
 # Compile lal_lj_gromacs.cpp. The generated lj_gromacs_cubin.h is a
 # prerequisite and $(OBJ_DIR) is on the include path; lal_base_atomic.o is
 # also listed as a prerequisite, so it is brought up to date first.
 $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) \
                              lal_lj_gromacs.h \
                              lal_lj_gromacs.cpp \
                              $(OBJ_DIR)/lj_gromacs_cubin.h \
                              $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -I$(OBJ_DIR) -c lal_lj_gromacs.cpp -o $@
 # Compile lal_lj_gromacs_ext.cpp into its object file; rebuilt when $(ALL_H),
 # lal_lj_gromacs.h or lal_base_atomic.h change.
 $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) \
                                  lal_lj_gromacs.h \
                                  lal_lj_gromacs_ext.cpp \
                                  lal_base_atomic.h
 	$(CUDR) -I$(OBJ_DIR) -c lal_lj_gromacs_ext.cpp -o $@
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda 
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@ -53,7 +53,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
       $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
       $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
       $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
-       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
+       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
       $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@ -75,7 +76,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
       $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
       $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
       $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
-       $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h
+       $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
       $(OBJ_DIR)/lj_gromacs_cl.h
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -460,6 +462,15 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp  $(
 # Compile lal_lj_coul_msm_ext.cpp with the $(OCL) command line; the object is
 # rebuilt whenever $(ALL_H) or one of the headers listed here changes.
 $(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) \
                                   lal_lj_coul_msm.h \
                                   lal_lj_coul_msm_ext.cpp \
                                   lal_base_charge.h
 	$(OCL) -I$(OBJ_DIR) -c lal_lj_coul_msm_ext.cpp -o $@
 # Produce lj_gromacs_cl.h by running geryon/file_to_cstr.sh on $(PRE1_H)
 # followed by the kernel source. $< is lal_lj_gromacs.cu (first prerequisite)
 # and $@ is the generated header.
 $(OBJ_DIR)/lj_gromacs_cl.h: lal_lj_gromacs.cu \
                             $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh lj_gromacs $(PRE1_H) $< $@
 # Compile lal_lj_gromacs.cpp with the $(OCL) command line. The generated
 # lj_gromacs_cl.h is a prerequisite and $(OBJ_DIR) is on the include path;
 # lal_base_atomic.o is also listed, so it is brought up to date first.
 # The generated header is listed once (the duplicate prerequisite was redundant).
 $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR)
 # Compile lal_lj_gromacs_ext.cpp with the $(OCL) command line; rebuilt when
 # $(ALL_H), lal_lj_gromacs.h or lal_base_atomic.h change.
 $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) \
                                  lal_lj_gromacs.h \
                                  lal_lj_gromacs_ext.cpp \
                                  lal_base_atomic.h
 	$(OCL) -I$(OBJ_DIR) -c lal_lj_gromacs_ext.cpp -o $@
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@ -194,12 +194,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
      for (int i=vstart; i<iend; i++)
        virial[j]+=engv[i];
      if (_vf_atom)
-        if (_ilist==NULL)
+        if (_ilist==NULL) {
          int ii=0;
          for (int i=vstart; i<iend; i++) 
-            vatom[i][j]+=engv[i];
+            vatom[ii++][j]+=engv[i];
-        else
+        } else {
          int ii=0;
          for (int i=vstart; i<iend; i++) 
-            vatom[_ilist[i]][j]+=engv[i];
+            vatom[_ilist[ii++]][j]+=engv[i];
        }
      vstart+=_inum;
      iend+=_inum;
    }
@ -218,8 +221,9 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
    return energy_virial(eatom,vatom,virial);
  double evdwl=0.0;
-  int vstart=0, iend=_inum*2;
+  int vstart=0, iend=_inum;
  if (_eflag) {
    iend=_inum*2;
    for (int i=0; i<_inum; i++)
      evdwl+=engv[i];
    for (int i=_inum; i<iend; i++)
@ -244,12 +248,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
      for (int i=vstart; i<iend; i++)
        virial[j]+=engv[i];
      if (_vf_atom)
-        if (_ilist==NULL)
+        if (_ilist==NULL) {
          int ii=0;
          for (int i=vstart; i<iend; i++)
-              vatom[i][j]+=engv[i];
+            vatom[ii++][j]+=engv[i];
-        else
+        } else {
          int ii=0;
          for (int i=vstart; i<iend; i++)
-              vatom[_ilist[i]][j]+=engv[i];
+            vatom[_ilist[ii++]][j]+=engv[i];
        }  
      vstart+=_inum;
      iend+=_inum;
    }
--- a/lib/gpu/lal_buck_coul_long.cpp
+++ b/lib/gpu/lal_buck_coul_long.cpp
@ -112,6 +112,7 @@ void BuckCoulLongT::clear() {
  coeff1.clear();
  coeff2.clear();
  cutsq.clear();
  sp_lj.clear();
  this->clear_atomic();
 }
--- a/lib/gpu/lal_coul_dsf.cpp
+++ b/lib/gpu/lal_coul_dsf.cpp
@ -10,7 +10,7 @@
 __________________________________________________________________________
    begin                : 8/15/2012
-    email                : nguyentdw@ornl.gov
+    email                : nguyentd@ornl.gov
 ***************************************************************************/
 #if defined(USE_OPENCL)
--- a/lib/gpu/lal_coul_dsf.h
+++ b/lib/gpu/lal_coul_dsf.h
@ -10,7 +10,7 @@
 __________________________________________________________________________
    begin                : 8/15/2012
-    email                : nguyentdw@ornl.gov
+    email                : nguyentd@ornl.gov
 ***************************************************************************/
 #ifndef LAL_LJ_DSF_H
--- a/lib/gpu/lal_sw.cpp
+++ b/lib/gpu/lal_sw.cpp
@ -16,7 +16,7 @@
 #if defined(USE_OPENCL)
 #include "sw_cl.h"
 #elif defined(USE_CUDART)
-const char *lj=0;
+const char *sw=0;
 #else
 #include "sw_cubin.h"
 #endif
@ -43,28 +43,15 @@ int SWT::bytes_per_atom(const int max_nbors) const {
 }
 template <class numtyp, class acctyp>
-int SWT::init(const int nlocal, const int nall, const int max_nbors, 
+int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors,
-              const double cell_size, const double gpu_split, FILE *_screen,
+           const double cell_size, const double gpu_split, FILE *_screen,
-              const double epsilon, const double sigma,
+           int* host_map, const int nelements, int*** host_elem2param, const int nparams, 
-              const double lambda, const double gamma,
+           const double* epsilon, const double* sigma,
-              const double costheta, const double biga,
+           const double* lambda, const double* gamma,
-              const double bigb, const double powerp,
+           const double* costheta, const double* biga,
-              const double powerq, const double cut, const double cutsq) {
+           const double* bigb, const double* powerp,
-
+           const double* powerq, const double* cut, const double* cutsq)
-  sw_epsilon=static_cast<numtyp>(epsilon);
+{
  sw_sigma=static_cast<numtyp>(sigma);
  sw_lambda=static_cast<numtyp>(lambda);
  sw_gamma=static_cast<numtyp>(gamma);
  sw_costheta=static_cast<numtyp>(costheta);
  sw_biga=static_cast<numtyp>(biga);
  sw_bigb=static_cast<numtyp>(bigb);
  sw_powerp=static_cast<numtyp>(powerp);
  sw_powerq=static_cast<numtyp>(powerq);
  sw_cut=static_cast<numtyp>(cut);
  sw_cutsq=static_cast<numtyp>(cutsq);
  if (sw_cutsq>=sw_cut*sw_cut) 
    sw_cutsq=sw_cut*sw_cut-1e-4;
  int success;
  success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
                           _screen,sw,"k_sw","k_sw_three_center",
@ -73,10 +60,93 @@ int SWT::init(const int nlocal, const int nall, const int max_nbors,
    return success;
  // If atom type constants fit in shared memory use fast kernel
-  shared_types=true;
+  int lj_types=ntypes;
  shared_types=false;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;
  _nparams = nparams;
  _nelements = nelements;
  UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
                             UCL_WRITE_ONLY);
  for (int i=0; i<nparams; i++) {
    dview[i].x=(numtyp)0; 
    dview[i].y=(numtyp)0;
    dview[i].z=(numtyp)0; 
    dview[i].w=(numtyp)0;
  }
  // pack coefficients into arrays
  sw1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
    dview[i].x=static_cast<numtyp>(epsilon[i]);
    dview[i].y=static_cast<numtyp>(sigma[i]);
    dview[i].z=static_cast<numtyp>(lambda[i]);
    dview[i].w=static_cast<numtyp>(gamma[i]);
  }
  ucl_copy(sw1,dview,false);
  sw1_tex.get_texture(*(this->pair_program),"sw1_tex");
  sw1_tex.bind_float(sw1,4);
  sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
    dview[i].x=static_cast<numtyp>(biga[i]);
    dview[i].y=static_cast<numtyp>(bigb[i]);
    dview[i].z=static_cast<numtyp>(powerp[i]);
    dview[i].w=static_cast<numtyp>(powerq[i]);
  }
  ucl_copy(sw2,dview,false);
  sw2_tex.get_texture(*(this->pair_program),"sw2_tex");
  sw2_tex.bind_float(sw2,4);
  sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
    dview[i].x=static_cast<numtyp>(cut[i]);
    dview[i].y=static_cast<numtyp>(cutsq[i]);
    dview[i].z=static_cast<numtyp>(costheta[i]);
    dview[i].w=(numtyp)0;
  }
  ucl_copy(sw3,dview,false);
  sw3_tex.get_texture(*(this->pair_program),"sw3_tex");
  sw3_tex.bind_float(sw3,4);
  UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
                           *(this->ucl_device), UCL_WRITE_ONLY);
  elem2param.alloc(nelements*nelements*nelements,*(this->ucl_device),
                   UCL_READ_ONLY);
  for (int i = 0; i < nelements; i++)
    for (int j = 0; j < nelements; j++)
      for (int k = 0; k < nelements; k++) {
         int idx = i*nelements*nelements+j*nelements+k;
         dview_elem2param[idx] = host_elem2param[i][j][k];
      }
  ucl_copy(elem2param,dview_elem2param,false);
  UCL_H_Vec<int> dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY);
  for (int i = 0; i < lj_types; i++)
    dview_map[i] = host_map[i];
  map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY);
  ucl_copy(map,dview_map,false);
  _allocated=true;
-  this->_max_bytes=0;
+  this->_max_bytes=sw1.row_bytes()+sw2.row_bytes()+sw3.row_bytes()+
    map.row_bytes()+elem2param.row_bytes();
  return 0;
 }
@ -86,6 +156,11 @@ void SWT::clear() {
    return;
  _allocated=false;
  sw1.clear();
  sw2.clear();
  sw3.clear();
  map.clear();
  elem2param.clear();
  this->clear_atomic();
 }
@ -121,22 +196,23 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  this->k_pair.set_size(GX,BX);
-  this->k_pair.run(&this->atom->x, &this->nbor->dev_nbor, 
+  this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, 
-                   &this->_nbor_data->begin(), &this->ans->force,
+                   &map, &elem2param, &_nelements,
-                   &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, 
+                   &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
-                   &this->_threads_per_atom, &sw_cut, &sw_epsilon, &sw_sigma,
+                   &this->ans->force, &this->ans->engv, 
-                   &sw_biga, &sw_bigb, &sw_powerp, &sw_powerq, &sw_cutsq);
+                   &eflag, &vflag, &ainum, &nbor_pitch, 
                   &this->_threads_per_atom);
  BX=this->block_size();
  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                           (BX/(KTHREADS*JTHREADS)))); 
  this->k_three_center.set_size(GX,BX);
-  this->k_three_center.run(&this->atom->x, &this->nbor->dev_nbor, 
+  this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, 
-                   &this->_nbor_data->begin(), &this->ans->force,
+                           &map, &elem2param, &_nelements,
-                   &this->ans->engv, &eflag, &vflag, &ainum, 
+                           &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
-                   &nbor_pitch, &this->_threads_per_atom, &evatom,
+                           &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, 
-                   &sw_cut, &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
+                           &nbor_pitch, &this->_threads_per_atom, &evatom);
-                   &sw_costheta, &sw_cutsq);
+
  Answer<numtyp,acctyp> *end_ans;
  #ifdef THREE_CONCURRENT
  end_ans=this->ans2;
@ -145,20 +221,20 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
  #endif
  if (evatom!=0) {
    this->k_three_end_vatom.set_size(GX,BX);
-    this->k_three_end_vatom.run(&this->atom->x, &this->nbor->dev_nbor, 
+    this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, 
-                          &this->_nbor_data->begin(), &end_ans->force,
+                          &map, &elem2param, &_nelements, 
-                          &end_ans->engv, &eflag, &vflag, &ainum, 
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
-                          &nbor_pitch, &this->_threads_per_atom, &sw_cut, 
+                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
+                          &nbor_pitch, &this->_threads_per_atom);
-                          &sw_costheta, &sw_cutsq);
+
  } else {
    this->k_three_end.set_size(GX,BX);
-    this->k_three_end.run(&this->atom->x, &this->nbor->dev_nbor, 
+    this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, 
-                          &this->_nbor_data->begin(), &end_ans->force,
+                          &map, &elem2param, &_nelements, 
-                          &end_ans->engv, &eflag, &vflag, &ainum, 
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
-                          &nbor_pitch, &this->_threads_per_atom, &sw_cut, 
+                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, 
-                          &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
+                          &nbor_pitch, &this->_threads_per_atom);
-                          &sw_costheta, &sw_cutsq);
+
  }
  this->time_pair.stop();
 }
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@ -15,13 +15,24 @@
 // Selects how pos_tex and sw1_tex/sw2_tex/sw3_tex are defined for this build.
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
 // NV_KERNEL build: the four names are declared as texture references; the
 // texel type depends on whether _DOUBLE_DOUBLE is defined.
 #ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float4> sw1_tex;
 texture<float4> sw2_tex;
 texture<float4> sw3_tex;
 #else
 // _DOUBLE_DOUBLE defined: int4 texels are declared instead of float4.
 texture<int4,1> pos_tex;
 texture<int4> sw1_tex;
 texture<int4> sw2_tex;
 texture<int4> sw3_tex;
 #endif
 #else
 // Non-NV_KERNEL build: the *_tex names are plain macro aliases for the
 // identifiers x_, sw1, sw2 and sw3.
 #define pos_tex x_
 #define sw1_tex sw1
 #define sw2_tex sw2
 #define sw3_tex sw3
 #endif
 // NOTE(review): named THIRD, but the literal is ~2/3 (0.66666667), not 1/3 —
 // confirm the intended factor against the places this macro is used.
 #define THIRD (numtyp)0.66666667
@ -60,15 +71,15 @@ texture<int4,1> pos_tex;
    }                                                                       \
  }                                                                         \
  if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
    if (eflag>0) {                                                          \
-      *engv+=energy*(acctyp)0.5;                                            \
+      engv[ei]+=energy*(acctyp)0.5;                                         \
-      engv+=inum;                                                           \
+      ei+=inum;                                                             \
    }                                                                       \
    if (vflag>0) {                                                          \
      for (int i=0; i<6; i++) {                                             \
-        *engv+=virial[i]*(acctyp)0.5;                                       \
+        engv[ei]+=virial[i]*(acctyp)0.5;                                    \
-        engv+=inum;                                                         \
+        ei+=inum;                                                           \
      }                                                                     \
    }                                                                       \
    acctyp4 old=ans[ii];                                                    \
@ -97,15 +108,15 @@ texture<int4,1> pos_tex;
    }                                                                       \
  }                                                                         \
  if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
    if (eflag>0) {                                                          \
-      *engv+=energy*(acctyp)0.5;                                            \
+      engv[ei]+=energy*(acctyp)0.5;                                         \
-      engv+=inum;                                                           \
+      ei+=inum;                                                             \
    }                                                                       \
    if (vflag>0) {                                                          \
      for (int i=0; i<6; i++) {                                             \
-        *engv+=virial[i]*(acctyp)0.5;                                       \
+        engv[ei]+=virial[i]*(acctyp)0.5;                                    \
-        engv+=inum;                                                         \
+        ei+=inum;                                                           \
      }                                                                     \
    }                                                                       \
    acctyp4 old=ans[ii];                                                    \
@ -119,33 +130,19 @@ texture<int4,1> pos_tex;
 __kernel void k_sw(const __global numtyp4 *restrict x_,
                   const __global numtyp4 *restrict sw1,
                   const __global numtyp4 *restrict sw2,
                   const __global numtyp4 *restrict sw3,
                   const __global int *restrict map,
                   const __global int *restrict elem2param,
                   const int nelements,
                   const __global int * dev_nbor, 
                   const __global int * dev_packed, 
                   __global acctyp4 *restrict ans, 
                   __global acctyp *restrict engv, 
                   const int eflag, const int vflag, const int inum, 
-                   const int nbor_pitch, const int t_per_atom, 
+                   const int nbor_pitch, const int t_per_atom) {
                   const numtyp sw_cut, const numtyp sw_epsilon, 
                   const numtyp sw_sigma, const numtyp sw_biga, 
                   const numtyp sw_bigb, const numtyp sw_powerp, 
                   const numtyp sw_powerq, const numtyp sw_cutsq) {
  __local int n_stride;
  __local numtyp pre_sw_c1, pre_sw_c2, pre_sw_c3, pre_sw_c4;
  __local numtyp pre_sw_c5, pre_sw_c6;
  pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
            pow(sw_sigma,sw_powerp);
  pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
            pow(sw_sigma,sw_powerq);
  pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
            pow(sw_sigma,sw_powerp+(numtyp)1.0);
  pre_sw_c4=sw_biga*sw_epsilon*
            pow(sw_sigma,sw_powerq+(numtyp)1.0);
  pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
            pow(sw_sigma,sw_powerp);
  pre_sw_c6=sw_biga*sw_epsilon*
            pow(sw_sigma,sw_powerq);
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
@ -165,8 +162,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
              n_stride,list_end,nbor);
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //int iw=ix.w;
+    int itype=ix.w;
-    //int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+    itype=map[itype];
    for ( ; nbor<list_end; nbor+=n_stride) {
@ -174,7 +171,10 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
      j &= NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int mtype=itype+jx.w;
+      int jtype=jx.w;
      jtype=map[jtype];
      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
      // Compute r12
      numtyp delx = ix.x-jx.x;
@ -182,7 +182,31 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;
-      if (rsq<sw_cutsq) {
+      if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
        numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
        numtyp sw_epsilon=sw1_ijparam.x;
        numtyp sw_sigma=sw1_ijparam.y;
        numtyp4 sw2_ijparam; fetch4(sw2_ijparam,ijparam,sw2_tex);
        numtyp sw_biga=sw2_ijparam.x;
        numtyp sw_bigb=sw2_ijparam.y;
        numtyp sw_powerp=sw2_ijparam.z;
        numtyp sw_powerq=sw2_ijparam.w;
        numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
        numtyp sw_cut=sw3_ijparam.x;
        numtyp sw_cutsq=sw3_ijparam.y;
        numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
            pow(sw_sigma,sw_powerp);
        numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
            pow(sw_sigma,sw_powerq);
        numtyp pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
            pow(sw_sigma,sw_powerp+(numtyp)1.0);
        numtyp pre_sw_c4=sw_biga*sw_epsilon*
            pow(sw_sigma,sw_powerq+(numtyp)1.0);
        numtyp pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
            pow(sw_sigma,sw_powerp);
        numtyp pre_sw_c6=sw_biga*sw_epsilon*
            pow(sw_sigma,sw_powerq);
        numtyp r=ucl_sqrt(rsq);
        numtyp rp=ucl_powr(r,-sw_powerp);
        numtyp rq=ucl_powr(r,-sw_powerq);
@ -209,40 +233,41 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
          virial[5] += dely*delz*force;
        }
      }
    } // for nbor
    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                  ans,engv);
  } // if ii
 }
 #define threebody(delr1x, delr1y, delr1z, eflag, energy)                     \
 {                                                                            \
  numtyp r1 = ucl_sqrt(rsq1);                                                \
  numtyp rinvsq1 = ucl_recip(rsq1);                                          \
-  numtyp rainv1 = ucl_recip(r1 - sw_cut);                                \
+  numtyp rainv1 = ucl_recip(r1 - sw_cut_ij);                                 \
-  numtyp gsrainv1 = sw_sigma_gamma * rainv1;                             \
+  numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1;                              \
  numtyp gsrainvsq1 = gsrainv1*rainv1/r1;                                    \
  numtyp expgsrainv1 = ucl_exp(gsrainv1);                                    \
                                                                             \
  numtyp r2 = ucl_sqrt(rsq2);                                                \
  numtyp rinvsq2 = ucl_recip(rsq2);                                          \
-  numtyp rainv2 = ucl_recip(r2 - sw_cut);                                \
+  numtyp rainv2 = ucl_recip(r2 - sw_cut_ik);                                 \
-  numtyp gsrainv2 = sw_sigma_gamma * rainv2;                             \
+  numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2;                              \
  numtyp gsrainvsq2 = gsrainv2*rainv2/r2;                                    \
  numtyp expgsrainv2 = ucl_exp(gsrainv2);                                    \
                                                                             \
  numtyp rinv12 = ucl_recip(r1*r2);                                          \
  numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12;      \
-  numtyp delcs = cs - sw_costheta;                                       \
+  numtyp delcs = cs - sw_costheta_ijk;                                       \
  numtyp delcssq = delcs*delcs;                                              \
                                                                             \
  numtyp facexp = expgsrainv1*expgsrainv2;                                   \
                                                                             \
-  numtyp facrad = sw_lambda_epsilon * facexp*delcssq;                    \
+  numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq;                    \
  numtyp frad1 = facrad*gsrainvsq1;                                          \
  numtyp frad2 = facrad*gsrainvsq2;                                          \
-  numtyp facang = sw_lambda_epsilon2 * facexp*delcs;                     \
+  numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs;                     \
  numtyp facang12 = rinv12*facang;                                           \
  numtyp csfacang = cs*facang;                                               \
  numtyp csfac1 = rinvsq1*csfacang;                                          \
@ -273,26 +298,26 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
 {                                                                            \
  numtyp r1 = ucl_sqrt(rsq1);                                                \
  numtyp rinvsq1 = ucl_recip(rsq1);                                          \
-  numtyp rainv1 = ucl_recip(r1 - sw_cut);                                \
+  numtyp rainv1 = ucl_recip(r1 - sw_cut_ij);                                 \
-  numtyp gsrainv1 = sw_sigma_gamma * rainv1;                             \
+  numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1;                              \
  numtyp gsrainvsq1 = gsrainv1*rainv1/r1;                                    \
  numtyp expgsrainv1 = ucl_exp(gsrainv1);                                    \
                                                                             \
  numtyp r2 = ucl_sqrt(rsq2);                                                \
-  numtyp rainv2 = ucl_recip(r2 - sw_cut);                                \
+  numtyp rainv2 = ucl_recip(r2 - sw_cut_ik);                                 \
-  numtyp gsrainv2 = sw_sigma_gamma * rainv2;                             \
+  numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2;                              \
  numtyp expgsrainv2 = ucl_exp(gsrainv2);                                    \
                                                                             \
  numtyp rinv12 = ucl_recip(r1*r2);                                          \
  numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12;      \
-  numtyp delcs = cs - sw_costheta;                                       \
+  numtyp delcs = cs - sw_costheta_ijk;                                       \
  numtyp delcssq = delcs*delcs;                                              \
                                                                             \
  numtyp facexp = expgsrainv1*expgsrainv2;                                   \
                                                                             \
-  numtyp facrad = sw_lambda_epsilon * facexp*delcssq;                    \
+  numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq;                    \
  numtyp frad1 = facrad*gsrainvsq1;                                          \
-  numtyp facang = sw_lambda_epsilon2 * facexp*delcs;                     \
+  numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs;                     \
  numtyp facang12 = rinv12*facang;                                           \
  numtyp csfacang = cs*facang;                                               \
  numtyp csfac1 = rinvsq1*csfacang;                                          \
@ -303,24 +328,24 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
 }
 __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, 
                                const __global numtyp4 *restrict sw1,
                                const __global numtyp4 *restrict sw2,
                                const __global numtyp4 *restrict sw3,
                                const __global int *restrict map,
                                const __global int *restrict elem2param,
                                const int nelements,
                                const __global int * dev_nbor, 
                                const __global int * dev_packed, 
                                __global acctyp4 *restrict ans, 
                                __global acctyp *restrict engv, 
                                const int eflag, const int vflag, 
                                const int inum,  const int nbor_pitch, 
-                                const int t_per_atom, const int evatom,
+                                const int t_per_atom, const int evatom) {
                                const numtyp sw_cut, const numtyp sw_epsilon,
                                const numtyp sw_sigma, const numtyp sw_lambda,
                                const numtyp sw_gamma, const numtyp sw_costheta,
                                const numtyp sw_cutsq) {
  __local int tpa_sq, n_stride;
  __local numtyp sw_sigma_gamma, sw_lambda_epsilon;
  __local numtyp sw_lambda_epsilon2;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  sw_sigma_gamma=sw_sigma*sw_gamma;
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
-  sw_lambda_epsilon=sw_lambda*sw_epsilon;
+  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
-  sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
+  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
  int tid, ii, offset;
  atom_info(tpa_sq,ii,tid,offset);
@ -344,8 +369,8 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
    int offset_k=tid & (t_per_atom-1);
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //int iw=ix.w;
+    int itype=ix.w; 
-    //int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+    itype=map[itype];
    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
@ -353,7 +378,8 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
      j &= NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int mtype=itype+jx.w;
+      int jtype=jx.w;
      jtype=map[jtype];
      // Compute r12
      numtyp delr1x = jx.x-ix.x;
@ -361,7 +387,16 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
      numtyp delr1z = jx.z-ix.z;
      numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
-      if (rsq1 > sw_cutsq) continue;
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
      numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
      if (rsq1 > sw3_ijparam.y) continue;
      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
      sw_sigma=sw1_ijparam.y;
      sw_gamma=sw1_ijparam.w;
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;
      const __global int *nbor_k=nbor_j-offset_j+offset_k;
      if (nbor_k<=nbor_j)
@ -372,11 +407,31 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
        k &= NEIGHMASK;
        numtyp4 kx; fetch4(kx,k,pos_tex);
        int ktype=kx.w;
        ktype=map[ktype];
        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
        numtyp delr2x = kx.x-ix.x;
        numtyp delr2y = kx.y-ix.y;
        numtyp delr2z = kx.z-ix.z;
        numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
-        if (rsq2 < sw_cutsq) {
+        if (rsq2 < sw3_ikparam.y) {   // sw_cutsq=sw3[ikparam].y;
          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
          sw_sigma=sw1_ikparam.y;
          sw_gamma=sw1_ikparam.w;
          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
          sw_cut_ik=sw3_ikparam.x;
          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
          sw_epsilon=sw1_ijkparam.x;
          sw_lambda=sw1_ijkparam.z;
          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
          sw_costheta_ijk=sw3_ijkparam.z;
          numtyp fjx, fjy, fjz, fkx, fky, fkz;
          threebody(delr1x,delr1y,delr1z,eflag,energy);
@ -403,23 +458,24 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
 }
 __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, 
                             const __global numtyp4 *restrict sw1,
                             const __global numtyp4 *restrict sw2,
                             const __global numtyp4 *restrict sw3,
                             const __global int *restrict map,
                             const __global int *restrict elem2param,
                             const int nelements,
                             const __global int * dev_nbor, 
                             const __global int * dev_packed, 
                             __global acctyp4 *restrict ans, 
                             __global acctyp *restrict engv, 
                             const int eflag, const int vflag, 
                             const int inum,  const int nbor_pitch, 
-                             const int t_per_atom, const numtyp sw_cut, 
+                             const int t_per_atom) {
                             const numtyp sw_epsilon, const numtyp sw_sigma,
                             const numtyp sw_lambda, const numtyp sw_gamma,
                             const numtyp sw_costheta, const numtyp sw_cutsq) {
  __local int tpa_sq, n_stride;
  __local numtyp sw_sigma_gamma, sw_lambda_epsilon;
  __local numtyp sw_lambda_epsilon2;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  sw_sigma_gamma=sw_sigma*sw_gamma;
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
-  sw_lambda_epsilon=sw_lambda*sw_epsilon;
+  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
-  sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
+  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
  int tid, ii, offset;
  atom_info(tpa_sq,ii,tid,offset);
@ -443,15 +499,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
    int offset_k=tid & (t_per_atom-1);
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //int iw=ix.w;
+    int itype=ix.w;
-    //int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+    itype=map[itype];
    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
      int j=*nbor_j;
      j &= NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int mtype=itype+jx.w;
+      int jtype=jx.w;
      jtype=map[jtype];
      // Compute r12
      numtyp delr1x = ix.x-jx.x;
@ -459,7 +516,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
      numtyp delr1z = ix.z-jx.z;
      numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
-      if (rsq1 > sw_cutsq) continue;
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
      numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
      if (rsq1 > sw3_ijparam.y) continue;
      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
      sw_sigma=sw1_ijparam.y;
      sw_gamma=sw1_ijparam.w;
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;
      const __global int *nbor_k=dev_nbor+j+nbor_pitch;
      int numk=*nbor_k; 
@ -478,15 +544,35 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
        int k=*nbor_k;
        k &= NEIGHMASK;
-        if (k == i)
+        if (k == i) continue;
          continue;
        numtyp4 kx; fetch4(kx,k,pos_tex);
        int ktype=kx.w;
        ktype=map[ktype];
        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype]; 
        numtyp delr2x = kx.x - jx.x;
        numtyp delr2y = kx.y - jx.y;
        numtyp delr2z = kx.z - jx.z;
        numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
-        if (rsq2 < sw_cutsq) {
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
        if (rsq2 < sw3_ikparam.y) {
          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
          sw_sigma=sw1_ikparam.y;
          sw_gamma=sw1_ikparam.w;
          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
          sw_cut_ik=sw3_ikparam.x;
          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
          sw_epsilon=sw1_ijkparam.x;
          sw_lambda=sw1_ijkparam.z;
          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
          sw_costheta_ijk=sw3_ijkparam.z;
          numtyp fjx, fjy, fjz;
          //if (evatom==0) {
            threebody_half(delr1x,delr1y,delr1z);
@ -513,23 +599,24 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
 }
 __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict sw1,
                             const __global numtyp4 *restrict sw2,
                             const __global numtyp4 *restrict sw3,
                             const __global int *restrict map,
                             const __global int *restrict elem2param,
                             const int nelements,
                             const __global int * dev_nbor, 
                             const __global int * dev_packed, 
                             __global acctyp4 *restrict ans, 
                             __global acctyp *restrict engv, 
                             const int eflag, const int vflag, 
                             const int inum,  const int nbor_pitch, 
-                             const int t_per_atom, const numtyp sw_cut, 
+                             const int t_per_atom) {
                             const numtyp sw_epsilon, const numtyp sw_sigma,
                             const numtyp sw_lambda, const numtyp sw_gamma,
                             const numtyp sw_costheta, const numtyp sw_cutsq) {
  __local int tpa_sq, n_stride;
  __local numtyp sw_sigma_gamma, sw_lambda_epsilon;
  __local numtyp sw_lambda_epsilon2;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  sw_sigma_gamma=sw_sigma*sw_gamma;
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
-  sw_lambda_epsilon=sw_lambda*sw_epsilon;
+  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
-  sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
+  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
  int tid, ii, offset;
  atom_info(tpa_sq,ii,tid,offset);
@ -553,15 +640,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
    int offset_k=tid & (t_per_atom-1);
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //int iw=ix.w;
+    int itype=ix.w;
-    //int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+    itype=map[itype];
    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
      int j=*nbor_j;
      j &= NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int mtype=itype+jx.w;
+      int jtype=jx.w;
      jtype=map[jtype];
      // Compute r12
      numtyp delr1x = ix.x-jx.x;
@ -569,7 +657,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
      numtyp delr1z = ix.z-jx.z;
      numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
-      if (rsq1 > sw_cutsq) continue;
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
      numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
      if (rsq1 > sw3_ijparam.y) continue;
      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
      sw_sigma=sw1_ijparam.y;
      sw_gamma=sw1_ijparam.w;
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;
      const __global int *nbor_k=dev_nbor+j+nbor_pitch;
      int numk=*nbor_k;
@ -588,15 +685,35 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
        int k=*nbor_k;
        k &= NEIGHMASK;
-        if (k == i)
+        if (k == i) continue;
          continue;
        numtyp4 kx; fetch4(kx,k,pos_tex);
        int ktype=kx.w;
        ktype=map[ktype];
        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype]; 
        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
        numtyp delr2x = kx.x - jx.x;
        numtyp delr2y = kx.y - jx.y;
        numtyp delr2z = kx.z - jx.z;
        numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
-        if (rsq2 < sw_cutsq) {
+
        if (rsq2 < sw3_ikparam.y) {
          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
          sw_sigma=sw1_ikparam.y;
          sw_gamma=sw1_ikparam.w;
          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
          sw_cut_ik=sw3_ikparam.x;
          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
          sw_epsilon=sw1_ijkparam.x;
          sw_lambda=sw1_ijkparam.z;
          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
          sw_costheta_ijk=sw3_ijkparam.z;
          numtyp fjx, fjy, fjz, fkx, fky, fkz;
          threebody(delr1x,delr1y,delr1z,eflag,energy);
--- a/lib/gpu/lal_sw.h
+++ b/lib/gpu/lal_sw.h
@ -37,13 +37,14 @@ class SW : public BaseThree<numtyp, acctyp> {
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
-  int init(const int nlocal, const int nall, const int max_nbors, 
+  int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, 
           const double cell_size, const double gpu_split, FILE *screen,
-           const double epsilon, const double sigma,
+           int* host_map, const int nelements, int*** host_elem2param, const int nparams, 
-           const double lambda, const double gamma,
+           const double* epsilon, const double* sigma,
-           const double costheta, const double biga,
+           const double* lambda, const double* gamma,
-           const double bigb, const double powerp,
+           const double* costheta, const double* biga,
-           const double powerq, const double cut, const double cutsq);
+           const double* bigb, const double* powerp,
           const double* powerq, const double* cut, const double* cutsq);
  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
@ -60,11 +61,26 @@ class SW : public BaseThree<numtyp, acctyp> {
  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;
  /// Number of atom types   
  int _lj_types;
  /// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma
  UCL_D_Vec<numtyp4> sw1;
  /// sw2.x = biga, sw2.y = bigb, sw2.z = powerp, sw2.w = powerq
  UCL_D_Vec<numtyp4> sw2;
  /// sw3.x = cut, sw3.y = cutsq, sw3.z = costheta
  UCL_D_Vec<numtyp4> sw3;
  UCL_D_Vec<int> elem2param;
  UCL_D_Vec<int> map;
  int _nparams,_nelements;
  UCL_Texture sw1_tex, sw2_tex, sw3_tex;
 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag, const int evatom);
-  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta;
+
  numtyp sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq;
 };
 }
--- a/lib/gpu/lal_sw_ext.cpp
+++ b/lib/gpu/lal_sw_ext.cpp
@ -27,14 +27,15 @@ static SW<PRECISION,ACC_PRECISION> SWMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int sw_gpu_init(const int inum, const int nall, const int max_nbors, 
+int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, 
                const double cell_size, int &gpu_mode, FILE *screen,
-                const double sw_epsilon, const double sw_sigma,
+                int* host_map, const int nelements, int*** host_elem2param, const int nparams,
-                const double sw_lambda, const double sw_gamma,
+                const double* sw_epsilon, const double* sw_sigma,
-                const double sw_costheta, const double sw_biga,
+                const double* sw_lambda, const double* sw_gamma,
-                const double sw_bigb, const double sw_powerp,
+                const double* sw_costheta, const double* sw_biga,
-                const double sw_powerq, const double sw_cut, 
+                const double* sw_bigb, const double* sw_powerp,
-                const double sw_cutsq) {
+                const double* sw_powerq, const double* sw_cut, 
                const double* sw_cutsq) {
  SWMF.clear();
  gpu_mode=SWMF.device->gpu_mode();
  double gpu_split=SWMF.device->particle_split();
@ -55,13 +56,14 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
    message=true;
  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }
  int init_ok=0;
  if (world_me==0)
-    init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
+    init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
                      host_map, nelements, host_elem2param, nparams,
                      sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, 
                      sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq);
@ -72,14 +74,15 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
-      init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
+      init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
                        host_map, nelements, host_elem2param, nparams,
                        sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, 
                        sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, 
                        sw_cutsq);