git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12146 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2014-06-27 17:25:22 +00:00
parent f745f24bcd
commit 67ae64329e
24 changed files with 422 additions and 55 deletions

View File

@@ -67,7 +67,9 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
-$(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o
+$(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o \
+$(OBJ_DIR)/lal_coul.o $(OBJ_DIR)/lal_coul_ext.o \
+$(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@@ -114,7 +116,9 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
$(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h \
-$(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h
+$(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h \
+$(OBJ_DIR)/coul.cubin $(OBJ_DIR)/coul_cubin.h \
+$(OBJ_DIR)/coul_debye.cubin $(OBJ_DIR)/coul_debye_cubin.h
all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@@ -676,6 +680,30 @@ $(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cubin.h $(OB
$(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
	$(CUDR) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/coul.cubin: lal_coul.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul.cu
+
+$(OBJ_DIR)/coul_cubin.h: $(OBJ_DIR)/coul.cubin
+	$(BIN2C) -c -n coul $(OBJ_DIR)/coul.cubin > $(OBJ_DIR)/coul_cubin.h
+
+$(OBJ_DIR)/lal_coul.o: $(ALL_H) lal_coul.h lal_coul.cpp $(OBJ_DIR)/coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_coul.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_ext.o: $(ALL_H) lal_coul.h lal_coul_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_coul_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/coul_debye.cubin: lal_coul_debye.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_debye.cu
+
+$(OBJ_DIR)/coul_debye_cubin.h: $(OBJ_DIR)/coul_debye.cubin
+	$(BIN2C) -c -n coul_debye $(OBJ_DIR)/coul_debye.cubin > $(OBJ_DIR)/coul_debye_cubin.h
+
+$(OBJ_DIR)/lal_coul_debye.o: $(ALL_H) lal_coul_debye.h lal_coul_debye.cpp $(OBJ_DIR)/coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_coul_debye.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_debye_ext.o: $(ALL_H) lal_coul_debye.h lal_coul_debye_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_coul_debye_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

View File

@@ -56,7 +56,9 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
-$(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o
+$(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o \
+$(OBJ_DIR)/lal_coul.o $(OBJ_DIR)/lal_coul_ext.o \
+$(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@@ -79,7 +81,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
-$(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h
+$(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h \
+$(OBJ_DIR)/coul_cl.h $(OBJ_DIR)/coul_debye_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@@ -485,6 +488,24 @@ $(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cl.h $(OBJ_
$(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
	$(OCL) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/coul_cl.h: lal_coul.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh coul $(PRE1_H) lal_coul.cu $(OBJ_DIR)/coul_cl.h;
+
+$(OBJ_DIR)/lal_coul.o: $(ALL_H) lal_coul.h lal_coul.cpp $(OBJ_DIR)/coul_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_coul.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_ext.o: $(ALL_H) lal_coul.h lal_coul_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_coul_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/coul_debye_cl.h: lal_coul_debye.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh coul_debye $(PRE1_H) lal_coul_debye.cu $(OBJ_DIR)/coul_debye_cl.h;
+
+$(OBJ_DIR)/lal_coul_debye.o: $(ALL_H) lal_coul_debye.h lal_coul_debye.cpp $(OBJ_DIR)/coul_debye_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_coul_debye.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_debye_ext.o: $(ALL_H) lal_coul_debye.h lal_coul_debye_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_coul_debye_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

View File

@@ -97,6 +97,25 @@ int BornT::init(const int ntypes, double **host_cutsq,
return 0;
}
+template <class numtyp, class acctyp>
+void BornT::reinit(const int ntypes, double **host_rhoinv,
+                   double **host_born1, double **host_born2,
+                   double **host_born3, double **host_a, double **host_c,
+                   double **host_d, double **host_offset) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
+                         host_born1,host_born2,host_born3);
+  this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
+                         host_d,host_offset);
+}
template <class numtyp, class acctyp>
void BornT::clear() {
if (!_allocated)

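Note: every reinit() added in this commit funnels its host tables through Atom::type_pack4, which interleaves four per-type-pair coefficient tables into one flattened vec4-style device array. A minimal standalone C++ sketch of the assumed packing behavior follows; the real helper in lal_atom.h stages through a UCL host buffer and copies to the device, and the 1-based type indexing here is an assumption:

#include <vector>

struct numtyp4 { double x, y, z, w; };  // stand-in for the vec4 device type

// Sketch only: pack four ntypes x ntypes host tables into the
// (m_size x m_size) layout the kernels index as coeff[itype*m_size+jtype].
std::vector<numtyp4> type_pack4_sketch(int ntypes, int m_size,
                                       double **one, double **two,
                                       double **three, double **four) {
  std::vector<numtyp4> packed(m_size * m_size, numtyp4{0.0, 0.0, 0.0, 0.0});
  for (int i = 1; i < ntypes; i++)        // assumed 1-based LAMMPS types
    for (int j = 1; j < ntypes; j++)
      packed[i * m_size + j] = numtyp4{one[i][j], two[i][j],
                                       three[i][j], four[i][j]};
  return packed;  // real code copies this buffer into a UCL_D_Vec
}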
View File

@@ -45,7 +45,13 @@ class Born : public BaseAtomic<numtyp, acctyp> {
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_rhoinv,
+              double **host_born1, double **host_born2,
+              double **host_born3, double **host_a, double **host_c,
+              double **host_d, double **host_offset);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

View File

@@ -92,6 +92,32 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
return init_ok;
}
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void born_gpu_reinit(const int ntypes, double **host_rhoinv,
+                     double **host_born1, double **host_born2,
+                     double **host_born3, double **host_a, double **host_c,
+                     double **host_d, double **offset) {
+  int world_me=BORNMF.device->world_me();
+  int gpu_rank=BORNMF.device->gpu_rank();
+  int procs_per_gpu=BORNMF.device->procs_per_gpu();
+  if (world_me==0)
+    BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
+                  host_born3, host_a, host_c, host_d, offset);
+  BORNMF.device->world_barrier();
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
+                    host_born3, host_a, host_c, host_d, offset);
+    BORNMF.device->gpu_barrier();
+  }
+}
void born_gpu_clear() {
BORNMF.clear();
}

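Note: each *_gpu_reinit wrapper added in this commit repeats the same staggered update: global rank 0 pushes the new tables first, then the remaining processes sharing each GPU take turns between barriers, so no two processes write device buffers concurrently. Condensed into one generic C++ sketch (DeviceT and do_reinit are illustrative stand-ins, not names from the library):

// Generic shape of the *_gpu_reinit wrappers; the concrete versions call
// BORNMF.reinit(...), BUCKMF.reinit(...), etc. in place of do_reinit().
template <class DeviceT, class Fn>
void staggered_reinit(DeviceT &device, Fn do_reinit) {
  const int world_me = device.world_me();        // global MPI rank
  const int gpu_rank = device.gpu_rank();        // rank among procs on this GPU
  const int procs_per_gpu = device.procs_per_gpu();

  if (world_me == 0)             // rank 0 updates first
    do_reinit();
  device.world_barrier();        // everyone waits for rank 0

  for (int i = 0; i < procs_per_gpu; i++) {  // then one proc per GPU at a time
    if (gpu_rank == i && world_me != 0)
      do_reinit();
    device.gpu_barrier();
  }
}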
View File

@@ -91,6 +91,24 @@ int BuckT::init(const int ntypes, double **host_cutsq,
return 0;
}
+template <class numtyp, class acctyp>
+void BuckT::reinit(const int ntypes, double **host_cutsq,
+                   double **host_rhoinv, double **host_buck1, double **host_buck2,
+                   double **host_a, double **host_c, double **host_offset) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
+                         host_buck1,host_buck2,host_cutsq);
+  this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
+                         host_offset);
+}
template <class numtyp, class acctyp>
void BuckT::clear() {
if (!_allocated)

View File

@@ -45,6 +45,11 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_rhoinv, double **host_buck1, double **host_buck2,
+              double **host_a, double **host_c, double **host_offset);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

View File

@@ -89,6 +89,31 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
return init_ok;
}
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv,
+                     double **host_buck1, double **host_buck2,
+                     double **host_a, double **host_c, double **offset) {
+  int world_me=BUCKMF.device->world_me();
+  int gpu_rank=BUCKMF.device->gpu_rank();
+  int procs_per_gpu=BUCKMF.device->procs_per_gpu();
+  if (world_me==0)
+    BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
+                  host_a, host_c, offset);
+  BUCKMF.device->world_barrier();
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
+                    host_a, host_c, offset);
+    BUCKMF.device->gpu_barrier();
+  }
+}
void buck_gpu_clear() {
BUCKMF.clear();
}

View File

@@ -43,21 +43,19 @@ int CoulLongT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
-int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
-                    const int maxspecial, const double cell_size,
-                    const double gpu_split, FILE *_screen,
-                    const double host_cut_coulsq, double *host_special_coul,
-                    const double qqrd2e, const double g_ewald) {
+int CoulLongT::init(const int ntypes, double **host_scale,
+                    const int nlocal, const int nall, const int max_nbors,
+                    const int maxspecial, const double cell_size,
+                    const double gpu_split, FILE *_screen,
+                    const double host_cut_coulsq, double *host_special_coul,
+                    const double qqrd2e, const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,coul_long,"k_coul_long");
if (success!=0)
return success;
-  // we don't have atom types for coulomb only,
-  // but go with the minimum so that we can use
-  // the same infrastructure as lj/cut/coul/long/gpu.
-  int lj_types=1;
+  int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
@@ -69,13 +67,16 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale);
sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_coul[i];
@@ -87,10 +88,18 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
_g_ewald=g_ewald;
_allocated=true;
-  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_cl.row_bytes();
+  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+scale.row_bytes()+
+                   sp_cl.row_bytes();
return 0;
}
+template <class numtyp, class acctyp>
+void CoulLongT::reinit(const int ntypes, double **host_scale) {
+  UCL_H_Vec<numtyp> hscale(_lj_types*_lj_types,*(this->ucl_device),
+                           UCL_WRITE_ONLY);
+  this->atom->type_pack1(ntypes,_lj_types,scale,hscale,host_scale);
+}
template <class numtyp, class acctyp>
void CoulLongT::clear() {
if (!_allocated)
@@ -99,6 +108,7 @@ void CoulLongT::clear() {
lj1.clear();
lj3.clear();
+  scale.clear();
sp_cl.clear();
this->clear_atomic();
}
@@ -134,7 +144,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_cl,
+    this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
@@ -142,7 +152,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_cl,
+    this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,

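Note: unlike the four-table styles above, CoulLongT::reinit repacks only the single scale table, via type_pack1. A sketch of the assumed behavior, in the same spirit as the type_pack4 sketch earlier (the indexing convention is again an assumption):

#include <vector>

// Sketch only: flatten one ntypes x ntypes host table into the
// (m_size x m_size) layout the kernels read as scale[itype*m_size+jtype].
std::vector<double> type_pack1_sketch(int ntypes, int m_size, double **one) {
  std::vector<double> packed(m_size * m_size, 0.0);
  for (int i = 1; i < ntypes; i++)
    for (int j = 1; j < ntypes; j++)
      packed[i * m_size + j] = one[i][j];
  return packed;  // real helper stages through a UCL host buffer
}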
View File

@@ -124,8 +124,7 @@ texture<int2> q_tex;
#endif
__kernel void k_coul_long(const __global numtyp4 *restrict x_,
-                          const __global numtyp4 *restrict lj1,
-                          const __global numtyp4 *restrict lj3,
+                          const __global numtyp *restrict scale,
const int lj_types,
const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor,
@@ -161,6 +160,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
@@ -171,24 +171,26 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
+      int mtype=itype*lj_types+jtype;
if (rsq < cut_coulsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp force, prefactor, _erfc;
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * qtmp/r;
+        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
@@ -196,7 +198,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
f.z+=delz*force;
if (eflag>0) {
-        e_coul += prefactor*(_erfc-factor_coul);
+        e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;
@@ -215,8 +217,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
}
__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
-                               const __global numtyp4 *restrict lj1_in,
-                               const __global numtyp4 *restrict lj3_in,
+                               const __global numtyp *restrict scale_in,
const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor,
const __global int *dev_packed,
@@ -230,10 +231,14 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
+  __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_cl[4];
if (tid<4)
sp_cl[tid]=sp_cl_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    scale[tid]=scale_in[tid];
+  }
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -252,7 +257,9 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw = ix.w;
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
@@ -261,7 +268,8 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
@@ -272,13 +280,13 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
numtyp r2inv=ucl_recip(rsq);
numtyp force, prefactor, _erfc;
-        numtyp r = ucl_rsqrt(r2inv);
+        numtyp r = ucl_sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * qtmp/r;
+        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
@@ -286,7 +294,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
f.z+=delz*force;
if (eflag>0) {
-        e_coul += prefactor*(_erfc-factor_coul);
+        e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;

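Note: the only functional change in the kernels above is the per-pair scale[mtype] factor folded into the Coulomb prefactor, plus the type bookkeeping needed to index it. Written out (P is the kernel's prefactor variable, s = scale[mtype], C = qqrd2e, g = g_ewald, f_c = factor_coul), each neighbor inside the cutoff contributes:

% Real-space Ewald pair term as accumulated by k_coul_long / k_coul_long_fast
P = \frac{C\,s\,q_i q_j}{r}, \qquad
E_{\mathrm{coul}} \mathrel{+}= P\bigl(\operatorname{erfc}(gr) - f_c\bigr), \qquad
F_{\mathrm{pair}} = \frac{P}{r^{2}}\Bigl(\operatorname{erfc}(gr)
    + \tfrac{2}{\sqrt{\pi}}\,gr\,e^{-g^{2}r^{2}} - f_c\Bigr)
% with the Cartesian force f_i += F_pair * (dx, dy, dz), and erfc(gr)
% evaluated by the Abramowitz-Stegun fit the kernel uses:
%   t = 1/(1 + EWALD_P\,gr),
%   erfc(gr) \approx t\,(A_1 + t(A_2 + t(A_3 + t(A_4 + t\,A_5))))\,e^{-g^2 r^2}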
View File

@@ -37,12 +37,16 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
-  int init(const int nlocal, const int nall, const int max_nbors,
+  int init(const int ntypes, double **scale,
+           const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
-           const double gpu_split, FILE *screen,
-           const double host_cut_coulsq, double *host_special_coul,
-           const double qqrd2e, const double g_ewald);
+           const double gpu_split, FILE *screen,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **scale);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@@ -59,6 +63,8 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
UCL_D_Vec<numtyp4> lj1;
/// lj3 dummy
UCL_D_Vec<numtyp4> lj3;
+  /// scale
+  UCL_D_Vec<numtyp> scale;
/// Special Coul values [0-3]
UCL_D_Vec<numtyp> sp_cl;

View File

@@ -27,10 +27,11 @@ static CoulLong<PRECISION,ACC_PRECISION> CLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
-int cl_gpu_init(const int inum, const int nall, const int max_nbors,
-                const int maxspecial, const double cell_size, int &gpu_mode,
-                FILE *screen, double host_cut_coulsq, double *host_special_coul,
-                const double qqrd2e, const double g_ewald) {
+int cl_gpu_init(const int ntypes, double **host_scale,
+                const int inum, const int nall, const int max_nbors,
+                const int maxspecial, const double cell_size, int &gpu_mode,
+                FILE *screen, double host_cut_coulsq, double *host_special_coul,
+                const double qqrd2e, const double g_ewald) {
CLMF.clear();
gpu_mode=CLMF.device->gpu_mode();
double gpu_split=CLMF.device->particle_split();
@@ -53,9 +54,9 @@ int cl_gpu_init(const int inum, const int nall, const int max_nbors,
int init_ok=0;
if (world_me==0)
-    init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
-                      screen, host_cut_coulsq, host_special_coul, qqrd2e,
-                      g_ewald);
+    init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial,
+                      cell_size, gpu_split, screen, host_cut_coulsq,
+                      host_special_coul, qqrd2e, g_ewald);
CLMF.device->world_barrier();
if (message)
@@ -71,9 +72,9 @@ int cl_gpu_init(const int inum, const int nall, const int max_nbors,
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
-      init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
-                        screen, host_cut_coulsq, host_special_coul,
-                        qqrd2e, g_ewald);
+      init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial,
+                        cell_size, gpu_split, screen, host_cut_coulsq,
+                        host_special_coul, qqrd2e, g_ewald);
CLMF.device->gpu_barrier();
if (message)
@@ -87,6 +88,27 @@ int cl_gpu_init(const int inum, const int nall, const int max_nbors,
return init_ok;
}
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void cl_gpu_reinit(const int ntypes, double **host_scale) {
+  int world_me=CLMF.device->world_me();
+  int gpu_rank=CLMF.device->gpu_rank();
+  int procs_per_gpu=CLMF.device->procs_per_gpu();
+  if (world_me==0)
+    CLMF.reinit(ntypes, host_scale);
+  CLMF.device->world_barrier();
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      CLMF.reinit(ntypes, host_scale);
+    CLMF.device->gpu_barrier();
+  }
+}
void cl_gpu_clear() {
CLMF.clear();
}

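Note: this diff does not show the CPU side, but the intended caller of cl_gpu_reinit() is the host pair style's reinit() hook, fired after fix adapt rewrites the host scale table. A hedged C++ sketch of that flow; every name except cl_gpu_reinit() is an assumption about the host-side class:

// Assumed host-side flow: fix adapt updates the per-pair scale table,
// the pair style's reinit() hook fires, and the new values go to the GPU.
void cl_gpu_reinit(const int ntypes, double **host_scale);  // added above

struct HostPairStyleSketch {
  int ntypes;       // number of atom types (host side)
  double **scale;   // per-type-pair scale table maintained by fix adapt

  void reinit() {
    // ... recompute any derived host coefficients here ...
    cl_gpu_reinit(ntypes + 1, scale);  // +1 assumed: LAMMPS types are 1-based
  }
};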
View File

@@ -87,6 +87,21 @@ int GaussT::init(const int ntypes,
return 0;
}
+template <class numtyp, class acctyp>
+void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a,
+                    double **host_b, double **host_offset) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b,
+                         host_cutsq,host_offset);
+}
template <class numtyp, class acctyp>
void GaussT::clear() {
if (!_allocated)

View File

@@ -14,7 +14,7 @@
***************************************************************************/
#ifndef LAL_GAUSS_H
-#define LAL_GAYSS_H
+#define LAL_GAUSS_H
#include "lal_base_atomic.h"
@@ -43,7 +43,11 @@ class Gauss : public BaseAtomic<numtyp, acctyp> {
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_a, double **host_b, double **host_offset);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

View File

@@ -88,6 +88,28 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
return init_ok;
}
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a,
+                      double **host_b, double **offset) {
+  int world_me=GLMF.device->world_me();
+  int gpu_rank=GLMF.device->gpu_rank();
+  int procs_per_gpu=GLMF.device->procs_per_gpu();
+  if (world_me==0)
+    GLMF.reinit(ntypes, cutsq, host_a, host_b, offset);
+  GLMF.device->world_barrier();
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      GLMF.reinit(ntypes, cutsq, host_a, host_b, offset);
+    GLMF.device->gpu_barrier();
+  }
+}
void gauss_gpu_clear() {
GLMF.clear();
}

View File

@@ -92,6 +92,23 @@ int LJT::init(const int ntypes,
return 0;
}
+template <class numtyp, class acctyp>
+void LJT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3,
+                 double **host_lj4, double **host_offset) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq);
+  this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset);
+}
template <class numtyp, class acctyp>
void LJT::clear() {
if (!_allocated)

View File

@@ -43,7 +43,12 @@ class LJ : public BaseAtomic<numtyp, acctyp> {
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_lj1, double **host_lj2, double **host_lj3,
+              double **host_lj4, double **host_offset);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

View File

@@ -92,6 +92,26 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
return 0;
}
+template <class numtyp, class acctyp>
+void LJExpandT::reinit(const int ntypes, double **host_cutsq,
+                       double **host_lj1, double **host_lj2,
+                       double **host_lj3, double **host_lj4,
+                       double **host_offset, double **host_shift) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq, host_shift);
+  this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset);
+}
template <class numtyp, class acctyp>
void LJExpandT::clear() {
if (!_allocated)

View File

@@ -43,7 +43,12 @@ class LJExpand : public BaseAtomic<numtyp, acctyp> {
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_lj1, double **host_lj2, double **host_lj3,
+              double **host_lj4, double **host_offset, double **host_shift);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

View File

@@ -89,6 +89,29 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
return init_ok;
}
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
+                    double **host_lj2, double **host_lj3, double **host_lj4,
+                    double **offset, double **shift) {
+  int world_me=LJEMF.device->world_me();
+  int gpu_rank=LJEMF.device->gpu_rank();
+  int procs_per_gpu=LJEMF.device->procs_per_gpu();
+  if (world_me==0)
+    LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                 offset, shift);
+  LJEMF.device->world_barrier();
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                   offset, shift);
+    LJEMF.device->gpu_barrier();
+  }
+}
void lje_gpu_clear() {
LJEMF.clear();
}

View File

@@ -88,6 +88,27 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
return init_ok;
}
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
+                    double **host_lj2, double **host_lj3, double **host_lj4,
+                    double **offset) {
+  int world_me=LJLMF.device->world_me();
+  int gpu_rank=LJLMF.device->gpu_rank();
+  int procs_per_gpu=LJLMF.device->procs_per_gpu();
+  if (world_me==0)
+    LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);
+  LJLMF.device->world_barrier();
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);
+    LJLMF.device->gpu_barrier();
+  }
+}
void ljl_gpu_clear() {
LJLMF.clear();
}

View File

@@ -86,6 +86,21 @@ int SoftT::init(const int ntypes, double **host_cutsq,
return 0;
}
+template <class numtyp, class acctyp>
+void SoftT::reinit(const int ntypes, double **host_cutsq,
+                   double **host_prefactor, double **host_cut) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor,
+                         host_cut,host_cutsq);
+}
template <class numtyp, class acctyp>
void SoftT::clear() {
if (!_allocated)

View File

@@ -13,8 +13,8 @@
email : nguyentd@ornl.gov
***************************************************************************/
-#ifndef LAL_GAUSS_H
-#define LAL_GAYSS_H
+#ifndef LAL_SOFT_H
+#define LAL_SOFT_H
#include "lal_base_atomic.h"
@@ -44,6 +44,10 @@ class Soft : public BaseAtomic<numtyp, acctyp> {
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_prefactor, double **host_cut);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

View File

@@ -88,6 +88,28 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
return init_ok;
}
+// ---------------------------------------------------------------------------
+// Copy updated constants to device
+// ---------------------------------------------------------------------------
+void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor,
+                     double **host_cut) {
+  int world_me=SLMF.device->world_me();
+  int gpu_rank=SLMF.device->gpu_rank();
+  int procs_per_gpu=SLMF.device->procs_per_gpu();
+  if (world_me==0)
+    SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut);
+  SLMF.device->world_barrier();
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut);
+    SLMF.device->gpu_barrier();
+  }
+}
void soft_gpu_clear() {
SLMF.clear();
}