diff --git a/lib/gpu/lal_born_coul_long_cs.cu b/lib/gpu/lal_born_coul_long_cs.cu
index d18f573c11..c5f98567d9 100644
--- a/lib/gpu/lal_born_coul_long_cs.cu
+++ b/lib/gpu/lal_born_coul_long_cs.cu
@@ -9,7 +9,7 @@
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-//    begin                :
+//    begin                : June 2018
 //    email                : trung.nguyen@northwestern.edu
 // ***************************************************************************/
 
@@ -29,15 +29,19 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-#define B0 (numtyp)-0.1335096380159268
-#define B1 (numtyp)-2.57839507e-1
-#define B2 (numtyp)-1.37203639e-1
-#define B3 (numtyp)-8.88822059e-3
-#define B4 (numtyp)-5.80844129e-3
-#define B5 (numtyp)1.14652755e-1
-#define EPSILON (numtyp)1.0e-20
-#define EPS_EWALD (numtyp)1.0e-6
-#define EPS_EWALD_SQR (numtyp)1.0e-12
+// Note: EWALD_P is different from that in lal_preprocessor.h
+//       acctyp is needed for these parameters
+#define CS_EWALD_P (acctyp)9.95473818e-1
+#define B0        (acctyp)-0.1335096380159268
+#define B1        (acctyp)-2.57839507e-1
+#define B2        (acctyp)-1.37203639e-1
+#define B3        (acctyp)-8.88822059e-3
+#define B4        (acctyp)-5.80844129e-3
+#define B5        (acctyp)1.14652755e-1
+
+#define EPSILON (acctyp)(1.0e-20)
+#define EPS_EWALD (acctyp)(1.0e-6)
+#define EPS_EWALD_SQR (acctyp)(1.0e-12)
 
 __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
                           const __global numtyp4 *restrict coeff1,
@@ -104,40 +108,38 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       int mtype=itype*lj_types+jtype;
-      if (rsq<cutsq_sigma[mtype].x) { // cutsq
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
-        numtyp rexp = (numtyp)0.0;
+      if (rsq<cutsq_sigma[mtype].x) { // cutsq 
+        numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
+
+        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
+        numtyp r2inv = ucl_recip(rsq);
 
         if (rsq < cut_coulsq) {
-          rsq += EPSILON;
-
-          numtyp r2inv = ucl_recip(rsq);
-          numtyp r = ucl_rsqrt(r2inv);
+          numtyp r = ucl_sqrt(rsq);
           fetch(prefactor,j,q_tex);
           prefactor *= qqrd2e * qtmp;
           if (factor_coul<(numtyp)1.0) {
             numtyp grij = g_ewald * (r+EPS_EWALD);
             numtyp expm2 = ucl_exp(-grij*grij);
-            numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+            numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
             numtyp u = (numtyp)1.0 - t;
             _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
             prefactor /= (r+EPS_EWALD);
             forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
+            // Additionally r2inv needs to be accordingly modified since the later
+            // scaling of the overall force shall be consistent
             r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
-            forcecoul *= r2inv;
           } else {
             numtyp grij = g_ewald * r;
             numtyp expm2 = ucl_exp(-grij*grij);
-            numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+            numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
             numtyp u = (numtyp)1.0 - t;
             _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
             prefactor /= r;
             forcecoul = prefactor*(_erfc + EWALD_F*grij*expm2);
-            forcecoul *= r2inv;
           }
         } else forcecoul = (numtyp)0.0;
 
-        numtyp r2inv = ucl_recip(rsq);
         if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
           numtyp r = ucl_sqrt(rsq);
           rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
@@ -146,7 +148,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
             + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
         } else forceborn = (numtyp)0.0;
 
-        force = forceborn*r2inv + forcecoul;
+        force = (forcecoul + forceborn) * r2inv;
 
         f.x+=delx*force;
         f.y+=dely*force;
@@ -154,9 +156,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
 
         if (eflag>0) {
           if (rsq < cut_coulsq) {
-            e_coul += prefactor*_erfc;
-            if (factor_coul<(numtyp)1.0)
-              e_coul -= ((numtyp)1.0-factor_coul)*prefactor;
+            numtyp e = prefactor*_erfc;
+            if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
           }
           if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
@@ -248,39 +250,38 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       if (rsq<cutsq_sigma[mtype].x) { // cutsq 
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
-        numtyp rexp = (numtyp)0.0;
+        numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
+
+        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
+        numtyp r2inv = ucl_recip(rsq);
 
         if (rsq < cut_coulsq) {
-          rsq += EPSILON;
-
-          numtyp r2inv = ucl_recip(rsq);
-          numtyp r = ucl_rsqrt(r2inv);
+          numtyp r = ucl_sqrt(rsq);
           fetch(prefactor,j,q_tex);
           prefactor *= qqrd2e * qtmp;
           if (factor_coul<(numtyp)1.0) {
             numtyp grij = g_ewald * (r+EPS_EWALD);
             numtyp expm2 = ucl_exp(-grij*grij);
-            numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+            numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
             numtyp u = (numtyp)1.0 - t;
             _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
             prefactor /= (r+EPS_EWALD);
             forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
+            // Additionally r2inv needs to be accordingly modified since the later
+            // scaling of the overall force shall be consistent
             r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
-            forcecoul *= r2inv;
           } else {
+
             numtyp grij = g_ewald * r;
             numtyp expm2 = ucl_exp(-grij*grij);
-            numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+            numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
             numtyp u = (numtyp)1.0 - t;
             _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
             prefactor /= r;
             forcecoul = prefactor*(_erfc + EWALD_F*grij*expm2);
-            forcecoul *= r2inv;
           }
         } else forcecoul = (numtyp)0.0;
 
-        numtyp r2inv = ucl_recip(rsq);
         if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
           numtyp r = ucl_sqrt(rsq);
           rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
@@ -289,7 +290,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
             + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
         } else forceborn = (numtyp)0.0;
 
-        force = forceborn*r2inv + forcecoul;
+        force = (forcecoul + forceborn) * r2inv;
 
         f.x+=delx*force;
         f.y+=dely*force;
@@ -297,9 +298,9 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
 
         if (eflag>0) {
           if (rsq < cut_coulsq) {
-            e_coul += prefactor*_erfc;
-            if (factor_coul<(numtyp)1.0)
-              e_coul -= ((numtyp)1.0-factor_coul)*prefactor;
+            numtyp e = prefactor*_erfc;
+            if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
           }
           if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
diff --git a/lib/gpu/lal_coul_long_cs.cu b/lib/gpu/lal_coul_long_cs.cu
index f6d3ef8d44..1ff9445f4c 100644
--- a/lib/gpu/lal_coul_long_cs.cu
+++ b/lib/gpu/lal_coul_long_cs.cu
@@ -29,6 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
+// Note: EWALD_P is different from that in lal_preprocessor.h
+//       acctyp is needed for these parameters
+#define CS_EWALD_P (acctyp)9.95473818e-1
+#define B0        (acctyp)-0.1335096380159268
+#define B1        (acctyp)-2.57839507e-1
+#define B2        (acctyp)-1.37203639e-1
+#define B3        (acctyp)-8.88822059e-3
+#define B4        (acctyp)-5.80844129e-3
+#define B5        (acctyp)1.14652755e-1
+
+#define EPSILON (acctyp)(1.0e-20)
+#define EPS_EWALD (acctyp)(1.0e-6)
+#define EPS_EWALD_SQR (acctyp)(1.0e-12)
+
 #if (ARCH < 300)
 
 #define store_answers_lq(f, e_coul, virial, ii, inum, tid,                  \
@@ -123,16 +137,6 @@ texture<int2> q_tex;
 
 #endif
 
-#define B0 (numtyp)-0.1335096380159268
-#define B1 (numtyp)-2.57839507e-1
-#define B2 (numtyp)-1.37203639e-1
-#define B3 (numtyp)-8.88822059e-3
-#define B4 (numtyp)-5.80844129e-3
-#define B5 (numtyp)1.14652755e-1
-#define EPSILON (numtyp)1.0e-20
-#define EPS_EWALD (numtyp)1.0e-6
-#define EPS_EWALD_SQR (numtyp)1.0e-12
-
 __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict scale,
                           const int lj_types,
@@ -191,7 +195,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
 
       int mtype=itype*lj_types+jtype;
       if (rsq < cut_coulsq) {
-        rsq += EPSILON;
+        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
 
         numtyp force,prefactor,_erfc;
         numtyp r2inv = ucl_recip(rsq);
@@ -201,17 +205,19 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
         if (factor_coul<(numtyp)1.0) {
           numtyp grij = g_ewald * (r+EPS_EWALD);
           numtyp expm2 = ucl_exp(-grij*grij);
-          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
           numtyp u = (numtyp)1.0 - t;
           _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
           prefactor /= (r+EPS_EWALD);
           force = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
+          // Additionally r2inv needs to be accordingly modified since the later
+          // scaling of the overall force shall be consistent
           r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
           force *= r2inv;
         } else {
           numtyp grij = g_ewald * r;
           numtyp expm2 = ucl_exp(-grij*grij);
-          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
           numtyp u = (numtyp)1.0 - t;
           _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
           prefactor /= r;
@@ -224,9 +230,9 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*_erfc;
-          if (factor_coul<(numtyp)1.0)
-            e_coul -= ((numtyp)1.0-factor_coul)*prefactor;
+          numtyp e = prefactor*_erfc;
+          if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+          e_coul += e;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
@@ -304,7 +310,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       if (rsq < cut_coulsq) {
-        rsq += EPSILON;
+        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
 
         numtyp force,prefactor,_erfc;
         numtyp r2inv = ucl_recip(rsq);
@@ -314,32 +320,34 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
         if (factor_coul<(numtyp)1.0) {
           numtyp grij = g_ewald * (r+EPS_EWALD);
           numtyp expm2 = ucl_exp(-grij*grij);
-          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
           numtyp u = (numtyp)1.0 - t;
           _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
           prefactor /= (r+EPS_EWALD);
           force = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
+          // Additionally r2inv needs to be accordingly modified since the later
+          // scaling of the overall force shall be consistent
           r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
-          force *= r2inv;
         } else {
           numtyp grij = g_ewald * r;
           numtyp expm2 = ucl_exp(-grij*grij);
-          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
           numtyp u = (numtyp)1.0 - t;
           _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
           prefactor /= r;
           force = prefactor * (_erfc + EWALD_F*grij*expm2);
-          force *= r2inv;
         }
 
+        force *= r2inv;
+
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*_erfc;
-          if (factor_coul<(numtyp)1.0)
-            e_coul -= ((numtyp)1.0-factor_coul)*prefactor;
+          numtyp e = prefactor*_erfc;
+          if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+          e_coul += e;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu
index b0b3cfb90c..83692a24e4 100644
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@@ -118,24 +118,24 @@ __kernel void transpose(__global tagint *restrict out,
                         const __global tagint *restrict in,
                         int columns_in, int rows_in)
 {
-        __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
+  __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
 
-        unsigned ti=THREAD_ID_X;
-        unsigned tj=THREAD_ID_Y;
-        unsigned bi=BLOCK_ID_X;
-        unsigned bj=BLOCK_ID_Y;
+  unsigned ti=THREAD_ID_X;
+  unsigned tj=THREAD_ID_Y;
+  unsigned bi=BLOCK_ID_X;
+  unsigned bj=BLOCK_ID_Y;
 
-        unsigned i=bi*BLOCK_CELL_2D+ti;
-        unsigned j=bj*BLOCK_CELL_2D+tj;
-        if ((i<columns_in) && (j<rows_in))
-                block[tj][ti]=in[j*columns_in+i];
+  unsigned i=bi*BLOCK_CELL_2D+ti;
+  unsigned j=bj*BLOCK_CELL_2D+tj;
+  if ((i<columns_in) && (j<rows_in))
+    block[tj][ti]=in[j*columns_in+i];
 
-        __syncthreads();
+   __syncthreads();
 
-        i=bj*BLOCK_CELL_2D+ti;
-        j=bi*BLOCK_CELL_2D+tj;
-        if ((i<rows_in) && (j<columns_in))
-                out[j*rows_in+i] = block[ti][tj];
+  i=bj*BLOCK_CELL_2D+ti;
+  j=bi*BLOCK_CELL_2D+tj;
+  if ((i<rows_in) && (j<columns_in))
+    out[j*rows_in+i] = block[ti][tj];
 }
 
 __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
@@ -191,7 +191,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
       nbor_list[pid_i]=pid_i;
     } else {
       stride=0;
-            neigh_counts=host_numj+pid_i-inum;
+      neigh_counts=host_numj+pid_i-inum;
       neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
     }
 
@@ -232,7 +232,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
                 diff.z = atom_i.z - pos_sh[j].z;
 
                 r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
-                if (r2 < cell_size*cell_size && r2 > 1e-5) {
+                if (r2 < cell_size*cell_size && pid_j != pid_i) { //  && r2 > 1e-5
                   cnt++;
                   if (cnt <= neigh_bin_size) {
                     *neigh_list = pid_j;
@@ -243,8 +243,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
                 }
               }
             }
-                  __syncthreads();
-                } // for (k)
+            __syncthreads();
+          } // for (k)
         }
       }
     }