lammps/lib/cuda/cuda_pair_kernel.cu

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define EWALD_F   1.12837917
#define EWALD_P   0.3275911
#define A1        0.254829592
#define A2       -0.284496736
#define A3        1.421413741
#define A4       -1.453152027
#define A5        1.061405429


template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
  ENERGY_CFLOAT ecoul = ENERGY_F(0.0);

  ENERGY_CFLOAT* sharedE;
  ENERGY_CFLOAT* sharedECoul;
  ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];

  if(eflag || eflag_atom) {
    sharedE = &sharedmem[threadIdx.x];
    sharedE[0] = ENERGY_F(0.0);
    sharedV += blockDim.x;

    if(coul_type != COUL_NONE) {
      sharedECoul = sharedE + blockDim.x;
      sharedECoul[0] = ENERGY_F(0.0);
      sharedV += blockDim.x;
    }
  }

  if(vflag || vflag_atom) {
    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
  }

  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  X_CFLOAT xtmp, ytmp, ztmp;
  X_CFLOAT4 myxtype;
  F_CFLOAT fxtmp, fytmp, fztmp, fpair;
  F_CFLOAT delx, dely, delz;
  F_CFLOAT factor_lj, factor_coul;
  F_CFLOAT qtmp;
  int itype, i, j;
  int jnum = 0;
  int* jlist;

  if(ii < _inum) {
    i = _ilist[ii];

    myxtype = fetchXType(i);
    xtmp = myxtype.x;
    ytmp = myxtype.y;
    ztmp = myxtype.z;
    itype = static_cast <int>(myxtype.w);


    fxtmp = F_F(0.0);
    fytmp = F_F(0.0);
    fztmp = F_F(0.0);

    if(coul_type != COUL_NONE)
      qtmp = fetchQ(i);

    jnum = _numneigh[i];
    jlist = &_neighbors[i];
  }

  __syncthreads();

  for(int jj = 0; jj < jnum; jj++) {
    if(ii < _inum)
      if(jj < jnum) {
        fpair = F_F(0.0);
        j = jlist[jj * _nlocal];
        factor_lj =  _special_lj[sbmask(j)];

        if(coul_type != COUL_NONE)
          factor_coul = _special_coul[sbmask(j)];

        j &= NEIGHMASK;

        myxtype = fetchXType(j);
        delx = xtmp - myxtype.x;
        dely = ytmp - myxtype.y;
        delz = ztmp - myxtype.z;
        int jtype = static_cast <int>(myxtype.w);


        const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

        bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);

        if(in_cutoff) {
          switch(pair_type) {
            case PAIR_BORN:
              fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_BUCK:
              fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_CG_CMM:
              fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CHARMM:
              fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CLASS2:
              fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CUT:
              fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_EXPAND:
              fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_GROMACS:
              fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_SMOOTH:
              fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ96_CUT:
              fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_MORSE_R6:
              fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_MORSE:
              fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;
          }
        }

        if(coul_type != COUL_NONE) {
          const F_CFLOAT qiqj = qtmp * fetchQ(j);

          if(qiqj * qiqj > 1e-8) {
            const bool in_coul_cutoff =
              rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);

            if(in_coul_cutoff) {
              switch(coul_type) {
                case COUL_CHARMM:
                  fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_CHARMM_IMPLICIT:
                  fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_CUT: {
                  const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);

                  if(eflag) {
                    ecoul += forcecoul;
                  }

                  fpair += forcecoul * (F_F(1.0) / rsq);
                }
                break;

                case COUL_DEBYE: {
                  const F_CFLOAT r2inv = F_F(1.0) / rsq;
                  const X_CFLOAT r = _RSQRT_(r2inv);
                  const X_CFLOAT rinv = F_F(1.0) / r;
                  const F_CFLOAT screening = _EXP_(-_kappa * r);
                  F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;

                  if(eflag) {
                    ecoul += forcecoul * rinv;
                  }

                  forcecoul *= (_kappa + rinv);
                  fpair += forcecoul * r2inv;
                }
                break;

                case COUL_GROMACS:
                  fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_LONG: {
                  const F_CFLOAT r2inv = F_F(1.0) / rsq;
                  const F_CFLOAT r = _RSQRT_(r2inv);
                  const F_CFLOAT grij = _g_ewald * r;
                  const F_CFLOAT expm2 = _EXP_(-grij * grij);
                  const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
                  const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
                  const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
                  F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);

                  if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;

                  if(eflag) {
                    ecoul += prefactor * erfc;

                    if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
                  }

                  fpair += forcecoul * r2inv;
                }
                break;
              }
            }

            in_cutoff = in_cutoff || in_coul_cutoff;
          }
        }


        if(in_cutoff) {
          F_CFLOAT dxfp, dyfp, dzfp;
          fxtmp += dxfp = delx * fpair;
          fytmp += dyfp = dely * fpair;
          fztmp += dzfp = delz * fpair;

          if(vflag) {
            sharedV[0 * blockDim.x] += delx * dxfp;
            sharedV[1 * blockDim.x] += dely * dyfp;
            sharedV[2 * blockDim.x] += delz * dzfp;
            sharedV[3 * blockDim.x] += delx * dyfp;
            sharedV[4 * blockDim.x] += delx * dzfp;
            sharedV[5 * blockDim.x] += dely * dzfp;
          }
        }
      }
  }

  __syncthreads();

  if(ii < _inum) {
    F_CFLOAT* my_f;

    if(_collect_forces_later) {
      ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

      if(eflag) {
        buffer = &buffer[1 * gridDim.x * gridDim.y];

        if(coul_type != COUL_NONE)
          buffer = &buffer[1 * gridDim.x * gridDim.y];
      }

      if(vflag) {
        buffer = &buffer[6 * gridDim.x * gridDim.y];
      }

      my_f = (F_CFLOAT*) buffer;
      my_f += i;
      *my_f = fxtmp;
      my_f += _nmax;
      *my_f = fytmp;
      my_f += _nmax;
      *my_f = fztmp;
    } else {
      my_f = _f + i;
      *my_f += fxtmp;
      my_f += _nmax;
      *my_f += fytmp;
      my_f += _nmax;
      *my_f += fztmp;
    }
  }

  __syncthreads();

  if(eflag) {
    sharedE[0] = evdwl;

    if(coul_type != COUL_NONE)
      sharedECoul[0] = ecoul;
  }

  if(eflag_atom && i < _nlocal) {
    if(coul_type != COUL_NONE)
      _eatom[i] += evdwl + ecoul;
    else
      _eatom[i] += evdwl;
  }

  if(vflag_atom && i < _nlocal) {
    _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
    _vatom[i + _nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
    _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
    _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
    _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
    _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
  }

  if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
}

template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  int ii = (blockIdx.x * gridDim.y + blockIdx.y);

  if(ii >= _inum)
    return;

  ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
  ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
  F_CFLOAT3* sharedVirial1;
  F_CFLOAT3* sharedVirial2;
  F_CFLOAT* sharedEnergy;
  F_CFLOAT* sharedEnergyCoul;

  F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0];

  if(vflag) {
    sharedVirial1 = &sharedForce[64];
    sharedVirial2 = &sharedVirial1[64];
  } else {
    sharedVirial1 = &sharedForce[0];
    sharedVirial2 = &sharedVirial1[0];
  }

  if(eflag) {
    if(vflag || vflag_atom)
      sharedEnergy = (F_CFLOAT*) &sharedVirial2[64];
    else
      sharedEnergy = (F_CFLOAT*) &sharedForce[64];

    if(coul_type != COUL_NONE)
      sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64];

  }

  F_CFLOAT3 partialForce = { F_F(0.0),  F_F(0.0),  F_F(0.0) };
  F_CFLOAT3 partialVirial1 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
  F_CFLOAT3 partialVirial2 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };

  X_CFLOAT xtmp, ytmp, ztmp;
  X_CFLOAT4 myxtype;
  F_CFLOAT delx, dely, delz;
  F_CFLOAT factor_lj, factor_coul;
  F_CFLOAT fpair;
  F_CFLOAT qtmp;
  int itype, jnum, i, j;
  int* jlist;

  i = _ilist[ii];

  myxtype = fetchXType(i);

  xtmp = myxtype.x;
  ytmp = myxtype.y;
  ztmp = myxtype.z;
  itype = static_cast <int>(myxtype.w);

  if(coul_type != COUL_NONE)
    qtmp = fetchQ(i);

  jnum = _numneigh[i];

  jlist = &_neighbors[i * _maxneighbors];
  __syncthreads();

  for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
    if(jj < jnum) {
      fpair = F_F(0.0);
      j = jlist[jj];
      factor_lj =  _special_lj[sbmask(j)];

      if(coul_type != COUL_NONE)
        factor_coul = _special_coul[sbmask(j)];

      j &= NEIGHMASK;

      myxtype = fetchXType(j);

      delx = xtmp - myxtype.x;
      dely = ytmp - myxtype.y;
      delz = ztmp - myxtype.z;
      int jtype = static_cast <int>(myxtype.w);

      const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

      bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
      bool in_coul_cutoff;

      if(in_cutoff) {
        switch(pair_type) {
          case PAIR_BORN:
            fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_BUCK:
            fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_CG_CMM:
            fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_CHARMM:
            fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_CLASS2:
            fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_CUT:
            fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_EXPAND:
            fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_GROMACS:
            fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_SMOOTH:
            fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ96_CUT:
            fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_MORSE_R6:
            fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_MORSE:
            fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;
        }
      }

      if(coul_type != COUL_NONE) {
        const F_CFLOAT qiqj = qtmp * fetchQ(j);

        if(qiqj * qiqj > (1e-8f)) {
          in_coul_cutoff =
            rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);

          if(in_coul_cutoff) {
            switch(coul_type) {
              case COUL_CHARMM:
                fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                break;

              case COUL_CHARMM_IMPLICIT:
                fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                break;

              case COUL_GROMACS:
                fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
                break;

              case COUL_LONG: {
                const F_CFLOAT r2inv = F_F(1.0) / rsq;
                const F_CFLOAT r = _RSQRT_(r2inv);
                const F_CFLOAT grij = _g_ewald * r;
                const F_CFLOAT expm2 = _EXP_(-grij * grij);
                const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
                const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
                const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
                F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);

                if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;

                if(eflag) {
                  ecoul += prefactor * erfc;

                  if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
                }

                fpair += forcecoul * r2inv;
              }
              break;

              case COUL_DEBYE: {
                const F_CFLOAT r2inv = F_F(1.0) / rsq;
                const X_CFLOAT r = _RSQRT_(r2inv);
                const X_CFLOAT rinv = F_F(1.0) / r;
                const F_CFLOAT screening = _EXP_(-_kappa * r);
                F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;

                if(eflag) {
                  ecoul += forcecoul * rinv;
                }

                forcecoul *= (_kappa + rinv);
                fpair += forcecoul * r2inv;
              }
              break;

              case COUL_CUT: {
                const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);

                if(eflag) {
                  ecoul += forcecoul;
                }

                fpair += forcecoul * (F_F(1.0) / rsq);
              }
              break;


            }
          }
        }
      }


      if(in_cutoff || in_coul_cutoff) {
        F_CFLOAT dxfp, dyfp, dzfp;
        partialForce.x += dxfp = delx * fpair;
        partialForce.y += dyfp = dely * fpair;
        partialForce.z += dzfp = delz * fpair;

        if(vflag) {
          partialVirial1.x += delx * dxfp;
          partialVirial1.y += dely * dyfp;
          partialVirial1.z += delz * dzfp;
          partialVirial2.x += delx * dyfp;
          partialVirial2.y += delx * dzfp;
          partialVirial2.z += dely * dzfp;
        }
      }
    }
  }

  if(eflag) {
    sharedEnergy[threadIdx.x] = evdwl;

    if(coul_type != COUL_NONE)
      sharedEnergyCoul[threadIdx.x] = ecoul;
  }

  sharedForce[threadIdx.x] = partialForce;

  if(vflag) {
    sharedVirial1[threadIdx.x] = partialVirial1;
    sharedVirial2[threadIdx.x] = partialVirial2;
  }

  __syncthreads();


  for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {

    if(threadIdx.x < s) {
      sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
      sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
      sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;

      if(vflag) {
        sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
        sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
        sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;

        sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
        sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
        sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
      }

      if(eflag) {
        sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];

        if(coul_type != COUL_NONE)
          sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
      }
    }

    __syncthreads();
  }

  if(threadIdx.x == 0) {

    ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

    if(eflag) {
      ENERGY_CFLOAT tmp_evdwl;
      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];

      if(eflag_atom)
        _eatom[i] = tmp_evdwl;

      buffer = &buffer[gridDim.x * gridDim.y];

      if(coul_type != COUL_NONE) {
        buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];

        if(eflag_atom)
          _eatom[i] += tmp_evdwl;

        buffer = &buffer[gridDim.x * gridDim.y];
      }
    }

    if(vflag) {
      ENERGY_CFLOAT tmp;
      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;

      if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;

      if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;

      if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;

      if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;

      if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;

      if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;

      buffer = &buffer[6 * gridDim.x * gridDim.y];
    }

    F_CFLOAT* my_f;

    if(_collect_forces_later) {
      my_f = (F_CFLOAT*) buffer;
      my_f += i;
      *my_f = sharedForce[0].x;
      my_f += _nmax;
      *my_f = sharedForce[0].y;
      my_f += _nmax;
      *my_f = sharedForce[0].z;
    } else {
      my_f = _f + i;
      *my_f += sharedForce[0].x;
      my_f += _nmax;
      *my_f += sharedForce[0].y;
      my_f += _nmax;
      *my_f += sharedForce[0].z;
    }
  }
}


template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
{
  ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
  ENERGY_CFLOAT ecoul = ENERGY_F(0.0);

  ENERGY_CFLOAT* sharedE;
  ENERGY_CFLOAT* sharedECoul;
  ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];

  if(eflag || eflag_atom) {
    sharedE = &sharedmem[threadIdx.x];
    sharedE[0] = ENERGY_F(0.0);
    sharedV += blockDim.x;

    if(coul_type != COUL_NONE) {
      sharedECoul = sharedE + blockDim.x;
      sharedECoul[0] = ENERGY_F(0.0);
      sharedV += blockDim.x;
    }
  }

  if(vflag || vflag_atom) {
    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
  }

  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  X_CFLOAT xtmp, ytmp, ztmp;
  X_CFLOAT4 myxtype;
  F_CFLOAT fxtmp, fytmp, fztmp, fpair;
  F_CFLOAT delx, dely, delz;
  F_CFLOAT factor_lj, factor_coul;
  F_CFLOAT qtmp;
  int itype, i, j;
  int jnum = 0;
  int* jlist;

  if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
    i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii] ;

    myxtype = fetchXType(i);
    myxtype = _x_type[i];
    xtmp = myxtype.x;
    ytmp = myxtype.y;
    ztmp = myxtype.z;
    itype = static_cast <int>(myxtype.w);


    fxtmp = F_F(0.0);
    fytmp = F_F(0.0);
    fztmp = F_F(0.0);

    if(coul_type != COUL_NONE)
      qtmp = fetchQ(i);

    jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);


    jlist = comm_phase == 0 ? &_neighbors[i] : (comm_phase == 1 ? &_neighbors_inner[i] : &_neighbors_border[ii]);
  }

  __syncthreads();

  for(int jj = 0; jj < jnum; jj++) {
    if(ii < (comm_phase < 2 ? _inum : _inum_border[0]))
      if(jj < jnum) {
        fpair = F_F(0.0);
        j = jlist[jj * _nlocal];

        factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall];

        if(coul_type != COUL_NONE)
          factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];

        j = j < _nall ? j : j % _nall;

        myxtype = fetchXType(j);
        delx = xtmp - myxtype.x;
        dely = ytmp - myxtype.y;
        delz = ztmp - myxtype.z;
        int jtype = static_cast <int>(myxtype.w);


        const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

        bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);

        if(in_cutoff) {
          switch(pair_type) {
            case PAIR_BORN:
              fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_BUCK:
              fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_CG_CMM:
              fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CHARMM:
              fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CLASS2:
              fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CUT:
              fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_EXPAND:
              fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_GROMACS:
              fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_SMOOTH:
              fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ96_CUT:
              fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_MORSE_R6:
              fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_MORSE:
              fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;
          }
        }

        if(coul_type != COUL_NONE) {
          const F_CFLOAT qiqj = qtmp * fetchQ(j);

          if(qiqj * qiqj > 1e-8) {
            const bool in_coul_cutoff =
              rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);

            if(in_coul_cutoff) {
              switch(coul_type) {
                case COUL_CHARMM:
                  fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_CHARMM_IMPLICIT:
                  fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_CUT: {
                  const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);

                  if(eflag) {
                    ecoul += forcecoul;
                  }

                  fpair += forcecoul * (F_F(1.0) / rsq);
                }
                break;

                case COUL_DEBYE: {
                  const F_CFLOAT r2inv = F_F(1.0) / rsq;
                  const X_CFLOAT r = _RSQRT_(r2inv);
                  const X_CFLOAT rinv = F_F(1.0) / r;
                  const F_CFLOAT screening = _EXP_(-_kappa * r);
                  F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;

                  if(eflag) {
                    ecoul += forcecoul * rinv;
                  }

                  forcecoul *= (_kappa + rinv);
                  fpair += forcecoul * r2inv;
                }
                break;

                case COUL_GROMACS:
                  fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_LONG: {
                  const F_CFLOAT r2inv = F_F(1.0) / rsq;
                  const F_CFLOAT r = _RSQRT_(r2inv);
                  const F_CFLOAT grij = _g_ewald * r;
                  const F_CFLOAT expm2 = _EXP_(-grij * grij);
                  const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
                  const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
                  const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
                  F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);

                  if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;

                  if(eflag) {
                    ecoul += prefactor * erfc;

                    if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
                  }

                  fpair += forcecoul * r2inv;
                }
                break;

              }
            }

            in_cutoff = in_cutoff || in_coul_cutoff;
          }
        }


        if(in_cutoff) {
          F_CFLOAT dxfp, dyfp, dzfp;
          fxtmp += dxfp = delx * fpair;
          fytmp += dyfp = dely * fpair;
          fztmp += dzfp = delz * fpair;

          if(vflag) {
            sharedV[0 * blockDim.x] += delx * dxfp;
            sharedV[1 * blockDim.x] += dely * dyfp;
            sharedV[2 * blockDim.x] += delz * dzfp;
            sharedV[3 * blockDim.x] += delx * dyfp;
            sharedV[4 * blockDim.x] += delx * dzfp;
            sharedV[5 * blockDim.x] += dely * dzfp;
          }
        }
      }
  }

  __syncthreads();

  if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
    F_CFLOAT* my_f;

    if(_collect_forces_later) {
      ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

      if(eflag) {
        buffer = &buffer[1 * gridDim.x * gridDim.y];

        if(coul_type != COUL_NONE)
          buffer = &buffer[1 * gridDim.x * gridDim.y];
      }

      if(vflag) {
        buffer = &buffer[6 * gridDim.x * gridDim.y];
      }

      my_f = (F_CFLOAT*) buffer;
      my_f += i;
      *my_f = fxtmp;
      my_f += _nmax;
      *my_f = fytmp;
      my_f += _nmax;
      *my_f = fztmp;
    } else {
      my_f = _f + i;
      *my_f += fxtmp;
      my_f += _nmax;
      *my_f += fytmp;
      my_f += _nmax;
      *my_f += fztmp;
    }
  }

  __syncthreads();

  if(eflag) {
    sharedE[0] = evdwl;

    if(coul_type != COUL_NONE)
      sharedECoul[0] = ecoul;
  }

  if(eflag_atom && i < _nlocal) {
    if(coul_type != COUL_NONE)
      _eatom[i] += evdwl + ecoul;
    else
      _eatom[i] += evdwl;
  }

  if(vflag_atom && i < _nlocal) {
    _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
    _vatom[i + _nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
    _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
    _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
    _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
    _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
  }

  if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
}

template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
{
  int ii = (blockIdx.x * gridDim.y + blockIdx.y);

  if(ii >= (comm_phase < 2 ? _inum : _inum_border[0]))
    return;

  ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
  ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
  F_CFLOAT3* sharedVirial1;
  F_CFLOAT3* sharedVirial2;
  F_CFLOAT* sharedEnergy;
  F_CFLOAT* sharedEnergyCoul;

  F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0];

  if(vflag) {
    sharedVirial1 = &sharedForce[64];
    sharedVirial2 = &sharedVirial1[64];
  } else {
    sharedVirial1 = &sharedForce[0];
    sharedVirial2 = &sharedVirial1[0];
  }

  if(eflag) {
    if(vflag || vflag_atom)
      sharedEnergy = (F_CFLOAT*) &sharedVirial2[64];
    else
      sharedEnergy = (F_CFLOAT*) &sharedForce[64];

    if(coul_type != COUL_NONE)
      sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64];

  }

  F_CFLOAT3 partialForce = { F_F(0.0),  F_F(0.0),  F_F(0.0) };
  F_CFLOAT3 partialVirial1 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
  F_CFLOAT3 partialVirial2 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };

  X_CFLOAT xtmp, ytmp, ztmp;
  X_CFLOAT4 myxtype;
  F_CFLOAT delx, dely, delz;
  F_CFLOAT factor_lj, factor_coul;
  F_CFLOAT fpair;
  F_CFLOAT qtmp;
  int itype, jnum, i, j;
  int* jlist;

  i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii];

  myxtype = fetchXType(i);

  xtmp = myxtype.x;
  ytmp = myxtype.y;
  ztmp = myxtype.z;
  itype = static_cast <int>(myxtype.w);

  if(coul_type != COUL_NONE)
    qtmp = fetchQ(i);

  jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);

  jlist = comm_phase == 0 ? &_neighbors[i * _maxneighbors] : (comm_phase == 1 ? &_neighbors_inner[i * _maxneighbors] : &_neighbors_border[ii * _maxneighbors]);
  __syncthreads();

  for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
    if(jj < jnum) {
      fpair = F_F(0.0);
      j = jlist[jj];
      factor_lj   = j < _nall ? F_F(1.0) : _special_lj[j / _nall];

      if(coul_type != COUL_NONE)
        factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];

      j 			= j < _nall ? j : j % _nall;

      myxtype = fetchXType(j);

      delx = xtmp - myxtype.x;
      dely = ytmp - myxtype.y;
      delz = ztmp - myxtype.z;
      int jtype = static_cast <int>(myxtype.w);

      const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

      bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
      bool in_coul_cutoff;

      if(in_cutoff) {
        switch(pair_type) {
          case PAIR_BORN:
            fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_BUCK:
            fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_CG_CMM:
            fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_CHARMM:
            fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_CLASS2:
            fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_CUT:
            fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_EXPAND:
            fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_GROMACS:
            fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_SMOOTH:
            fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ96_CUT:
            fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_MORSE_R6:
            fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_MORSE:
            fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;
        }
      }

      if(coul_type != COUL_NONE) {
        const F_CFLOAT qiqj = qtmp * fetchQ(j);

        if(qiqj * qiqj > (1e-8f)) {
          in_coul_cutoff =
            rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);

          if(in_coul_cutoff) {
            switch(coul_type) {
              case COUL_CHARMM:
                fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                break;

              case COUL_CHARMM_IMPLICIT:
                fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                break;

              case COUL_GROMACS:
                fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
                break;

              case COUL_LONG: {
                const F_CFLOAT r2inv = F_F(1.0) / rsq;
                const F_CFLOAT r = _RSQRT_(r2inv);
                const F_CFLOAT grij = _g_ewald * r;
                const F_CFLOAT expm2 = _EXP_(-grij * grij);
                const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
                const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
                const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
                F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);

                if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;

                if(eflag) {
                  ecoul += prefactor * erfc;

                  if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
                }

                fpair += forcecoul * r2inv;
              }
              break;

              case COUL_DEBYE: {
                const F_CFLOAT r2inv = F_F(1.0) / rsq;
                const X_CFLOAT r = _RSQRT_(r2inv);
                const X_CFLOAT rinv = F_F(1.0) / r;
                const F_CFLOAT screening = _EXP_(-_kappa * r);
                F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;

                if(eflag) {
                  ecoul += forcecoul * rinv;
                }

                forcecoul *= (_kappa + rinv);
                fpair += forcecoul * r2inv;
              }
              break;

              case COUL_CUT: {
                const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);

                if(eflag) {
                  ecoul += forcecoul;
                }

                fpair += forcecoul * (F_F(1.0) / rsq);
              }
              break;


            }
          }
        }
      }


      if(in_cutoff || in_coul_cutoff) {
        F_CFLOAT dxfp, dyfp, dzfp;
        partialForce.x += dxfp = delx * fpair;
        partialForce.y += dyfp = dely * fpair;
        partialForce.z += dzfp = delz * fpair;

        if(vflag) {
          partialVirial1.x += delx * dxfp;
          partialVirial1.y += dely * dyfp;
          partialVirial1.z += delz * dzfp;
          partialVirial2.x += delx * dyfp;
          partialVirial2.y += delx * dzfp;
          partialVirial2.z += dely * dzfp;
        }
      }
    }
  }

  if(eflag) {
    sharedEnergy[threadIdx.x] = evdwl;

    if(coul_type != COUL_NONE)
      sharedEnergyCoul[threadIdx.x] = ecoul;
  }

  sharedForce[threadIdx.x] = partialForce;

  if(vflag) {
    sharedVirial1[threadIdx.x] = partialVirial1;
    sharedVirial2[threadIdx.x] = partialVirial2;
  }

  __syncthreads();


  for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {

    if(threadIdx.x < s) {
      sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
      sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
      sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;

      if(vflag) {
        sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
        sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
        sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;

        sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
        sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
        sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
      }

      if(eflag) {
        sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];

        if(coul_type != COUL_NONE)
          sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
      }
    }

    __syncthreads();
  }

  if(threadIdx.x == 0) {

    ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

    if(eflag) {
      ENERGY_CFLOAT tmp_evdwl;
      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];

      if(eflag_atom)
        _eatom[i] = tmp_evdwl;

      buffer = &buffer[gridDim.x * gridDim.y];

      if(coul_type != COUL_NONE) {
        buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];

        if(eflag_atom)
          _eatom[i] += tmp_evdwl;

        buffer = &buffer[gridDim.x * gridDim.y];
      }
    }

    if(vflag) {
      ENERGY_CFLOAT tmp;
      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;

      if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;

      if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;

      if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;

      if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;

      if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;

      if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;

      buffer = &buffer[6 * gridDim.x * gridDim.y];
    }

    F_CFLOAT* my_f;

    if(_collect_forces_later) {
      my_f = (F_CFLOAT*) buffer;
      my_f += i;
      *my_f = sharedForce[0].x;
      my_f += _nmax;
      *my_f = sharedForce[0].y;
      my_f += _nmax;
      *my_f = sharedForce[0].z;
    } else {
      my_f = _f + i;
      *my_f += sharedForce[0].x;
      my_f += _nmax;
      *my_f += sharedForce[0].y;
      my_f += _nmax;
      *my_f += sharedForce[0].z;
    }
  }
}

__global__ void Pair_GenerateXType_Kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nall) {
    X_CFLOAT4 xtype;
    xtype.x = _x[i];
    xtype.y = _x[i + _nmax];
    xtype.z = _x[i + 2 * _nmax];
    xtype.w = _type[i];
    _x_type[i] = xtype;
  }

}

__global__ void Pair_GenerateVRadius_Kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nall) {
    V_CFLOAT4 vradius;
    vradius.x = _v[i];
    vradius.y = _v[i + _nmax];
    vradius.z = _v[i + 2 * _nmax];
    vradius.w = _radius[i];
    _v_radius[i] = vradius;
  }
}

__global__ void Pair_GenerateOmegaRmass_Kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nall) {
    V_CFLOAT4 omegarmass;
    omegarmass.x = _omega[i];
    omegarmass.y = _omega[i + _nmax];
    omegarmass.z = _omega[i + 2 * _nmax];
    omegarmass.w = _rmass[i];
    _omega_rmass[i] = omegarmass;
  }
}

__global__ void Pair_RevertXType_Kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nall) {
    X_CFLOAT4 xtype = _x_type[i];
    _x[i] = xtype.x;
    _x[i + _nmax] = xtype.y;
    _x[i + 2 * _nmax] = xtype.z;
    _type[i] = static_cast <int>(xtype.w);
  }

}

__global__ void Pair_BuildXHold_Kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nall) {
    X_CFLOAT4 xtype = _x_type[i];
    _xhold[i] = xtype.x;
    _xhold[i + _nmax] = xtype.y;
    _xhold[i + 2 * _nmax] = xtype.z;
  }

}

__global__ void Pair_CollectForces_Kernel(int nperblock, int n)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nlocal) return;

  ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;

  F_CFLOAT* buf_f = (F_CFLOAT*) &buf[nperblock * n];
  F_CFLOAT* my_f = _f + i;
  buf_f += i;
  *my_f += * buf_f;
  my_f += _nmax;
  buf_f += _nmax;
  *my_f += * buf_f;
  my_f += _nmax;
  buf_f += _nmax;
  *my_f += * buf_f;
  my_f += _nmax;
}