lammps/lib/gpu/gb_gpu_kernel.h

/***************************************************************************
                              gb_gpu_kernel.cu
                             -------------------
                               W. Michael Brown

  Routines that actually perform the force/torque computation

   *** Force Decomposition by Atom Version ***
 __________________________________________________________________________
    This file is part of the LAMMPS GPU Library
 __________________________________________________________________________

    begin                : Tue Jun 23 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : wmbrown@sandia.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   ----------------------------------------------------------------------- */

#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL

#include "gb_gpu_extra.h"

template <class numtyp>
static __inline__ __device__ void compute_eta_torque(numtyp m[9],
                                                     numtyp m2[9],
                                                     const int i,
                                                     numtyp ans[9])
{
  numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
    m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
    m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
  den = (numtyp)1.0/den;

  numtyp shapex=_shape_<numtyp>(i,0);
  numtyp shapey=_shape_<numtyp>(i,1);
  numtyp shapez=_shape_<numtyp>(i,2);

  ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
		    m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
		    m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
		    m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
		    m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;

  ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
		    (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
		    (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
		    m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
		    m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;

  ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
		    m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
		    m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
		    (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
		    m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;

  ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
		    m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
		    m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
		    m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
		    m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;

  ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
		    (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
		    (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
		    m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
		    m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;

  ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
		    m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
		    (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
		    m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
		    (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;

  ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
		    (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
		    m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
		    m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
		    m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;

  ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
		     (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
		     (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
		     m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
		     m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;

  ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
		    m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
		    m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
		    (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
		    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}

template<class numtyp, class acctyp>
__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
                                const int *dev_nbor, const int nbor_pitch,
                                acctyp *ans, size_t ans_pitch, int *err_flag,
                                const bool eflag, const bool vflag,
                                const int inum, const int nall) {

  __shared__ numtyp sp_lj[4];

   // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  ii+=INT_MUL(blockIdx.x,blockDim.x);
  __syncthreads();

  if (ii<inum) {

  acctyp energy=(numtyp)0;
  acctyp fx=(numtyp)0;
  acctyp fy=(numtyp)0;
  acctyp fz=(numtyp)0;
  acctyp torx=(numtyp)0;
  acctyp tory=(numtyp)0;
  acctyp torz=(numtyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(numtyp)0;

  const int *nbor=dev_nbor+ii;
  int i=*nbor;
  nbor+=nbor_pitch;
  nbor+=nbor_pitch;
  nbor+=nbor_pitch;
  int numj=*nbor;
  nbor+=nbor_pitch;
  const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);

  numtyp ix=_x_<numtyp>(i,0);
  numtyp iy=_x_<numtyp>(i,1);
  numtyp iz=_x_<numtyp>(i,2);
  int itype=_x_<numtyp>(i,7);
  numtyp a1[9], b1[9], g1[9];
  {
    numtyp t[9];
    gpu_quat_to_mat_trans(i,a1);
    gpu_shape_times3(itype,a1,t);
    gpu_transpose_times3(a1,t,g1);
    gpu_well_times3(itype,a1,t);
    gpu_transpose_times3(a1,t,b1);
  }

  numtyp factor_lj;
  for ( ; nbor<nbor_end; nbor+=nbor_pitch) {

  int j=*nbor;
  if (j < nall)
    factor_lj = (numtyp)1.0;
  else {
    factor_lj = sp_lj[j/nall];
    j %= nall;
  }
  int jtype=_x_<numtyp>(j,7);

  // Compute r12
  numtyp r12[3];
  r12[0] = _x_<numtyp>(j,0)-ix;
  r12[1] = _x_<numtyp>(j,1)-iy;
  r12[2] = _x_<numtyp>(j,2)-iz;
  numtyp ir = gpu_dot3(r12,r12);

  ir = rsqrt(ir);
  numtyp r = (numtyp)1.0/ir;

  numtyp a2[9];
  gpu_quat_to_mat_trans(j,a2);

  numtyp u_r, dUr[3], tUr[3], eta, teta[3];
  { // Compute U_r, dUr, eta, and teta
    // Compute g12
    numtyp g12[9];
    {
      numtyp g2[9];
      {
          gpu_shape_times3(jtype,a2,g12);
          gpu_transpose_times3(a2,g12,g2);
          gpu_plus3(g1,g2,g12);
      }

      { // Compute U_r and dUr

        // Compute kappa
        numtyp kappa[3];
        gpu_mldivide3(g12,r12,kappa,err_flag);

        // -- replace r12 with r12 hat
        r12[0]*=ir;
        r12[1]*=ir;
        r12[2]*=ir;

        // -- kappa is now / r
        kappa[0]*=ir;
        kappa[1]*=ir;
        kappa[2]*=ir;

        // energy

        // compute u_r and dUr
        numtyp uslj_rsq;
        {
          // Compute distance of closest approach
          numtyp h12, sigma12;
          sigma12 = gpu_dot3(r12,kappa);
          sigma12 = rsqrt((numtyp)0.5*sigma12);
          h12 = r-sigma12;

          // -- kappa is now ok
          kappa[0]*=r;
          kappa[1]*=r;
          kappa[2]*=r;

          numtyp sigma = _sigma_<numtyp>(itype,jtype);
          numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
          numtyp varrho = sigma/(h12+gum[0]*sigma);
          numtyp varrho6 = varrho*varrho*varrho;
          varrho6*=varrho6;
          numtyp varrho12 = varrho6*varrho6;
          u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

          numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
          temp1 = temp1*(numtyp)24.0*epsilon;
          uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
          numtyp temp2 = gpu_dot3(kappa,r12);
          uslj_rsq = uslj_rsq*ir*ir;

          dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
          dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
          dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
        }

        // torque for particle 1
        {
          numtyp tempv[3], tempv2[3];
          tempv[0] = -uslj_rsq*kappa[0];
          tempv[1] = -uslj_rsq*kappa[1];
          tempv[2] = -uslj_rsq*kappa[2];
          gpu_row_times3(kappa,g1,tempv2);
          gpu_cross3(tempv,tempv2,tUr);
        }
      }
    }

    // Compute eta
    {
      eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
      numtyp det_g12 = gpu_det3(g12);
      eta = pow(eta/det_g12,gum[1]);
    }

    // Compute teta
    numtyp temp[9], tempv[3], tempv2[3];
    compute_eta_torque(g12,a1,itype,temp);
    numtyp temp1 = -eta*gum[1];

    tempv[0] = temp1*temp[0];
    tempv[1] = temp1*temp[1];
    tempv[2] = temp1*temp[2];
    gpu_cross3(a1,tempv,tempv2);
    teta[0] = tempv2[0];
    teta[1] = tempv2[1];
    teta[2] = tempv2[2];

    tempv[0] = temp1*temp[3];
    tempv[1] = temp1*temp[4];
    tempv[2] = temp1*temp[5];
    gpu_cross3(a1+3,tempv,tempv2);
    teta[0] += tempv2[0];
    teta[1] += tempv2[1];
    teta[2] += tempv2[2];

    tempv[0] = temp1*temp[6];
    tempv[1] = temp1*temp[7];
    tempv[2] = temp1*temp[8];
    gpu_cross3(a1+6,tempv,tempv2);
    teta[0] += tempv2[0];
    teta[1] += tempv2[1];
    teta[2] += tempv2[2];
  }

  numtyp chi, dchi[3], tchi[3];
  { // Compute chi and dchi

    // Compute b12
    numtyp b2[9], b12[9];
    {
      gpu_well_times3(jtype,a2,b12);
      gpu_transpose_times3(a2,b12,b2);
      gpu_plus3(b1,b2,b12);
    }

    // compute chi_12
    r12[0]*=r;
    r12[1]*=r;
    r12[2]*=r;
    numtyp iota[3];
    gpu_mldivide3(b12,r12,iota,err_flag);
    // -- iota is now iota/r
    iota[0]*=ir;
    iota[1]*=ir;
    iota[2]*=ir;
    r12[0]*=ir;
    r12[1]*=ir;
    r12[2]*=ir;
    chi = gpu_dot3(r12,iota);
    chi = pow(chi*(numtyp)2.0,gum[2]);

    // -- iota is now ok
    iota[0]*=r;
    iota[1]*=r;
    iota[2]*=r;

    numtyp temp1 = gpu_dot3(iota,r12);
    numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
    dchi[0] = temp2*(iota[0]-temp1*r12[0]);
    dchi[1] = temp2*(iota[1]-temp1*r12[1]);
    dchi[2] = temp2*(iota[2]-temp1*r12[2]);

    // compute t_chi
    numtyp tempv[3];
    gpu_row_times3(iota,b1,tempv);
    gpu_cross3(tempv,iota,tchi);
    temp1 = (numtyp)-4.0*ir*ir;
    tchi[0] *= temp1;
    tchi[1] *= temp1;
    tchi[2] *= temp1;
  }

  numtyp temp2 = factor_lj*eta*chi;
  if (eflag)
    energy+=u_r*temp2;
  numtyp temp1 = -eta*u_r*factor_lj;
  if (vflag) {
    r12[0]*=-r;
    r12[1]*=-r;
    r12[2]*=-r;
    numtyp ft=temp1*dchi[0]-temp2*dUr[0];
    fx+=ft;
    virial[0]+=r12[0]*ft;
    ft=temp1*dchi[1]-temp2*dUr[1];
    fy+=ft;
    virial[1]+=r12[1]*ft;
    virial[3]+=r12[0]*ft;
    ft=temp1*dchi[2]-temp2*dUr[2];
    fz+=ft;
    virial[2]+=r12[2]*ft;
    virial[4]+=r12[0]*ft;
    virial[5]+=r12[1]*ft;
  } else {
    fx+=temp1*dchi[0]-temp2*dUr[0];
    fy+=temp1*dchi[1]-temp2*dUr[1];
    fz+=temp1*dchi[2]-temp2*dUr[2];
  }

  // Torque on 1
  temp1 = -u_r*eta*factor_lj;
  temp2 = -u_r*chi*factor_lj;
  numtyp temp3 = -chi*eta*factor_lj;
  torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
  tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
  torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];

  } // for nbor

  // Store answers
  acctyp *ap1=ans+ii;
  if (eflag) {
    *ap1=energy;
    ap1+=ans_pitch;
  }
  if (vflag) {
    for (int i=0; i<6; i++) {
      *ap1=virial[i];
      ap1+=ans_pitch;
    }
  }
  *ap1=fx;
  ap1+=ans_pitch;
  *ap1=fy;
  ap1+=ans_pitch;
  *ap1=fz;
  ap1+=ans_pitch;
  *ap1=torx;
  ap1+=ans_pitch;
  *ap1=tory;
  ap1+=ans_pitch;
  *ap1=torz;

  } // if ii
}

template<class numtyp, class acctyp>
__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
                                 const int *dev_nbor, const int nbor_pitch,
                                 acctyp *ans, size_t ans_pitch, int *err_flag,
                                 const bool eflag, const bool vflag,
                                 const int start, const int inum,
                                 const int nall) {
  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,7);

    numtyp oner=_shape_<numtyp>(itype,0);
    numtyp one_well=_well_<numtyp>(itype,0);

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int jtype=_x_<numtyp>(j,7);

      // Compute r12
      numtyp r12[3];
      r12[0] = _x_<numtyp>(j,0)-ix;
      r12[1] = _x_<numtyp>(j,1)-iy;
      r12[2] = _x_<numtyp>(j,2)-iz;
      numtyp ir = gpu_dot3(r12,r12);

      ir = rsqrt(ir);
      numtyp r = (numtyp)1.0/ir;

      numtyp r12hat[3];
      r12hat[0]=r12[0]*ir;
      r12hat[1]=r12[1]*ir;
      r12hat[2]=r12[2]*ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(j,a2);

      numtyp u_r, dUr[3], eta;
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          {
            numtyp g2[9];
            gpu_shape_times3(jtype,a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            g12[0]=g2[0]+oner;
            g12[4]=g2[4]+oner;
            g12[8]=g2[8]+oner;
            g12[1]=g2[1];
            g12[2]=g2[2];
            g12[3]=g2[3];
            g12[5]=g2[5];
            g12[6]=g2[6];
            g12[7]=g2[7];
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12hat,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              numtyp sigma = _sigma_<numtyp>(itype,jtype);
              numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12hat);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
              dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
              dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
        }
      }

      numtyp chi, dchi[3];
      { // Compute chi and dchi

        // Compute b12
        numtyp b12[9];
        {
          numtyp b2[9];
          gpu_well_times3(jtype,a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          b12[0]=b2[0]+one_well;
          b12[4]=b2[4]+one_well;
          b12[8]=b2[8]+one_well;
          b12[1]=b2[1];
          b12[2]=b2[2];
          b12[3]=b2[3];
          b12[5]=b2[5];
          b12[6]=b2[6];
          b12[7]=b2[7];
        }

        // compute chi_12
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        chi = gpu_dot3(r12hat,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);

        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12hat);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
      }

      numtyp temp2 = factor_lj*eta*chi;
      if (eflag)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag) {
        r12[0]*=-1;
        r12[1]*=-1;
        r12[2]*=-1;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        fx+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        fy+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        fz+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        fx+=temp1*dchi[0]-temp2*dUr[0];
        fy+=temp1*dchi[1]-temp2*dUr[1];
        fz+=temp1*dchi[2]-temp2*dUr[2];
      }
    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1=fx;
    ap1+=ans_pitch;
    *ap1=fy;
    ap1+=ans_pitch;
    *ap1=fz;
  } // if ii
}

template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
                          const int *dev_ij, const int nbor_pitch, acctyp *ans,
                          size_t ans_pitch, int *err_flag, const bool eflag,
                          const bool vflag, const int start, const int inum,
                          const int nall) {
  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,7);

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int jtype=_x_<numtyp>(j,7);

      // Compute r12
      numtyp delx = ix-_x_<numtyp>(j,0);
      numtyp dely = iy-_x_<numtyp>(j,1);
      numtyp delz = iz-_x_<numtyp>(j,2);
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
          _form_(itype,jtype)==SPHERE_SPHERE) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
                                    _lj1_<numtyp>(itype,jtype).y);
        force*=factor_lj;

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
                          _lj3_<numtyp>(itype,jtype).y);
          energy+=factor_lj*(e-_offset_<numtyp>(1,1));
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1+=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1+=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1+=fx;
    ap1+=ans_pitch;
    *ap1+=fy;
    ap1+=ans_pitch;
    *ap1+=fz;

  } // if ii
}

template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
                               const int *dev_ij, const int nbor_pitch,
                               acctyp *ans, size_t ans_pitch,int *err_flag,
                               const bool eflag, const bool vflag,
                               const int start, const int inum, const int nall){
  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  __shared__ numtyp sp_lj[4];
  __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    form[ii]=_form_(itype,jtype);
    lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
    lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
    if (eflag) {
      lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
      lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
      offset[ii]=_offset_<numtyp>(itype,jtype);
    }
  }
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int mtype=itype+_x_<numtyp>(j,7);

      // Compute r12
      numtyp delx = ix-_x_<numtyp>(j,0);
      numtyp dely = iy-_x_<numtyp>(j,1);
      numtyp delz = iz-_x_<numtyp>(j,2);
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
          energy+=factor_lj*(e-offset[mtype]);
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1+=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1+=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1+=fx;
    ap1+=ans_pitch;
    *ap1+=fy;
    ap1+=ans_pitch;
    *ap1+=fz;

  } // if ii
}

#endif