git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6407 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-06-14 13:34:06 +00:00 · 2011-06-14 13:34:06 +00:00 · 945343265e
parent 9418bacff7
commit 945343265e
8 changed files with 3622 additions and 0 deletions
--- a/lib/gpu/cmmc_msm_gpu_kernel.cu
+++ b/lib/gpu/cmmc_msm_gpu_kernel.cu
@ -0,0 +1,531 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef CMMM_GPU_KERNEL
+#define CMMM_GPU_KERNEL
+
+#ifdef NV_KERNEL
+
+#include "nv_kernel_def.h"
+texture<float4> pos_tex;
+texture<float> q_tex;
+
+#ifdef _DOUBLE_DOUBLE
+__inline double4 fetch_pos(const int& i, const double4 *pos)
+{
+  return pos[i];
+}
+__inline double fetch_q(const int& i, const double *q)
+{
+  return q[i];
+}
+#else
+__inline float4 fetch_pos(const int& i, const float4 *pos)
+{
+  return tex1Dfetch(pos_tex, i);
+}
+__inline float fetch_q(const int& i, const float *q)
+{
+  return tex1Dfetch(q_tex, i);
+}
+#endif
+
+#else
+
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+#define __inline inline
+
+#define fetch_pos(i,y) x_[i]
+#define fetch_q(i,y) q_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
+
+#endif
+
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
+__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+                          __global numtyp4* lj3, const int lj_types, 
+                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum,
+                          const int nbor_pitch, __global numtyp *q_,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const int smooth, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[8];
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  sp_lj[4]=sp_lj_in[4];
+  sp_lj[5]=sp_lj_in[5];
+  sp_lj[6]=sp_lj_in[6];
+  sp_lj[7]=sp_lj_in[7];
+  __local numtyp _ia;
+  __local numtyp _ia2;
+  __local numtyp _ia3;
+
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
+  if (ii<inum) {
+    _ia=(numtyp)-1.0/sqrt(cut_coulsq);
+    _ia2=(numtyp)-1.0/cut_coulsq;
+    _ia3=_ia2*_ia;
+    
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
+  
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp qtmp=fetch_q(i,q_);
+    int itype=ix.w;
+
+    for ( ; nbor<list_end; nbor+=n_stride) {
+      int j=*nbor;
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;
+      if (rsq<lj1[mtype].x) {
+        numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
+        numtyp r2inv=(numtyp)1.0/rsq;
+
+        if (rsq < lj1[mtype].y) {
+          if (lj3[mtype].x == (numtyp)2) {
+            inv1=r2inv*r2inv;
+            inv2=inv1*inv1;
+          } else if (lj3[mtype].x == (numtyp)1) {
+            inv2=r2inv*sqrt(r2inv);
+            inv1=inv2*inv2;
+          } else {
+            inv1=r2inv*r2inv*r2inv;
+            inv2=inv1;
+          }
+          force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
+        } else
+          force_lj = (numtyp)0.0;
+
+        numtyp ir, r2_ia2, r4_ia4, r6_ia6;
+        if (rsq < cut_coulsq) {
+          ir = (numtyp)1.0/sqrt(rsq);
+          prefactor = qqrd2e*qtmp*fetch_q(j,q_);
+          r2_ia2 = rsq*_ia2;
+          r4_ia4 = r2_ia2*r2_ia2;
+          if (smooth==0)
+            forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
+                                        (numtyp)1.875*r4_ia4)-ir/rsq-
+                                        factor_coul*ir);
+          else {
+            r6_ia6 = r2_ia2*r4_ia4;
+            forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
+                                         r2_ia2-(numtyp)8.4375*r4_ia4+
+                                         (numtyp)2.1875*r6_ia6)-ir/rsq-
+                                         factor_coul*ir);
+          }
+        } else {
+          forcecoul = (numtyp)0.0;
+          prefactor = (numtyp)0.0;
+        }
+
+        force = forcecoul + force_lj * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          if (rsq < cut_coulsq)
+            if (smooth==0)
+              e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
+                                           (numtyp)1.3125*r4_ia4-
+                                           (numtyp)0.3125*r4_ia4*r2_ia2)-
+                                           factor_coul*ir);
+            else
+              e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
+                                           r2_ia2+(numtyp)2.953125*r4_ia4-
+                                           (numtyp)1.40625*r6_ia6+
+                                           (numtyp)0.2734375*r4_ia4*r4_ia4));
+              
+          if (rsq < lj1[mtype].y) {
+            energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
+                      lj3[mtype].w;
+          } 
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+      *ap1=e_coul;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+                               __global numtyp4* lj3_in, 
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
+                               __global acctyp4 *ans, __global acctyp *engv, 
+                               const int eflag, const int vflag, const int inum, 
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const int smooth, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[8];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    lj3[tid]=lj3_in[tid];
+  }
+  
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
+  __local numtyp _ia;
+  __local numtyp _ia2;
+  __local numtyp _ia3;
+  _ia=(numtyp)-1.0/sqrt(cut_coulsq);
+  _ia2=(numtyp)1.0/cut_coulsq;
+  _ia3=_ia2*_ia;
+  __syncthreads();
+  
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
+  
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp qtmp=fetch_q(i,q_);
+    int iw=ix.w;
+    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+
+    for ( ; nbor<list_end; nbor+=n_stride) {
+      int j=*nbor;
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<lj1[mtype].x) {
+        numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
+        numtyp r2inv=(numtyp)1.0/rsq;
+
+        if (rsq < lj1[mtype].y) {
+          if (lj3[mtype].x == (numtyp)2) {
+            inv1=r2inv*r2inv;
+            inv2=inv1*inv1;
+          } else if (lj3[mtype].x == (numtyp)1) {
+            inv2=r2inv*sqrt(r2inv);
+            inv1=inv2*inv2;
+          } else {
+            inv1=r2inv*r2inv*r2inv;
+            inv2=inv1;
+          }
+          force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
+        } else
+          force_lj = (numtyp)0.0;
+
+        numtyp ir, r2_ia2, r4_ia4, r6_ia6;
+        if (rsq < cut_coulsq) {
+          ir = (numtyp)1.0/sqrt(rsq);
+          prefactor = qqrd2e*qtmp*fetch_q(j,q_);
+          r2_ia2 = rsq*_ia2;
+          r4_ia4 = r2_ia2*r2_ia2;
+          if (smooth==0)
+            forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
+                                        (numtyp)1.875*r4_ia4)-ir/rsq-
+                                        factor_coul*ir);
+          else {
+            r6_ia6 = r2_ia2*r4_ia4;
+            forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
+                                         r2_ia2-(numtyp)8.4375*r4_ia4+
+                                         (numtyp)2.1875*r6_ia6)-ir/rsq-
+                                         factor_coul*ir);
+          }
+        } else {
+          forcecoul = (numtyp)0.0;
+          prefactor = (numtyp)0.0;
+        }
+
+        force = forcecoul + force_lj * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          if (rsq < cut_coulsq)
+            if (smooth==0)
+              e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
+                                           (numtyp)1.3125*r4_ia4-
+                                           (numtyp)0.3125*r4_ia4*r2_ia2)-
+                                           factor_coul*ir);
+            else
+              e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
+                                           r2_ia2+(numtyp)2.953125*r4_ia4-
+                                           (numtyp)1.40625*r6_ia6+
+                                           (numtyp)0.2734375*r4_ia4*r4_ia4));
+          if (rsq < lj1[mtype].y) {
+            energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
+                      lj3[mtype].w;
+          } 
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+      *ap1=e_coul;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii*/
+}
+
+#endif
+
--- a/lib/gpu/ellipsoid_nbor.cu
+++ b/lib/gpu/ellipsoid_nbor.cu
@ -0,0 +1,165 @@
+// **************************************************************************
+//                              ellipsoid_nbor.cu
+//                             -------------------
+//                               W. Michael Brown
+//
+//  Device code for Ellipsoid neighbor routines
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : brownw@ornl.gov
+// ***************************************************************************/
+
+#ifndef ELLIPSOID_NBOR_H
+#define ELLIPSOID_NBOR_H
+
+#ifdef NV_KERNEL
+
+#include "nv_kernel_def.h"
+
+#else
+
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+#define MAX_SHARED_TYPES 8
+
+#endif
+
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#else
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#endif
+
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+
+// ---------------------------------------------------------------------------
+// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
+// -- Only unpack neighbors matching the specified inclusive range of forms
+// -- Only unpack neighbors within cutoff
+// ---------------------------------------------------------------------------
+__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form, 
+                          const int ntypes, __global int *dev_nbor,
+                          const int nbor_pitch, const int start, const int inum, 
+                          __global int *dev_ij, const int form_low, 
+                          const int form_high) {
+                                
+  // ii indexes the two interacting particles in gi
+  int ii=GLOBAL_ID_X+start;
+
+  if (ii<inum) {
+    __global int *nbor=dev_ij+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+  
+    numtyp4 ix=x_[i];
+    int iw=ix.w;
+    int itype=mul24(iw,ntypes);
+    int newj=0;  
+    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+      int j=*nbor;
+      j &= NEIGHMASK;
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+      int mtype=itype+jtype;
+      numtyp2 cf=cut_form[mtype];
+      if (cf.y>=form_low && cf.y<=form_high) {
+        // Compute r12;
+        numtyp rsq=jx.x-ix.x;
+        rsq*=rsq;
+        numtyp t=jx.y-ix.y;
+        rsq+=t*t;
+        t=jx.z-ix.z;
+        rsq+=t*t;
+
+        if (rsq<cf.x) {
+          *packed=j;
+          packed+=nbor_pitch;
+          newj++;
+        }
+      }
+    }
+    dev_nbor[ii+nbor_pitch]=newj;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
+// -- Only unpack neighbors matching the specified inclusive range of forms
+// -- Only unpack neighbors within cutoff
+// -- Fast version of routine that uses shared memory for LJ constants
+// ---------------------------------------------------------------------------
+__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
+                               __global int *dev_nbor, const int nbor_pitch, 
+                               const int start, const int inum, 
+                               __global int *dev_ij, const int form_low, 
+                               const int form_high) {
+                                
+  int ii=THREAD_ID_X;
+  __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    cutsq[ii]=cut_form[ii].x;
+    form[ii]=cut_form[ii].y;
+  }
+  ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
+  __syncthreads();
+
+  if (ii<inum) {
+    __global int *nbor=dev_ij+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+  
+    numtyp4 ix=x_[i];
+    int iw=ix.w;
+    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+
+    int newj=0;  
+    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+      int j=*nbor;
+      j &= NEIGHMASK;
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+      int mtype=itype+jtype;
+      
+      if (form[mtype]>=form_low && form[mtype]<=form_high) {
+        // Compute r12;
+        numtyp rsq=jx.x-ix.x;
+        rsq*=rsq;
+        numtyp t=jx.y-ix.y;
+        rsq+=t*t;
+        t=jx.z-ix.z;
+        rsq+=t*t;
+
+        if (rsq<cutsq[mtype]) {
+          *packed=j;
+          packed+=nbor_pitch;
+          newj++;
+        }
+      }
+    }
+    dev_nbor[ii+nbor_pitch]=newj;
+  }
+}
+
+#endif
--- a/lib/gpu/gayberne.cu
+++ b/lib/gpu/gayberne.cu
@ -0,0 +1,429 @@
+// **************************************************************************
+//                                 gayberne.cu
+//                             -------------------
+//                               W. Michael Brown
+//
+//  Device code for Gay-Berne potential acceleration
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : brownw@ornl.gov
+// ***************************************************************************/
+
+#ifndef GAYBERNE_CU
+#define GAYBERNE_CU
+
+#ifdef NV_KERNEL
+#include "ellipsoid_extra.h"
+#endif
+
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
+__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, 
+                                 numtyp ans[9])
+{
+  numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
+    m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
+    m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
+  den = (numtyp)1.0/den;
+  
+  ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
+		    m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
+		    m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
+		    m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
+		    m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
+  
+  ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
+		    (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
+		    (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
+		    m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
+		    m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
+  
+  ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
+		    m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
+		    m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
+		    (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
+		    m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
+  
+  ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
+		    m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
+		    m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
+		    m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
+		    m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
+  
+  ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
+		    (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
+		    (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
+		    m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
+		    m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
+  
+  ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
+		    m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
+		    (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
+		    m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
+		    (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
+  
+  ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
+		    (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
+		    m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
+		    m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
+		    m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
+  
+  ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
+		     (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
+		     (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
+		     m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
+		     m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
+  
+  ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
+		    m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
+		    m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
+		    (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
+		    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
+}
+
+__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
+                               __global numtyp4* shape, __global numtyp4* well, 
+                               __global numtyp *gum, __global numtyp2* sig_eps, 
+                               const int ntypes, __global numtyp *lshape, 
+                               __global int *dev_nbor, const int stride, 
+                               __global acctyp4 *ans, const int astride, 
+                               __global acctyp *engv, __global int *err_flag, 
+                               const int eflag, const int vflag, const int inum,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=gum[3];    
+  sp_lj[1]=gum[4];    
+  sp_lj[2]=gum[5];    
+  sp_lj[3]=gum[6];    
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp4 tor;
+  tor.x=(acctyp)0;
+  tor.y=(acctyp)0;
+  tor.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *nbor_end=nbor+mul24(stride,numj);
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+  
+    numtyp4 ix=x_[i];
+    int itype=ix.w;
+    numtyp a1[9], b1[9], g1[9];
+    numtyp4 ishape=shape[itype];
+    {
+      numtyp t[9];
+      gpu_quat_to_mat_trans(q,i,a1);
+      gpu_diag_times3(ishape,a1,t);
+      gpu_transpose_times3(a1,t,g1);
+      gpu_diag_times3(well[itype],a1,t);
+      gpu_transpose_times3(a1,t,b1);
+    }
+
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp r12[3];
+      r12[0] = jx.x-ix.x;
+      r12[1] = jx.y-ix.y;
+      r12[2] = jx.z-ix.z;
+      numtyp ir = gpu_dot3(r12,r12);
+
+      ir = rsqrt(ir);
+      numtyp r = (numtyp)1.0/ir;
+
+      numtyp a2[9];
+      gpu_quat_to_mat_trans(q,j,a2);
+  
+      numtyp u_r, dUr[3], tUr[3], eta, teta[3];
+      { // Compute U_r, dUr, eta, and teta
+        // Compute g12
+        numtyp g12[9];
+        {
+          numtyp g2[9];
+          {
+              gpu_diag_times3(shape[jtype],a2,g12);
+              gpu_transpose_times3(a2,g12,g2);
+              gpu_plus3(g1,g2,g12);
+          }
+
+          { // Compute U_r and dUr
+    
+            // Compute kappa
+            numtyp kappa[3];
+            gpu_mldivide3(g12,r12,kappa,err_flag);
+
+            // -- replace r12 with r12 hat
+            r12[0]*=ir;
+            r12[1]*=ir;
+            r12[2]*=ir;
+
+            // -- kappa is now / r
+            kappa[0]*=ir;
+            kappa[1]*=ir;
+            kappa[2]*=ir;
+
+            // energy
+  
+            // compute u_r and dUr
+            numtyp uslj_rsq;
+            {
+              // Compute distance of closest approach
+              numtyp h12, sigma12;
+              sigma12 = gpu_dot3(r12,kappa);
+              sigma12 = rsqrt((numtyp)0.5*sigma12);
+              h12 = r-sigma12;
+
+              // -- kappa is now ok
+              kappa[0]*=r;
+              kappa[1]*=r;
+              kappa[2]*=r;
+          
+              int mtype=mul24(ntypes,itype)+jtype;
+              numtyp sigma = sig_eps[mtype].x;
+              numtyp epsilon = sig_eps[mtype].y;
+              numtyp varrho = sigma/(h12+gum[0]*sigma);
+              numtyp varrho6 = varrho*varrho*varrho;
+              varrho6*=varrho6;
+              numtyp varrho12 = varrho6*varrho6;
+              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
+
+              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
+              temp1 = temp1*(numtyp)24.0*epsilon;
+              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
+              numtyp temp2 = gpu_dot3(kappa,r12);
+              uslj_rsq = uslj_rsq*ir*ir;
+
+              dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
+              dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
+              dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
+            }
+
+            // torque for particle 1
+            {
+              numtyp tempv[3], tempv2[3];
+              tempv[0] = -uslj_rsq*kappa[0];
+              tempv[1] = -uslj_rsq*kappa[1];
+              tempv[2] = -uslj_rsq*kappa[2];
+              gpu_row_times3(kappa,g1,tempv2);
+              gpu_cross3(tempv,tempv2,tUr);
+            }
+          }
+        }
+     
+        // Compute eta
+        {
+          eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
+          numtyp det_g12 = gpu_det3(g12);
+          eta = pow(eta/det_g12,gum[1]);
+        }
+    
+        // Compute teta
+        numtyp temp[9], tempv[3], tempv2[3];
+        compute_eta_torque(g12,a1,ishape,temp);
+        numtyp temp1 = -eta*gum[1];
+
+        tempv[0] = temp1*temp[0];
+        tempv[1] = temp1*temp[1];
+        tempv[2] = temp1*temp[2];
+        gpu_cross3(a1,tempv,tempv2);
+        teta[0] = tempv2[0];
+        teta[1] = tempv2[1];
+        teta[2] = tempv2[2];
+  
+        tempv[0] = temp1*temp[3];
+        tempv[1] = temp1*temp[4];
+        tempv[2] = temp1*temp[5];
+        gpu_cross3(a1+3,tempv,tempv2);
+        teta[0] += tempv2[0];
+        teta[1] += tempv2[1];
+        teta[2] += tempv2[2];
+
+        tempv[0] = temp1*temp[6];
+        tempv[1] = temp1*temp[7];
+        tempv[2] = temp1*temp[8];
+        gpu_cross3(a1+6,tempv,tempv2);
+        teta[0] += tempv2[0];
+        teta[1] += tempv2[1];
+        teta[2] += tempv2[2];
+      }
+  
+      numtyp chi, dchi[3], tchi[3];
+      { // Compute chi and dchi
+
+        // Compute b12
+        numtyp b2[9], b12[9];
+        {
+          gpu_diag_times3(well[jtype],a2,b12);
+          gpu_transpose_times3(a2,b12,b2);
+          gpu_plus3(b1,b2,b12);
+        }
+
+        // compute chi_12
+        r12[0]*=r;
+        r12[1]*=r;
+        r12[2]*=r;
+        numtyp iota[3];
+        gpu_mldivide3(b12,r12,iota,err_flag);
+        // -- iota is now iota/r
+        iota[0]*=ir;
+        iota[1]*=ir;
+        iota[2]*=ir;
+        r12[0]*=ir;
+        r12[1]*=ir;
+        r12[2]*=ir;
+        chi = gpu_dot3(r12,iota);
+        chi = pow(chi*(numtyp)2.0,gum[2]);
+
+        // -- iota is now ok
+        iota[0]*=r;
+        iota[1]*=r;
+        iota[2]*=r;
+
+        numtyp temp1 = gpu_dot3(iota,r12);
+        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
+                                                          gum[2]);
+        dchi[0] = temp2*(iota[0]-temp1*r12[0]);
+        dchi[1] = temp2*(iota[1]-temp1*r12[1]);
+        dchi[2] = temp2*(iota[2]-temp1*r12[2]);
+
+        // compute t_chi
+        numtyp tempv[3];
+        gpu_row_times3(iota,b1,tempv);
+        gpu_cross3(tempv,iota,tchi);
+        temp1 = (numtyp)-4.0*ir*ir;
+        tchi[0] *= temp1;
+        tchi[1] *= temp1;
+        tchi[2] *= temp1;
+      }
+
+      numtyp temp2 = factor_lj*eta*chi;
+      if (eflag>0)
+        energy+=u_r*temp2;
+      numtyp temp1 = -eta*u_r*factor_lj;
+      if (vflag>0) {
+        r12[0]*=-r;
+        r12[1]*=-r;
+        r12[2]*=-r;
+        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
+        f.x+=ft;
+        virial[0]+=r12[0]*ft;
+        ft=temp1*dchi[1]-temp2*dUr[1];
+        f.y+=ft;
+        virial[1]+=r12[1]*ft;
+        virial[3]+=r12[0]*ft;
+        ft=temp1*dchi[2]-temp2*dUr[2];
+        f.z+=ft;
+        virial[2]+=r12[2]*ft;
+        virial[4]+=r12[0]*ft;
+        virial[5]+=r12[1]*ft;
+      } else {
+        f.x+=temp1*dchi[0]-temp2*dUr[0];
+        f.y+=temp1*dchi[1]-temp2*dUr[1];
+        f.z+=temp1*dchi[2]-temp2*dUr[2];
+      }
+
+      // Torque on 1
+      temp1 = -u_r*eta*factor_lj;
+      temp2 = -u_r*chi*factor_lj;
+      numtyp temp3 = -chi*eta*factor_lj;
+      tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
+      tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
+      tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
+ 
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[7][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=tor.x;
+    red_acc[4][tid]=tor.y;
+    red_acc[5][tid]=tor.z;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<6; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    tor.x=red_acc[3][tid];
+    tor.y=red_acc[4][tid];
+    tor.z=red_acc[5][tid];
+
+    if (eflag>0 || vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+      red_acc[6][tid]=energy;
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<7; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+      energy=red_acc[6][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=astride;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=astride;
+      }
+    }
+    ans[ii]=f;
+    ans[ii+astride]=tor;
+  } // if ii
+}
+
+#endif
+
--- a/lib/gpu/gayberne_lj.cu
+++ b/lib/gpu/gayberne_lj.cu
@ -0,0 +1,594 @@
+// **************************************************************************
+//                                gayberne_lj.cu
+//                             -------------------
+//                               W. Michael Brown
+//
+//  Device code for Gay-Berne - Lennard-Jones potential acceleration
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : brownw@ornl.gov
+// ***************************************************************************/
+
+#ifndef GAYBERNE_LJ_CU
+#define GAYBERNE_LJ_CU
+
+#ifdef NV_KERNEL
+#include "ellipsoid_extra.h"
+#endif
+
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
+__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
+                               __global numtyp4* shape,__global numtyp4* well, 
+                               __global numtyp *gum, __global numtyp2* sig_eps, 
+                               const int ntypes, __global numtyp *lshape, 
+                               __global int *dev_nbor, const int stride, 
+                               __global acctyp4 *ans, __global acctyp *engv, 
+                               __global int *err_flag, const int eflag, 
+                               const int vflag,const int start, const int inum, 
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=gum[3];    
+  sp_lj[1]=gum[4];    
+  sp_lj[2]=gum[5];    
+  sp_lj[3]=gum[6];    
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *nbor_end=nbor+stride*numj;
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+  
+    numtyp4 ix=x_[i];
+    int itype=ix.w;
+      
+    numtyp oner=shape[itype].x;
+    numtyp one_well=well[itype].x;
+  
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+  
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp r12[3];
+      r12[0] = jx.x-ix.x;
+      r12[1] = jx.y-ix.y;
+      r12[2] = jx.z-ix.z;
+      numtyp ir = gpu_dot3(r12,r12);
+
+      ir = rsqrt(ir);
+      numtyp r = (numtyp)1.0/ir;
+      
+      numtyp r12hat[3];
+      r12hat[0]=r12[0]*ir;
+      r12hat[1]=r12[1]*ir;
+      r12hat[2]=r12[2]*ir;
+
+      numtyp a2[9];
+      gpu_quat_to_mat_trans(q,j,a2);
+  
+      numtyp u_r, dUr[3], eta;
+      { // Compute U_r, dUr, eta, and teta
+        // Compute g12
+        numtyp g12[9];
+        {
+          {
+            numtyp g2[9];
+            gpu_diag_times3(shape[jtype],a2,g12);
+            gpu_transpose_times3(a2,g12,g2);
+            g12[0]=g2[0]+oner;
+            g12[4]=g2[4]+oner;
+            g12[8]=g2[8]+oner;
+            g12[1]=g2[1];
+            g12[2]=g2[2];
+            g12[3]=g2[3];
+            g12[5]=g2[5];
+            g12[6]=g2[6];
+            g12[7]=g2[7];    
+          }
+  
+          { // Compute U_r and dUr
+    
+            // Compute kappa
+            numtyp kappa[3];
+            gpu_mldivide3(g12,r12,kappa,err_flag);
+
+            // -- kappa is now / r
+            kappa[0]*=ir;
+            kappa[1]*=ir;
+            kappa[2]*=ir;
+  
+            // energy
+  
+            // compute u_r and dUr
+            numtyp uslj_rsq;
+            {
+              // Compute distance of closest approach
+              numtyp h12, sigma12;
+              sigma12 = gpu_dot3(r12hat,kappa);
+              sigma12 = rsqrt((numtyp)0.5*sigma12);
+              h12 = r-sigma12;
+
+              // -- kappa is now ok
+              kappa[0]*=r;
+              kappa[1]*=r;
+              kappa[2]*=r;
+          
+              int mtype=mul24(ntypes,itype)+jtype;
+              numtyp sigma = sig_eps[mtype].x;
+              numtyp epsilon = sig_eps[mtype].y;
+              numtyp varrho = sigma/(h12+gum[0]*sigma);
+              numtyp varrho6 = varrho*varrho*varrho;
+              varrho6*=varrho6;
+              numtyp varrho12 = varrho6*varrho6;
+              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
+
+              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
+              temp1 = temp1*(numtyp)24.0*epsilon;
+              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
+              numtyp temp2 = gpu_dot3(kappa,r12hat);
+              uslj_rsq = uslj_rsq*ir*ir;
+
+              dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
+              dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
+              dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
+            }
+          }
+        }
+     
+        // Compute eta
+        {
+          eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
+          numtyp det_g12 = gpu_det3(g12);
+          eta = pow(eta/det_g12,gum[1]);
+        }
+      }
+  
+      numtyp chi, dchi[3];
+      { // Compute chi and dchi
+
+        // Compute b12
+        numtyp b12[9];
+        {
+          numtyp b2[9];
+          gpu_diag_times3(well[jtype],a2,b12);
+          gpu_transpose_times3(a2,b12,b2);
+          b12[0]=b2[0]+one_well;
+          b12[4]=b2[4]+one_well;
+          b12[8]=b2[8]+one_well;
+          b12[1]=b2[1];
+          b12[2]=b2[2];
+          b12[3]=b2[3];
+          b12[5]=b2[5];
+          b12[6]=b2[6];
+          b12[7]=b2[7];    
+        }
+
+        // compute chi_12
+        numtyp iota[3];
+        gpu_mldivide3(b12,r12,iota,err_flag);
+        // -- iota is now iota/r
+        iota[0]*=ir;
+        iota[1]*=ir;
+        iota[2]*=ir;
+        chi = gpu_dot3(r12hat,iota);
+        chi = pow(chi*(numtyp)2.0,gum[2]);
+
+        // -- iota is now ok
+        iota[0]*=r;
+        iota[1]*=r;
+        iota[2]*=r;
+
+        numtyp temp1 = gpu_dot3(iota,r12hat);
+        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
+                                                     gum[2]);
+        dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
+        dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
+        dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
+      }
+
+      numtyp temp2 = factor_lj*eta*chi;
+      if (eflag>0)
+        energy+=u_r*temp2;
+      numtyp temp1 = -eta*u_r*factor_lj;
+      if (vflag>0) {
+        r12[0]*=-1;
+        r12[1]*=-1;
+        r12[2]*=-1;
+        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
+        f.x+=ft;
+        virial[0]+=r12[0]*ft;
+        ft=temp1*dchi[1]-temp2*dUr[1];
+        f.y+=ft;
+        virial[1]+=r12[1]*ft;
+        virial[3]+=r12[0]*ft;
+        ft=temp1*dchi[2]-temp2*dUr[2];
+        f.z+=ft;
+        virial[2]+=r12[2]*ft;
+        virial[4]+=r12[0]*ft;
+        virial[5]+=r12[1]*ft;
+      } else {
+        f.x+=temp1*dchi[0]-temp2*dUr[0];
+        f.y+=temp1*dchi[1]-temp2*dUr[1];
+        f.z+=temp1*dchi[2]-temp2*dUr[2];
+      }
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
+                        __global numtyp4* lj3, const int lj_types, 
+                        __global numtyp *gum, 
+                        const int stride, __global int *dev_ij, 
+                        __global acctyp4 *ans, __global acctyp *engv, 
+                        __global int *err_flag, const int eflag, 
+                        const int vflag, const int start, const int inum, 
+                        const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=gum[3];    
+  sp_lj[1]=gum[4];    
+  sp_lj[2]=gum[5];    
+  sp_lj[3]=gum[6];    
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
+  if (ii<inum) {
+    __global int *nbor=dev_ij+ii;
+    int i=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *list_end=nbor+mul24(stride,numj);
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+  
+    numtyp4 ix=x_[i];
+    int itype=ix.w;
+
+    numtyp factor_lj;
+    for ( ; nbor<list_end; nbor+=n_stride) {
+  
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp r2inv = delx*delx+dely*dely+delz*delz;
+        
+      int ii=itype*lj_types+jtype;
+      if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
+        r2inv=(numtyp)1.0/r2inv;
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
+        force*=factor_lj;
+      
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
+          energy+=factor_lj*(e-lj3[ii].z); 
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1+=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1+=virial[i];
+        ap1+=inum;
+      }
+    }
+    acctyp4 old=ans[ii];
+    old.x+=f.x;
+    old.y+=f.y;
+    old.z+=f.z;
+    ans[ii]=old;
+  } // if ii
+}
+
+__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, 
+                             __global numtyp4* lj3_in, __global numtyp *gum, 
+                             const int stride, __global int *dev_ij,
+                             __global acctyp4 *ans, __global acctyp *engv,
+                             __global int *err_flag, const int eflag,
+                             const int vflag, const int start, const int inum,
+                             const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];                              
+  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  if (tid<4)
+    sp_lj[tid]=gum[tid+3];    
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    if (eflag>0)
+      lj3[tid]=lj3_in[tid];
+  }
+  
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
+  __syncthreads();
+  
+  if (ii<inum) {
+    __global int *nbor=dev_ij+ii;
+    int i=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *list_end=nbor+mul24(stride,numj);
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+
+    numtyp4 ix=x_[i];
+    int iw=ix.w;
+    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+
+    numtyp factor_lj;
+    for ( ; nbor<list_end; nbor+=n_stride) {
+  
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=x_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp r2inv = delx*delx+dely*dely+delz*delz;
+        
+      if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
+        r2inv=(numtyp)1.0/r2inv;
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
+      
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
+          energy+=factor_lj*(e-lj3[mtype].z); 
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1+=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1+=virial[i];
+        ap1+=inum;
+      }
+    }
+    acctyp4 old=ans[ii];
+    old.x+=f.x;
+    old.y+=f.y;
+    old.z+=f.z;
+    ans[ii]=old;
+  } // if ii
+}
+
+#endif
+
--- a/lib/gpu/lj_class2_long.cu
+++ b/lib/gpu/lj_class2_long.cu
@ -0,0 +1,472 @@
+// **************************************************************************
+//                              lj_class2_long.cu
+//                             -------------------
+//                               W. Michael Brown
+//
+//  Device code for COMPASS LJ long acceleration
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                : Mon May 16 2011
+//    email                : brownw@ornl.gov
+// ***************************************************************************/
+
+#ifndef LJCL_GPU_KERNEL
+#define LJCL_GPU_KERNEL
+
+#ifdef NV_KERNEL
+
+#include "nv_kernel_def.h"
+texture<float4> pos_tex;
+texture<float> q_tex;
+
+#ifdef _DOUBLE_DOUBLE
+__inline double4 fetch_pos(const int& i, const double4 *pos)
+{
+  return pos[i];
+}
+__inline double fetch_q(const int& i, const double *q)
+{
+  return q[i];
+}
+#else
+__inline float4 fetch_pos(const int& i, const float4 *pos)
+{
+  return tex1Dfetch(pos_tex, i);
+}
+__inline float fetch_q(const int& i, const float *q)
+{
+  return tex1Dfetch(q_tex, i);
+}
+#endif
+
+#else
+
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+#define __inline inline
+
+#define fetch_pos(i,y) x_[i]
+#define fetch_q(i,y) q_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
+
+#endif
+
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#define EWALD_F (numtyp)1.12837917
+#define EWALD_P (numtyp)0.3275911
+#define A1 (numtyp)0.254829592
+#define A2 (numtyp)-0.284496736
+#define A3 (numtyp)1.421413741
+#define A4 (numtyp)-1.453152027
+#define A5 (numtyp)1.061405429
+
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
+__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+                          __global numtyp4* lj3, const int lj_types, 
+                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag, 
+                          const int vflag, const int inum,
+                          const int nbor_pitch, __global numtyp *q_,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[8];
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  sp_lj[4]=sp_lj_in[4];
+  sp_lj[5]=sp_lj_in[5];
+  sp_lj[6]=sp_lj_in[6];
+  sp_lj[7]=sp_lj_in[7];
+
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
+  
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp qtmp=fetch_q(i,q_);
+    int itype=ix.w;
+
+    for ( ; nbor<list_end; nbor+=n_stride) {
+      int j=*nbor;
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;
+      if (rsq<lj1[mtype].z) {
+        numtyp r2inv=(numtyp)1.0/rsq;
+        numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
+
+        if (rsq < lj1[mtype].w) {
+          numtyp rinv=sqrt(r2inv);
+          r3inv=r2inv*rinv;
+          r6inv = r3inv*r3inv;
+          force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
+        } else
+          force_lj = (numtyp)0.0;
+
+        if (rsq < cut_coulsq) {
+          numtyp r = sqrt(rsq);
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = exp(-grij*grij);
+          numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
+        } else {
+          forcecoul = (numtyp)0.0;
+          prefactor = (numtyp)0.0;
+        }
+
+        force = (force_lj + forcecoul) * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < lj1[mtype].w) {
+            numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
+            energy+=factor_lj*(e-lj3[mtype].z);
+          } 
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+      *ap1=e_coul;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+                               __global numtyp4* lj3_in, 
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
+                               __global acctyp4 *ans, __global acctyp *engv, 
+                               const int eflag, const int vflag, const int inum, 
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[8];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    if (eflag>0)
+      lj3[tid]=lj3_in[tid];
+  }
+  
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
+  __syncthreads();
+  
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
+  
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp qtmp=fetch_q(i,q_);
+    int iw=ix.w;
+    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+
+    for ( ; nbor<list_end; nbor+=n_stride) {
+      int j=*nbor;
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<lj1[mtype].z) {
+        numtyp r2inv=(numtyp)1.0/rsq;
+        numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
+
+        if (rsq < lj1[mtype].w) {
+          numtyp rinv=sqrt(r2inv);
+          r3inv=r2inv*rinv;
+          r6inv = r3inv*r3inv;
+          force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
+        } else
+          force_lj = (numtyp)0.0;
+
+        if (rsq < cut_coulsq) {
+          numtyp r = sqrt(rsq);
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = exp(-grij*grij);
+          numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
+        } else {
+          forcecoul = (numtyp)0.0;
+          prefactor = (numtyp)0.0;
+        }
+
+        force = (force_lj + forcecoul) * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < lj1[mtype].w) {
+            numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
+            energy+=factor_lj*(e-lj3[mtype].z);
+          }
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+      *ap1=e_coul;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii*/
+}
+
+#endif
+
--- a/lib/gpu/re_squared.cu
+++ b/lib/gpu/re_squared.cu
@ -0,0 +1,526 @@
+// **************************************************************************
+//                                re_squared.cu
+//                             -------------------
+//                               W. Michael Brown
+//
+//  Device code for RE-Squared potential acceleration
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                : Fri May 06 2011
+//    email                : brownw@ornl.gov
+// ***************************************************************************/
+
+#ifndef RE_SQUARED_CU
+#define RE_SQUARED_CU
+
+#ifdef NV_KERNEL
+#include "ellipsoid_extra.h"
+#endif
+
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
+__inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
+{
+  numtyp ans;
+  ans = m2[0]*m[4]*m[8] - m2[0]*m[5]*m[7] -
+        m[3]*m2[1]*m[8] + m[3]*m2[2]*m[7] +
+        m[6]*m2[1]*m[5] - m[6]*m2[2]*m[4] +
+        m[0]*m2[4]*m[8] - m[0]*m2[5]*m[7] -
+        m2[3]*m[1]*m[8] + m2[3]*m[2]*m[7] +
+        m[6]*m[1]*m2[5] - m[6]*m[2]*m2[4] +
+        m[0]*m[4]*m2[8] - m[0]*m[5]*m2[7] -
+        m[3]*m[1]*m2[8] + m[3]*m[2]*m2[7] +
+        m2[6]*m[1]*m[5] - m2[6]*m[2]*m[4];
+  return ans;
+}
+
+__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
+                               __global numtyp4* shape, __global numtyp4* well, 
+                               __global numtyp *splj, __global numtyp2* sig_eps, 
+                               const int ntypes, __global int *dev_nbor,
+                               const int stride,  __global acctyp4 *ans,
+                               const int astride, __global acctyp *engv,
+                               __global int *err_flag, const int eflag,
+                               const int vflag, const int inum,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=splj[0];    
+  sp_lj[1]=splj[1];    
+  sp_lj[2]=splj[2];    
+  sp_lj[3]=splj[3];
+  
+  __local numtyp b_alpha, cr60;
+  b_alpha=(numtyp)45.0/(numtyp)56.0;
+  cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);    
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp4 tor;
+  tor.x=(acctyp)0;
+  tor.y=(acctyp)0;
+  tor.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *nbor_end=nbor+mul24(stride,numj);
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+  
+    numtyp4 ix=x_[i];
+    int itype=ix.w;
+
+    numtyp a1[9];       // Rotation matrix (lab->body)
+    numtyp aTe1[9];     // A'*E
+    numtyp gamma1[9];   // A'*S^2*A
+    numtyp sa1[9];      // S^2*A;
+    numtyp lA1_0[9], lA1_1[9], lA1_2[9]; // -A*rotation generator (x,y, or z)
+    numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9];  // A'*S^2*lA
+    numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9];   // lAtwo+lA'*sa
+    numtyp4 ishape;
+    
+    ishape=shape[itype];
+    numtyp4 ishape2;
+    ishape2.x=ishape.x*ishape.x;
+    ishape2.y=ishape.y*ishape.y;
+    ishape2.z=ishape.z*ishape.z;
+    numtyp ilshape = ishape.x*ishape.y*ishape.z;
+    
+    {
+      numtyp aTs[9];    // A1'*S1^2
+      gpu_quat_to_mat_trans(q,i,a1);
+      gpu_transpose_times_diag3(a1,well[itype],aTe1);
+      gpu_transpose_times_diag3(a1,ishape2,aTs);
+      gpu_diag_times3(ishape2,a1,sa1);
+      gpu_times3(aTs,a1,gamma1);
+      gpu_rotation_generator_x(a1,lA1_0);
+      gpu_rotation_generator_y(a1,lA1_1);
+      gpu_rotation_generator_z(a1,lA1_2);
+      gpu_times3(aTs,lA1_0,lAtwo1_0);
+      gpu_transpose_times3(lA1_0,sa1,lAsa1_0);
+      gpu_plus3(lAsa1_0,lAtwo1_0,lAsa1_0);
+      gpu_times3(aTs,lA1_1,lAtwo1_1);
+      gpu_transpose_times3(lA1_1,sa1,lAsa1_1);
+      gpu_plus3(lAsa1_1,lAtwo1_1,lAsa1_1);
+      gpu_times3(aTs,lA1_2,lAtwo1_2);
+      gpu_transpose_times3(lA1_2,sa1,lAsa1_2);
+      gpu_plus3(lAsa1_2,lAtwo1_2,lAsa1_2);
+    }
+    ishape2.x=(numtyp)1.0/ishape2.x;
+    ishape2.y=(numtyp)1.0/ishape2.y;
+    ishape2.z=(numtyp)1.0/ishape2.z;
+
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp r[3], rhat[3];
+      numtyp rnorm;
+      r[0] = jx.x-ix.x;
+      r[1] = jx.y-ix.y;
+      r[2] = jx.z-ix.z;
+      rnorm = gpu_dot3(r,r);
+      rnorm = rsqrt(rnorm);
+      rhat[0] = r[0]*rnorm;
+      rhat[1] = r[1]*rnorm;
+      rhat[2] = r[2]*rnorm;
+
+
+      numtyp a2[9];       // Rotation matrix (lab->body)
+      numtyp gamma2[9];   // A'*S^2*A
+      numtyp4 jshape;
+    
+      jshape=shape[jtype];
+      numtyp4 jshape2;
+      jshape2.x=jshape.x*jshape.x;
+      jshape2.y=jshape.y*jshape.y;
+      jshape2.z=jshape.z*jshape.z;
+      {
+        numtyp aTs[9];    // A1'*S1^2
+        gpu_quat_to_mat_trans(q,j,a2);
+        gpu_transpose_times_diag3(a2,jshape2,aTs);
+        gpu_times3(aTs,a2,gamma2);
+      }
+
+      numtyp temp[9], s[3], z1[3], z2[3], v1[3], v2[3];
+      numtyp sigma12, sigma1, sigma2;
+      gpu_plus3(gamma1,gamma2,temp);
+      gpu_mldivide3(temp,rhat,s,err_flag);
+      sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
+      gpu_times_column3(a1,rhat,z1);
+      gpu_times_column3(a2,rhat,z2);
+      v1[0] = z1[0]*ishape2.x;
+      v1[1] = z1[1]*ishape2.y;
+      v1[2] = z1[2]*ishape2.z;
+      v2[0] = z2[0]/jshape2.x;
+      v2[1] = z2[1]/jshape2.y;
+      v2[2] = z2[2]/jshape2.z;
+      sigma1 = sqrt(gpu_dot3(z1,v1));
+      sigma2 = sqrt(gpu_dot3(z2,v2));
+
+      numtyp H12[9];
+      numtyp dH;
+      H12[0] = gamma1[0]*sigma1+gamma2[0]*sigma2;
+      H12[1] = gamma1[1]*sigma1+gamma2[1]*sigma2;
+      H12[2] = gamma1[2]*sigma1+gamma2[2]*sigma2;
+      H12[3] = gamma1[3]*sigma1+gamma2[3]*sigma2;
+      H12[4] = gamma1[4]*sigma1+gamma2[4]*sigma2;
+      H12[5] = gamma1[5]*sigma1+gamma2[5]*sigma2;
+      H12[6] = gamma1[6]*sigma1+gamma2[6]*sigma2;
+      H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2;
+      H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2;
+      dH=gpu_det3(H12);
+      
+      numtyp sigma1p2, sigma2p2, lambda, nu;
+      sigma1p2 = sigma1*sigma1;
+      sigma2p2 = sigma2*sigma2;
+      numtyp jlshape = jshape.x*jshape.y*jshape.z;
+      lambda = ilshape*sigma1p2 + jlshape*sigma2p2;
+
+
+      sigma1=(numtyp)1.0/sigma1;
+      sigma2=(numtyp)1.0/sigma2;
+
+      nu = sqrt(dH/(sigma1+sigma2));
+      gpu_times3(aTe1,a1,temp);
+
+      numtyp sigma, epsilon;
+      int mtype=mul24(ntypes,itype)+jtype;
+      sigma = sig_eps[mtype].x;
+      epsilon = sig_eps[mtype].y*factor_lj;
+
+      numtyp w[3], temp2[9];
+      numtyp h12,eta,chi,sprod,sigh,tprod;
+      numtyp aTe2[9];     // A'*E
+      gpu_transpose_times_diag3(a2,well[jtype],aTe2);
+      gpu_times3(aTe2,a2,temp2);
+      gpu_plus3(temp,temp2,temp);
+      gpu_mldivide3(temp,rhat,w,err_flag);
+      h12 = (numtyp)1.0/rnorm-sigma12;
+      eta = lambda/nu;
+      chi = (numtyp)2.0*gpu_dot3(rhat,w);
+      sprod = ilshape * jlshape;
+      sigh = sigma/h12;
+      tprod = eta*chi*sigh;
+
+      numtyp stemp, Ua;
+      stemp = h12*(numtyp)0.5;
+      Ua = (ishape.x+stemp)*(ishape.y+stemp)*
+           (ishape.z+stemp)*(jshape.x+stemp)*
+           (jshape.y+stemp)*(jshape.z+stemp);
+      Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*sprod/Ua;
+      Ua = epsilon*Ua/(numtyp)-36.0;
+
+      numtyp Ur;
+      stemp = h12/cr60;
+      Ur = (ishape.x+stemp)*(ishape.y+stemp)*
+           (ishape.z+stemp)*(jshape.x+stemp)*
+           (jshape.y+stemp)*(jshape.z+stemp);
+      Ur = ((numtyp)1.0+b_alpha*tprod)*sprod/Ur;
+      numtyp sigh6=sigh*sigh*sigh;
+      sigh6*=sigh6;
+      Ur = epsilon*Ur*sigh6/(numtyp)2025.0;
+
+      energy+=Ua+Ur;
+
+      // force
+
+      numtyp vsigma1[3], vsigma2[3], gsigma1[9], gsigma2[9];
+      numtyp sec, sigma12p3, sigma1p3, sigma2p3;
+      sec = sigma*eta*chi;
+      sigma12p3 = sigma12*sigma12*sigma12;
+      sigma1p3 = sigma1/sigma1p2;
+      sigma2p3 = sigma2/sigma2p2;
+      vsigma1[0] = -sigma1p3*v1[0];
+      vsigma1[1] = -sigma1p3*v1[1];
+      vsigma1[2] = -sigma1p3*v1[2];
+      vsigma2[0] = -sigma2p3*v2[0];
+      vsigma2[1] = -sigma2p3*v2[1];
+      vsigma2[2] = -sigma2p3*v2[2];
+      gsigma1[0] = -gamma1[0]*sigma1p2;
+      gsigma1[1] = -gamma1[1]*sigma1p2;
+      gsigma1[2] = -gamma1[2]*sigma1p2;
+      gsigma1[3] = -gamma1[3]*sigma1p2;
+      gsigma1[4] = -gamma1[4]*sigma1p2;
+      gsigma1[5] = -gamma1[5]*sigma1p2;
+      gsigma1[6] = -gamma1[6]*sigma1p2;
+      gsigma1[7] = -gamma1[7]*sigma1p2;
+      gsigma1[8] = -gamma1[8]*sigma1p2;
+      gsigma2[0] = -gamma2[0]*sigma2p2;
+      gsigma2[1] = -gamma2[1]*sigma2p2;
+      gsigma2[2] = -gamma2[2]*sigma2p2;
+      gsigma2[3] = -gamma2[3]*sigma2p2;
+      gsigma2[4] = -gamma2[4]*sigma2p2;
+      gsigma2[5] = -gamma2[5]*sigma2p2;
+      gsigma2[6] = -gamma2[6]*sigma2p2;
+      gsigma2[7] = -gamma2[7]*sigma2p2;
+      gsigma2[8] = -gamma2[8]*sigma2p2;
+
+      numtyp tsig1sig2, tdH, teta1, teta2;
+      numtyp fourw[3], spr[3];
+      tsig1sig2 = eta/((numtyp)2.0*(sigma1+sigma2));
+      tdH = eta/((numtyp)2.0*dH);
+      teta1 = (numtyp)2.0*eta/lambda;
+      teta2 = teta1*jlshape/sigma2p3;
+      teta1 = teta1*ilshape/sigma1p3;
+      fourw[0] = (numtyp)4.0*w[0];
+      fourw[1] = (numtyp)4.0*w[1];
+      fourw[2] = (numtyp)4.0*w[2];
+      spr[0] = (numtyp)0.5*sigma12p3*s[0];
+      spr[1] = (numtyp)0.5*sigma12p3*s[1];
+      spr[2] = (numtyp)0.5*sigma12p3*s[2];
+
+      numtyp hsec, dspu, pbsu;
+      stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
+              (numtyp)1.0/(jshape.x*(numtyp)2.0+h12)+
+              (numtyp)1.0/(jshape.y*(numtyp)2.0+h12)+
+              (numtyp)1.0/(jshape.z*(numtyp)2.0+h12);
+      hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec);
+      dspu = (numtyp)1.0/h12-hsec+stemp;
+      pbsu = (numtyp)3.0*sigma*hsec;
+  
+      numtyp dspr, pbsr;
+      stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
+              (numtyp)1.0/(ishape.y*cr60+h12)+
+              (numtyp)1.0/(ishape.z*cr60+h12)+
+              (numtyp)1.0/(jshape.x*cr60+h12)+
+              (numtyp)1.0/(jshape.y*cr60+h12)+
+              (numtyp)1.0/(jshape.z*cr60+h12);
+      hsec = (numtyp)1.0/(h12+b_alpha*sec);
+      dspr = (numtyp)7.0/h12-hsec+stemp;
+      pbsr = b_alpha*sigma*hsec;
+  
+      numtyp dH12[9];
+      numtyp dUa, dUr, deta, dchi, ddH, dh12;
+      numtyp dsigma1, dsigma2;
+
+      #pragma unroll
+      for (int i=0; i<3; i++) {
+        numtyp u[3], u1[3], u2[3];
+        u[0] = -rhat[i]*rhat[0];
+        u[1] = -rhat[i]*rhat[1];
+        u[2] = -rhat[i]*rhat[2];
+        u[i] += (numtyp)1.0;
+        u[0] *= rnorm;
+        u[1] *= rnorm;
+        u[2] *= rnorm;
+        gpu_times_column3(a1,u,u1);
+        gpu_times_column3(a2,u,u2);
+        dsigma1=gpu_dot3(u1,vsigma1);
+        dsigma2=gpu_dot3(u2,vsigma2);
+        dH12[0] = dsigma1*gsigma1[0]+dsigma2*gsigma2[0];
+        dH12[1] = dsigma1*gsigma1[1]+dsigma2*gsigma2[1];
+        dH12[2] = dsigma1*gsigma1[2]+dsigma2*gsigma2[2];
+        dH12[3] = dsigma1*gsigma1[3]+dsigma2*gsigma2[3];
+        dH12[4] = dsigma1*gsigma1[4]+dsigma2*gsigma2[4];
+        dH12[5] = dsigma1*gsigma1[5]+dsigma2*gsigma2[5];
+        dH12[6] = dsigma1*gsigma1[6]+dsigma2*gsigma2[6];
+        dH12[7] = dsigma1*gsigma1[7]+dsigma2*gsigma2[7];
+        dH12[8] = dsigma1*gsigma1[8]+dsigma2*gsigma2[8];
+        ddH = det_prime(H12,dH12);
+        deta = (dsigma1+dsigma2)*tsig1sig2;
+        deta -= ddH*tdH;
+        deta -= dsigma1*teta1+dsigma2*teta2;
+        dchi = gpu_dot3(u,fourw);
+        dh12 = rhat[i]+gpu_dot3(u,spr);
+        dUa = pbsu*(eta*dchi+deta*chi)-dh12*dspu;
+        dUr = pbsr*(eta*dchi+deta*chi)-dh12*dspr;
+        numtyp force=dUr*Ur+dUa*Ua;
+        if (i==0) {
+          f.x+=force;
+          if (vflag>0)
+            virial[0]+=-r[0]*force;
+        } else if (i==1) {
+          f.y+=force;
+          if (vflag>0) {
+            virial[1]+=-r[1]*force;
+            virial[3]+=-r[0]*force;
+          }
+        } else {
+          f.z+=force;
+          if (vflag>0) {
+            virial[2]+=-r[2]*force;
+            virial[4]+=-r[0]*force;
+            virial[5]+=-r[1]*force;
+          }
+        }
+      }
+
+      // torque on i
+      sigma1=(numtyp)1.0/sigma1;
+
+      numtyp fwae[3], p[3];
+      gpu_row_times3(fourw,aTe1,fwae);
+
+      {
+        gpu_times_column3(lA1_0,rhat,p);
+        dsigma1 = gpu_dot3(p,vsigma1);
+        dH12[0] = lAsa1_0[0]*sigma1+dsigma1*gsigma1[0];
+        dH12[1] = lAsa1_0[1]*sigma1+dsigma1*gsigma1[1];
+        dH12[2] = lAsa1_0[2]*sigma1+dsigma1*gsigma1[2];
+        dH12[3] = lAsa1_0[3]*sigma1+dsigma1*gsigma1[3];
+        dH12[4] = lAsa1_0[4]*sigma1+dsigma1*gsigma1[4];
+        dH12[5] = lAsa1_0[5]*sigma1+dsigma1*gsigma1[5];
+        dH12[6] = lAsa1_0[6]*sigma1+dsigma1*gsigma1[6];
+        dH12[7] = lAsa1_0[7]*sigma1+dsigma1*gsigma1[7];
+        dH12[8] = lAsa1_0[8]*sigma1+dsigma1*gsigma1[8];
+        ddH = det_prime(H12,dH12);
+        deta = tsig1sig2*dsigma1-tdH*ddH;
+        deta -= teta1*dsigma1;
+        numtyp tempv[3];
+        gpu_times_column3(lA1_0,w,tempv);
+        dchi = -gpu_dot3(fwae,tempv);
+        gpu_times_column3(lAtwo1_0,spr,tempv);
+        dh12 = -gpu_dot3(s,tempv);
+
+        dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
+        dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
+        tor.x -= (dUa*Ua+dUr*Ur);
+      }
+
+      {
+        gpu_times_column3(lA1_1,rhat,p);
+        dsigma1 = gpu_dot3(p,vsigma1);
+        dH12[0] = lAsa1_1[0]*sigma1+dsigma1*gsigma1[0];
+        dH12[1] = lAsa1_1[1]*sigma1+dsigma1*gsigma1[1];
+        dH12[2] = lAsa1_1[2]*sigma1+dsigma1*gsigma1[2];
+        dH12[3] = lAsa1_1[3]*sigma1+dsigma1*gsigma1[3];
+        dH12[4] = lAsa1_1[4]*sigma1+dsigma1*gsigma1[4];
+        dH12[5] = lAsa1_1[5]*sigma1+dsigma1*gsigma1[5];
+        dH12[6] = lAsa1_1[6]*sigma1+dsigma1*gsigma1[6];
+        dH12[7] = lAsa1_1[7]*sigma1+dsigma1*gsigma1[7];
+        dH12[8] = lAsa1_1[8]*sigma1+dsigma1*gsigma1[8];
+        ddH = det_prime(H12,dH12);
+        deta = tsig1sig2*dsigma1-tdH*ddH;
+        deta -= teta1*dsigma1;
+        numtyp tempv[3];
+        gpu_times_column3(lA1_1,w,tempv);
+        dchi = -gpu_dot3(fwae,tempv);
+        gpu_times_column3(lAtwo1_1,spr,tempv);
+        dh12 = -gpu_dot3(s,tempv);
+
+        dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
+        dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
+        tor.y -= (dUa*Ua+dUr*Ur);
+      }
+
+      {
+        gpu_times_column3(lA1_2,rhat,p);
+        dsigma1 = gpu_dot3(p,vsigma1);
+        dH12[0] = lAsa1_2[0]*sigma1+dsigma1*gsigma1[0];
+        dH12[1] = lAsa1_2[1]*sigma1+dsigma1*gsigma1[1];
+        dH12[2] = lAsa1_2[2]*sigma1+dsigma1*gsigma1[2];
+        dH12[3] = lAsa1_2[3]*sigma1+dsigma1*gsigma1[3];
+        dH12[4] = lAsa1_2[4]*sigma1+dsigma1*gsigma1[4];
+        dH12[5] = lAsa1_2[5]*sigma1+dsigma1*gsigma1[5];
+        dH12[6] = lAsa1_2[6]*sigma1+dsigma1*gsigma1[6];
+        dH12[7] = lAsa1_2[7]*sigma1+dsigma1*gsigma1[7];
+        dH12[8] = lAsa1_2[8]*sigma1+dsigma1*gsigma1[8];
+        ddH = det_prime(H12,dH12);
+        deta = tsig1sig2*dsigma1-tdH*ddH;
+        deta -= teta1*dsigma1;
+        numtyp tempv[3];
+        gpu_times_column3(lA1_2,w,tempv);
+        dchi = -gpu_dot3(fwae,tempv);
+        gpu_times_column3(lAtwo1_2,spr,tempv);
+        dh12 = -gpu_dot3(s,tempv);
+
+        dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
+        dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
+        tor.z -= (dUa*Ua+dUr*Ur);
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[7][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=tor.x;
+    red_acc[4][tid]=tor.y;
+    red_acc[5][tid]=tor.z;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<6; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    tor.x=red_acc[3][tid];
+    tor.y=red_acc[4][tid];
+    tor.z=red_acc[5][tid];
+
+    if (eflag>0 || vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+      red_acc[6][tid]=energy;
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<7; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+      energy=red_acc[6][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=astride;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=astride;
+      }
+    }
+    ans[ii]=f;
+    ans[ii+astride]=tor;
+  } // if ii
+
+}
+
+#endif
+
--- a/lib/gpu/re_squared_lj.cu
+++ b/lib/gpu/re_squared_lj.cu
@ -0,0 +1,892 @@
+// **************************************************************************
+//                               re_squared_lj.cu
+//                             -------------------
+//                               W. Michael Brown
+//
+//  Device code for RE-Squared - Lennard-Jones potential acceleration
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                : Fri May 06 2011
+//    email                : brownw@ornl.gov
+// ***************************************************************************/
+
+#ifndef RE_SQUARED_LJ_CU
+#define RE_SQUARED_LJ_CU
+
+#ifdef NV_KERNEL
+#include "ellipsoid_extra.h"
+#endif
+
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
+
+__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
+                   __global numtyp4* shape, __global numtyp4* well, 
+                   __global numtyp *splj, __global numtyp2* sig_eps,
+                   const int ntypes, __global int *dev_nbor, const int stride, 
+                   __global acctyp4 *ans, const int astride, 
+                   __global acctyp *engv, __global int *err_flag, 
+                   const int eflag, const int vflag, const int inum,
+                   const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=splj[0];    
+  sp_lj[1]=splj[1];    
+  sp_lj[2]=splj[2];    
+  sp_lj[3]=splj[3];
+  
+  __local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
+  b_alpha=(numtyp)45.0/(numtyp)56.0;
+  cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);    
+  solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0);
+  solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0);
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp4 tor;
+  tor.x=(acctyp)0;
+  tor.y=(acctyp)0;
+  tor.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *nbor_end=nbor+mul24(stride,numj);
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+  
+    numtyp4 ix=x_[i];
+    int itype=ix.w;
+
+    numtyp a[9];       // Rotation matrix (lab->body)
+    numtyp aTe[9];     // A'*E
+    numtyp lA_0[9], lA_1[9], lA_2[9]; // -A*rotation generator (x,y, or z)
+
+    numtyp4 ishape;
+    ishape=shape[itype];
+    numtyp ilshape=ishape.x*ishape.y*ishape.z;
+
+    {
+      gpu_quat_to_mat_trans(q,i,a);
+      gpu_transpose_times_diag3(a,well[itype],aTe);
+      gpu_rotation_generator_x(a,lA_0);
+      gpu_rotation_generator_y(a,lA_1);
+      gpu_rotation_generator_z(a,lA_2);
+    }
+
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp r[3], rhat[3];
+      numtyp rnorm;
+      r[0] = jx.x-ix.x;
+      r[1] = jx.y-ix.y;
+      r[2] = jx.z-ix.z;
+      rnorm = gpu_dot3(r,r);
+      rnorm = rsqrt(rnorm);
+      rhat[0] = r[0]*rnorm;
+      rhat[1] = r[1]*rnorm;
+      rhat[2] = r[2]*rnorm;
+
+      numtyp sigma, epsilon;
+      int mtype=mul24(ntypes,itype)+jtype;
+      sigma = sig_eps[mtype].x;
+      epsilon = sig_eps[mtype].y*factor_lj;
+
+      numtyp aTs[9]; 
+      numtyp4 scorrect;
+      numtyp half_sigma=sigma*(numtyp)0.5;
+      scorrect.x = ishape.x+half_sigma;
+      scorrect.y = ishape.y+half_sigma;
+      scorrect.z = ishape.z+half_sigma;
+      scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5;
+      scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5;
+      scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5;
+      gpu_transpose_times_diag3(a,scorrect,aTs);
+
+      // energy
+
+      numtyp gamma[9], s[3];
+      gpu_times3(aTs,a,gamma);
+      gpu_mldivide3(gamma,rhat,s,err_flag);
+
+      numtyp sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
+      numtyp temp[9], w[3];
+      gpu_times3(aTe,a,temp);
+      temp[0] += (numtyp)1.0;
+      temp[4] += (numtyp)1.0;
+      temp[8] += (numtyp)1.0;
+      gpu_mldivide3(temp,rhat,w,err_flag);
+
+      numtyp h12 = (numtyp)1.0/rnorm-sigma12;
+      numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
+      numtyp sigh = sigma/h12;
+      numtyp tprod = chi*sigh;
+
+      numtyp Ua, Ur;
+      numtyp h12p3 = h12*h12*h12;
+      numtyp sigmap3 = sigma*sigma*sigma;
+      numtyp stemp = h12*(numtyp)0.5;
+      Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
+      Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua;
+      Ua = epsilon*Ua*sigmap3*solv_f_a;
+    
+      stemp = h12/cr60;
+      Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
+           (numtyp)60.0;
+      Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur;
+      numtyp sigh6=sigh*sigh*sigh;
+      sigh6*=sigh6;
+      Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r;
+
+      energy+=Ua+Ur;
+
+      // force
+
+      numtyp fourw[3], spr[3];
+      numtyp sec = sigma*chi;
+      numtyp sigma12p3 = sigma12*sigma12*sigma12;
+      fourw[0] = (numtyp)4.0*w[0];
+      fourw[1] = (numtyp)4.0*w[1];
+      fourw[2] = (numtyp)4.0*w[2];
+      spr[0] = (numtyp)0.5*sigma12p3*s[0];
+      spr[1] = (numtyp)0.5*sigma12p3*s[1];
+      spr[2] = (numtyp)0.5*sigma12p3*s[2];
+
+      stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
+              (numtyp)3.0/h12;
+      numtyp hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec);
+      numtyp dspu = (numtyp)1.0/h12-hsec+stemp;
+      numtyp pbsu = (numtyp)3.0*sigma*hsec;
+  
+      stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
+              (numtyp)1.0/(ishape.y*cr60+h12)+
+              (numtyp)1.0/(ishape.z*cr60+h12)+
+              (numtyp)3.0/h12;
+      hsec = (numtyp)1.0/(h12+b_alpha*sec);
+      numtyp dspr = (numtyp)7.0/h12-hsec+stemp;
+      numtyp pbsr = b_alpha*sigma*hsec;
+  
+      #pragma unroll
+      for (int i=0; i<3; i++) {
+        numtyp u[3];
+        u[0] = -rhat[i]*rhat[0];
+        u[1] = -rhat[i]*rhat[1];
+        u[2] = -rhat[i]*rhat[2];
+        u[i] += (numtyp)1.0;
+        u[0] *= rnorm;
+        u[1] *= rnorm;
+        u[2] *= rnorm;
+        numtyp dchi = gpu_dot3(u,fourw);
+        numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        numtyp force=dUr*Ur+dUa*Ua;
+        if (i==0) {
+          f.x+=force;
+          if (vflag>0)
+            virial[0]+=-r[0]*force;
+        } else if (i==1) {
+          f.y+=force;
+          if (vflag>0) {
+            virial[1]+=-r[1]*force;
+            virial[3]+=-r[0]*force;
+          }
+        } else {
+          f.z+=force;
+          if (vflag>0) {
+            virial[2]+=-r[2]*force;
+            virial[4]+=-r[0]*force;
+            virial[5]+=-r[1]*force;
+          }
+        }
+
+      }
+    
+      // torque on i
+      numtyp fwae[3];
+      gpu_row_times3(fourw,aTe,fwae);
+      {
+        numtyp tempv[3], p[3], lAtwo[9];
+        gpu_times_column3(lA_0,rhat,p);
+        gpu_times_column3(lA_0,w,tempv);
+        numtyp dchi = -gpu_dot3(fwae,tempv);
+        gpu_times3(aTs,lA_0,lAtwo);
+        gpu_times_column3(lAtwo,spr,tempv);
+        numtyp dh12 = -gpu_dot3(s,tempv);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        tor.x -= (dUa*Ua+dUr*Ur);
+      }
+
+      {
+        numtyp tempv[3], p[3], lAtwo[9];
+        gpu_times_column3(lA_1,rhat,p);
+        gpu_times_column3(lA_1,w,tempv);
+        numtyp dchi = -gpu_dot3(fwae,tempv);
+        gpu_times3(aTs,lA_1,lAtwo);
+        gpu_times_column3(lAtwo,spr,tempv);
+        numtyp dh12 = -gpu_dot3(s,tempv);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        tor.y -= (dUa*Ua+dUr*Ur);
+      }
+
+      {
+        numtyp tempv[3], p[3], lAtwo[9];
+        gpu_times_column3(lA_2,rhat,p);
+        gpu_times_column3(lA_2,w,tempv);
+        numtyp dchi = -gpu_dot3(fwae,tempv);
+        gpu_times3(aTs,lA_2,lAtwo);
+        gpu_times_column3(lAtwo,spr,tempv);
+        numtyp dh12 = -gpu_dot3(s,tempv);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        tor.z -= (dUa*Ua+dUr*Ur);
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[7][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=tor.x;
+    red_acc[4][tid]=tor.y;
+    red_acc[5][tid]=tor.z;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<6; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    tor.x=red_acc[3][tid];
+    tor.y=red_acc[4][tid];
+    tor.z=red_acc[5][tid];
+
+    if (eflag>0 || vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+      red_acc[6][tid]=energy;
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<7; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+      energy=red_acc[6][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1+=energy;
+      ap1+=astride;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1+=virial[i];
+        ap1+=astride;
+      }
+    }
+    acctyp4 old=ans[ii];
+    old.x+=f.x;
+    old.y+=f.y;
+    old.z+=f.z;
+    ans[ii]=old;
+    
+    old=ans[ii+astride];
+    old.x+=tor.x;
+    old.y+=tor.y;
+    old.z+=tor.z;
+    ans[ii+astride]=old;
+  } // if ii
+
+}
+
+__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
+                               __global numtyp4* shape,__global numtyp4* well, 
+                               __global numtyp *splj, __global numtyp2* sig_eps, 
+                               const int ntypes, __global int *dev_nbor,
+                               const int stride, __global acctyp4 *ans,
+                               __global acctyp *engv, __global int *err_flag,
+                               const int eflag, const int vflag,const int start,
+                               const int inum, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=splj[0];    
+  sp_lj[1]=splj[1];    
+  sp_lj[2]=splj[2];    
+  sp_lj[3]=splj[3];
+  
+  __local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
+  b_alpha=(numtyp)45.0/(numtyp)56.0;
+  cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);    
+  solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0);
+  solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0);
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int j=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *nbor_end=nbor+mul24(stride,numj);
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+  
+    numtyp4 jx=x_[j];
+    int jtype=jx.w;
+
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int i=*nbor;
+      factor_lj = sp_lj[sbmask(i)];
+      i &= NEIGHMASK;
+
+      numtyp4 ix=x_[i];
+      int itype=ix.w;
+
+      numtyp a[9];       // Rotation matrix (lab->body)
+      numtyp aTe[9];     // A'*E
+      numtyp4 ishape;
+    
+      ishape=shape[itype];
+      gpu_quat_to_mat_trans(q,i,a);
+      gpu_transpose_times_diag3(a,well[itype],aTe);
+
+      // Compute r12
+      numtyp r[3], rhat[3];
+      numtyp rnorm;
+      r[0] = ix.x-jx.x;
+      r[1] = ix.y-jx.y;
+      r[2] = ix.z-jx.z;
+      rnorm = gpu_dot3(r,r);
+      rnorm = rsqrt(rnorm);
+      rhat[0] = r[0]*rnorm;
+      rhat[1] = r[1]*rnorm;
+      rhat[2] = r[2]*rnorm;
+
+      numtyp sigma, epsilon;
+      int mtype=mul24(ntypes,itype)+jtype;
+      sigma = sig_eps[mtype].x;
+      epsilon = sig_eps[mtype].y*factor_lj;
+
+      numtyp aTs[9]; 
+      numtyp4 scorrect;
+      numtyp half_sigma=sigma * (numtyp)0.5;
+      scorrect.x = ishape.x+half_sigma;
+      scorrect.y = ishape.y+half_sigma;
+      scorrect.z = ishape.z+half_sigma;
+      scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5;
+      scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5;
+      scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5;
+      gpu_transpose_times_diag3(a,scorrect,aTs);
+      
+      // energy
+
+      numtyp gamma[9], s[3];
+      gpu_times3(aTs,a,gamma);
+      gpu_mldivide3(gamma,rhat,s,err_flag);
+
+      numtyp sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
+      numtyp temp[9], w[3];
+      gpu_times3(aTe,a,temp);
+      temp[0] += (numtyp)1.0;
+      temp[4] += (numtyp)1.0;
+      temp[8] += (numtyp)1.0;
+      gpu_mldivide3(temp,rhat,w,err_flag);
+
+      numtyp h12 = (numtyp)1.0/rnorm-sigma12;
+      numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
+      numtyp sigh = sigma/h12;
+      numtyp tprod = chi*sigh;
+
+      numtyp Ua, Ur;
+      numtyp h12p3 = h12*h12*h12;
+      numtyp sigmap3 = sigma*sigma*sigma;
+      numtyp stemp = h12/(numtyp)2.0;
+      Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
+      numtyp ilshape=ishape.x*ishape.y*ishape.z;
+      Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua;
+      Ua = epsilon*Ua*sigmap3*solv_f_a;
+    
+      stemp = h12/cr60;
+      Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
+           (numtyp)60.0;
+      Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur;
+      numtyp sigh6=sigh*sigh*sigh;
+      sigh6*=sigh6;
+      Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r;
+
+      energy+=Ua+Ur;
+
+      // force
+
+      numtyp fourw[3], spr[3];
+      numtyp sec = sigma*chi;
+      numtyp sigma12p3 = sigma12*sigma12*sigma12;
+      fourw[0] = (numtyp)4.0*w[0];
+      fourw[1] = (numtyp)4.0*w[1];
+      fourw[2] = (numtyp)4.0*w[2];
+      spr[0] = (numtyp)0.5*sigma12p3*s[0];
+      spr[1] = (numtyp)0.5*sigma12p3*s[1];
+      spr[2] = (numtyp)0.5*sigma12p3*s[2];
+
+      stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
+              (numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
+              (numtyp)3.0/h12;
+      numtyp hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec);
+      numtyp dspu = (numtyp)1.0/h12-hsec+stemp;
+      numtyp pbsu = (numtyp)3.0*sigma*hsec;
+  
+      stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
+              (numtyp)1.0/(ishape.y*cr60+h12)+
+              (numtyp)1.0/(ishape.z*cr60+h12)+
+              (numtyp)3.0/h12;
+      hsec = (numtyp)1.0/(h12+b_alpha*sec);
+      numtyp dspr = (numtyp)7.0/h12-hsec+stemp;
+      numtyp pbsr = b_alpha*sigma*hsec;
+  
+      #pragma unroll
+      for (int i=0; i<3; i++) {
+        numtyp u[3];
+        u[0] = -rhat[i]*rhat[0];
+        u[1] = -rhat[i]*rhat[1];
+        u[2] = -rhat[i]*rhat[2];
+        u[i] += (numtyp)1.0;
+        u[0] *= rnorm;
+        u[1] *= rnorm;
+        u[2] *= rnorm;
+        numtyp dchi = gpu_dot3(u,fourw);
+        numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
+        numtyp dUa = pbsu*dchi-dh12*dspu;
+        numtyp dUr = pbsr*dchi-dh12*dspr;
+        numtyp force=dUr*Ur+dUa*Ua;
+        if (i==0) {
+          f.x+=force;
+          if (vflag>0)
+            virial[0]+=-r[0]*force;
+        } else if (i==1) {
+          f.y+=force;
+          if (vflag>0) {
+            virial[1]+=-r[1]*force;
+            virial[3]+=-r[0]*force;
+          }
+        } else {
+          f.z+=force;
+          if (vflag>0) {
+            virial[2]+=-r[2]*force;
+            virial[4]+=-r[0]*force;
+            virial[5]+=-r[1]*force;
+          }
+        }
+
+      }
+    
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[7][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<3; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+
+    if (eflag>0 || vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+      red_acc[6][tid]=energy;
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<7; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+      energy=red_acc[6][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
+                        __global numtyp4* lj3, const int lj_types, 
+                        __global numtyp *gum, 
+                        const int stride, __global int *dev_ij, 
+                        __global acctyp4 *ans, __global acctyp *engv, 
+                        __global int *err_flag, const int eflag, 
+                        const int vflag, const int start, const int inum, 
+                        const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=gum[0];    
+  sp_lj[1]=gum[1];    
+  sp_lj[2]=gum[2];    
+  sp_lj[3]=gum[3];    
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
+  if (ii<inum) {
+    __global int *nbor=dev_ij+ii;
+    int i=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *list_end=nbor+mul24(stride,numj);
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+  
+    numtyp4 ix=x_[i];
+    int itype=ix.w;
+
+    numtyp factor_lj;
+    for ( ; nbor<list_end; nbor+=n_stride) {
+  
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp r2inv = delx*delx+dely*dely+delz*delz;
+        
+      int ii=itype*lj_types+jtype;
+      if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
+        r2inv=(numtyp)1.0/r2inv;
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
+        force*=factor_lj;
+      
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
+          energy+=factor_lj*(e-lj3[ii].z); 
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1+=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1+=virial[i];
+        ap1+=inum;
+      }
+    }
+    acctyp4 old=ans[ii];
+    old.x+=f.x;
+    old.y+=f.y;
+    old.z+=f.z;
+    ans[ii]=old;
+  } // if ii
+}
+
+__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, 
+                             __global numtyp4* lj3_in, __global numtyp *gum, 
+                             const int stride, __global int *dev_ij,
+                             __global acctyp4 *ans, __global acctyp *engv,
+                             __global int *err_flag, const int eflag,
+                             const int vflag, const int start, const int inum,
+                             const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];                              
+  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  if (tid<4)
+    sp_lj[tid]=gum[tid];    
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    if (eflag>0)
+      lj3[tid]=lj3_in[tid];
+  }
+  
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
+  __syncthreads();
+  
+  if (ii<inum) {
+    __global int *nbor=dev_ij+ii;
+    int i=*nbor;
+    nbor+=stride;
+    int numj=*nbor;
+    nbor+=stride;
+    __global int *list_end=nbor+mul24(stride,numj);
+    nbor+=mul24(offset,stride);
+    int n_stride=mul24(t_per_atom,stride);
+
+    numtyp4 ix=x_[i];
+    int iw=ix.w;
+    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+
+    numtyp factor_lj;
+    for ( ; nbor<list_end; nbor+=n_stride) {
+  
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=x_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp r2inv = delx*delx+dely*dely+delz*delz;
+        
+      if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
+        r2inv=(numtyp)1.0/r2inv;
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
+      
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
+          energy+=factor_lj*(e-lj3[mtype].z); 
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1+=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1+=virial[i];
+        ap1+=inum;
+      }
+    }
+    acctyp4 old=ans[ii];
+    old.x+=f.x;
+    old.y+=f.y;
+    old.z+=f.z;
+    ans[ii]=old;
+  } // if ii
+}
+
+#endif
+
--- a/lib/gpu/replace_code.sh
+++ b/lib/gpu/replace_code.sh
@ -0,0 +1,13 @@
+#!/bin/tcsh
+
+set files = `echo *.h *.cpp *.cu`
+rm -rf /tmp/cpp5678
+mkdir /tmp/cpp5678
+mkdir /tmp/cpp5678/lgpu
+
+foreach file ( $files )
+#	/bin/cp $file /tmp/cpp5678/$file:t:t
+	# ------ Sed Replace
+	sed -i 's/atom->dev_engv/ans->dev_engv/g' $file
+end
+