lammps/lib/gpu/gb_gpu.cu

/***************************************************************************
                                 gb_gpu.cu
                             -------------------
                               W. Michael Brown

  Gay-Berne anisotropic potential GPU calculation
             *** Force decomposition by Atom Version ***

 __________________________________________________________________________
    This file is part of the LAMMPS GPU Library
 __________________________________________________________________________

    begin                : Tue Jun 23 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : wmbrown@sandia.gov
 ***************************************************************************/
/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   ----------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "gb_gpu_memory.cu"
#include "gb_gpu_kernel.h"
using namespace std;
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
#define GBMT GB_GPU_Memory<numtyp,acctyp>
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
                                 const int start, const int inum,
                                 const int *dev_ij, const int form_low,
                                 const int form_high, const int nall) {

  // ii indexes the atom i whose neighbor list is packed by this thread
  int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {
    int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;
    nbor+=nbor_pitch;
    int *nbor_newj=nbor;
    nbor+=nbor_pitch;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,7);

    int newj=0;
    for ( ; list<list_end; list++) {
      int j=*list;
      if (j>=nall)
        j%=nall;  // indices >= nall flag special neighbors; recover real index
      int jtype=_x_<numtyp>(j,7);

      if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
        // Compute the squared distance between i and j
        numtyp rsq=_x_<numtyp>(j,0)-ix;
        rsq*=rsq;
        numtyp t=_x_<numtyp>(j,1)-iy;
        rsq+=t*t;
        t=_x_<numtyp>(j,2)-iz;
        rsq+=t*t;

        if (rsq<_cutsq_<numtyp>(itype,jtype)) {
          *nbor=j;
          nbor+=nbor_pitch;
          newj++;
        }
      }
    }
    *nbor_newj=newj;
  }
}
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
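// Note: the shared-memory staging below is valid only if the block size
// (BLOCK_1D) is at least MAX_SHARED_TYPES*MAX_SHARED_TYPES, since only the
// first threads of each block fill the shared arrays.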
template<class numtyp>
__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
                                      const int start, const int inum,
                                      const int *dev_ij, const int form_low,
                                      const int form_high, const int nall) {

  int ii=threadIdx.x;

  // Stage the per-type-pair form and cutoff tables in shared memory
  __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    form[ii]=_form_(itype,jtype);
  }
  __syncthreads();  // all threads read the shared tables below

  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {
    int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;
    nbor+=nbor_pitch;
    int *nbor_newj=nbor;
    nbor+=nbor_pitch;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));

    int newj=0;
    for ( ; list<list_end; list++) {
      int j=*list;
      if (j>=nall)
        j%=nall;  // indices >= nall flag special neighbors; recover real index
      int mtype=itype+_x_<numtyp>(j,7);

      if (form[mtype]>=form_low && form[mtype]<=form_high) {
        // Compute the squared distance between i and j
        numtyp rsq=_x_<numtyp>(j,0)-ix;
        rsq*=rsq;
        numtyp t=_x_<numtyp>(j,1)-iy;
        rsq+=t*t;
        t=_x_<numtyp>(j,2)-iz;
        rsq+=t*t;

        if (rsq<cutsq[mtype]) {
          *nbor=j;
          nbor+=nbor_pitch;
          newj++;
        }
      }
    }
    *nbor_newj=newj;
  }
}
template<class numtyp, class acctyp>
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
                const int inum, const int form_low, const int form_high) {
  if (gbm.shared_types)
    kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
       gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
  else
    kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
       gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
}
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
template <class t>
inline string gb_gpu_toa(const t& in) {
  ostringstream o;
  o.precision(2);
  o << in;
  return o.str();
}
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
string gb_gpu_name(const int id, const int max_nbors) {
  string name=GBMF[0].gpu.name(id)+", "+
              gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
              gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
              gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHz, "+
              gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
                                               max_nbors))+" Atoms";
  return name;
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
                  const double upsilon, const double mu, double **shape,
                  double **well, double **cutsq, double **sigma,
                  double **epsilon, double *host_lshape, int **form,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int max_nbors, const int thread, const int gpu_id) {
  assert(thread<MAX_GPU_THREADS);

  if (GBMF[thread].gpu.num_devices()==0)
    return 0;

  ij_size=IJ_SIZE;
  return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
                           well, cutsq, sigma, epsilon, host_lshape, form,
                           host_lj1, host_lj2, host_lj3, host_lj4, offset,
                           special_lj, max_nbors, false, gpu_id);
}
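// Illustrative host-side call sequence per timestep (a sketch only; the
// chunked ij transfers, argument setup, and error handling are omitted,
// and the host arrays named below are placeholders):
//
//   int ij_size;
//   gb_gpu_init(ij_size,ntypes,gamma,upsilon,mu,shape,well,cutsq,sigma,
//               epsilon,lshape,form,lj1,lj2,lj3,lj4,offset,special_lj,
//               max_nbors,/*thread=*/0,gpu_id);
//   bool success;
//   int *olist=gb_gpu_reset_nbors(nall,nlocal,inum,ilist,numj,type,0,success);
//   gb_gpu_nbors(num_ij,eflag,0);            // repeat per ij_size chunk
//   gb_gpu_atom(x,quat,type,rebuild,0);
//   gb_gpu_gayberne(eflag,vflag,rebuild,0);
//   double evdw=gb_gpu_forces(f,tor,olist,eflag,vflag,eflag_atom,vflag_atom,
//                             eatom,vatom,virial,0);
//   gb_gpu_clear(0);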
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear(const int thread) {
  GBMF[thread].clear();
}
// ---------------------------------------------------------------------------
// Copy atom positions, quaternions, and optionally types to device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
                         double **host_quat, const int *host_type,
                         const bool rebuild, cudaStream_t &stream) {
  atom.time_atom.start();
  atom.reset_write_buffer();

  // Rows 0-2 of dev_x hold the position, rows 3-6 the quaternion, and
  // row 7 the type (only copied when the neighbor list was rebuilt)
  atom.add_atom_data(host_x[0],3);
  atom.add_atom_data(host_x[0]+1,3);
  atom.add_atom_data(host_x[0]+2,3);
  atom.add_atom_data(host_quat[0],4);
  atom.add_atom_data(host_quat[0]+1,4);
  atom.add_atom_data(host_quat[0]+2,4);
  atom.add_atom_data(host_quat[0]+3,4);
  int csize=7;

  // If a rebuild occurred, copy type data
  if (rebuild) {
    atom.add_atom_data(host_type);
    csize++;
  }

  atom.copy_atom_data(csize,stream);
  atom.time_atom.stop();
}

void gb_gpu_atom(double **host_x, double **host_quat,
                 const int *host_type, const bool rebuild, const int thread) {
  _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
               GBMF[thread].pair_stream);
}
// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// ---------------------------------------------------------------------------
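// When multiple particle forms are present, the host neighbor list is
// reordered so that atoms whose self form is ELLIPSE_ELLIPSE occupy indices
// 0..last_ellipse-1 and sphere-like atoms follow; the force routines can
// then launch each kernel on a contiguous range of atoms.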
template <class gbmtyp>
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
                          const int inum, int *ilist, const int *numj,
                          const int *type, bool &success) {
  if (nall>gbm.max_atoms) {
    success=false;
    return 0;
  }
  success=true;

  gbm.nbor.time_nbor.start();
  gbm.atom.nall(nall);
  gbm.atom.inum(inum);

  if (gbm.multiple_forms) {
    int p=0, acc=0;
    for (int i=0; i<inum; i++) {
      int itype=type[ilist[i]];
      if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
        gbm.host_olist[p]=ilist[i];
        gbm.nbor.host_ij[p]=numj[ilist[i]];
        gbm.nbor.host_ij[p+inum]=acc;
        acc+=numj[ilist[i]];
        p++;
      }
    }
    gbm.last_ellipse=p;
    for (int i=0; i<inum; i++) {
      int itype=type[ilist[i]];
      if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
        gbm.host_olist[p]=ilist[i];
        gbm.nbor.host_ij[p]=numj[ilist[i]];
        gbm.nbor.host_ij[p+inum]=acc;
        acc+=numj[ilist[i]];
        p++;
      }
    }
    gbm.nbor.ij_total=0;
    gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
    gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
                                      gbm.nbor.dev_nbor.row_size(),
                                      gbm.nbor.dev_nbor.row_size(),2,inum,
                                      gbm.pair_stream);
  } else {
    gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
    gbm.last_ellipse=inum;
  }
  gbm.nbor.time_nbor.stop();

  if (gbm.multiple_forms)
    return gbm.host_olist.begin();
  return ilist;
}

int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
                         int *ilist, const int *numj, const int *type,
                         const int thread, bool &success) {
  return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
                             success);
}
// ---------------------------------------------------------------------------
// Copy a set of ij_size ij interactions to device and compute energies,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
  gbm.nbor.time_nbor.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed

  gbm.nbor.time_nbor.start();
  gbm.nbor.add(num_ij,gbm.pair_stream);
  gbm.nbor.time_nbor.stop();
}

void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
  _gb_gpu_nbors(GBMF[thread],num_ij,eflag);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques for all ij interactions
// ---------------------------------------------------------------------------
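// With multiple forms, three passes are made over contiguous atom ranges
// (see _gb_gpu_reset_nbors): kernel_gayberne for the ellipse atoms,
// kernel_sphere_gb for sphere atoms with ellipse neighbors, and
// kernel_lj/kernel_lj_fast for the remaining sphere-sphere interactions.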
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
                      const bool rebuild) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=BLOCK_1D;
  int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));

  if (gbm.multiple_forms) {
    gbm.time_kernel.start();
    if (gbm.last_ellipse>0) {
      // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
      GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
                               static_cast<double>(BX)));
      pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,
                 ELLIPSE_ELLIPSE);
      gbm.time_kernel.stop();

      gbm.time_gayberne.start();
      kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
         gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
         eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
      gbm.time_gayberne.stop();

      if (gbm.last_ellipse==gbm.atom.inum()) {
        // No sphere atoms; start/stop the remaining timers so the totals
        // accumulated in the force routine stay balanced
        gbm.time_kernel2.start();
        gbm.time_kernel2.stop();
        gbm.time_gayberne2.start();
        gbm.time_gayberne2.stop();
        gbm.time_pair.start();
        gbm.time_pair.stop();
        return;
      }

      // ------------ SPHERE_ELLIPSE ---------------
      gbm.time_kernel2.start();
      GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
                               gbm.last_ellipse)/BX));
      pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
                 ELLIPSE_SPHERE);
      gbm.time_kernel2.stop();

      gbm.time_gayberne2.start();
      kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
         gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
         eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
      gbm.time_gayberne2.stop();
    } else {
      // No ellipse atoms; zero the answers and keep the timers balanced
      gbm.atom.ans.zero();
      gbm.time_kernel.stop();
      gbm.time_gayberne.start();
      gbm.time_gayberne.stop();
      gbm.time_kernel2.start();
      gbm.time_kernel2.stop();
      gbm.time_gayberne2.start();
      gbm.time_gayberne2.stop();
    }

    // ------------ LJ ---------------
    gbm.time_pair.start();
    if (gbm.last_ellipse<gbm.atom.inum()) {
      if (gbm.shared_types)
        kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
           gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
           gbm.atom.inum(), gbm.atom.nall());
      else
        kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
           gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
           gbm.atom.inum(), gbm.atom.nall());
    }
    gbm.time_pair.stop();
  } else {
    gbm.time_kernel.start();
    pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
    gbm.time_kernel.stop();

    gbm.time_gayberne.start();
    kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
      (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
       gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
       gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
       eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
    gbm.time_gayberne.stop();
  }
}

void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
                     const int thread) {
  _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
}
// ---------------------------------------------------------------------------
// Get energies, forces, and torques to host
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
                      const bool eflag, const bool vflag,
                      const bool eflag_atom, const bool vflag_atom,
                      double *eatom, double **vatom, double *virial) {
  double evdw;

  gbm.atom.time_answer.start();
  gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);

  gbm.atom.time_atom.add_to_total();
  gbm.nbor.time_nbor.add_to_total();
  gbm.time_kernel.add_to_total();
  gbm.time_gayberne.add_to_total();
  if (gbm.multiple_forms) {
    gbm.time_kernel2.add_to_total();
    gbm.time_gayberne2.add_to_total();
    gbm.time_pair.add_to_total();
  }
  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed

  evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
  gbm.atom.add_forces(ilist,f);
  gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
  gbm.atom.time_answer.stop();
  gbm.atom.time_answer.add_to_total();

  return evdw;
}

double gb_gpu_forces(double **f, double **tor, const int *ilist,
                     const bool eflag, const bool vflag, const bool eflag_atom,
                     const bool vflag_atom, double *eatom, double **vatom,
                     double *virial, const int thread) {
  return _gb_gpu_forces<PRECISION,ACC_PRECISION>
    (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
     vflag_atom,eatom,vatom,virial);
}
void gb_gpu_time(const int i) {
  cout.precision(4);
  cout << "Atom copy:     " << GBMF[i].atom.time_atom.total_seconds()
       << " s.\n"
       << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds()
       << " s.\n"
       << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
                               GBMF[i].time_kernel2.total_seconds() << " s.\n"
       << "Force calc:    " << GBMF[i].time_gayberne.total_seconds()+
                               GBMF[i].time_gayberne2.total_seconds() << " s.\n";
  if (GBMF[i].multiple_forms)
    cout << "LJ calc:       " << GBMF[i].time_pair.total_seconds() << " s.\n";
  cout << "Answer copy:   " << GBMF[i].atom.time_answer.total_seconds()
       << " s.\n";
}
int gb_gpu_num_devices() {
  return GBMF[0].gpu.num_devices();
}

double gb_gpu_bytes() {
  return GBMF[0].host_memory_usage();
}