lammps/lib/gpu/pair_gpu_nbor.cpp

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
 
/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
------------------------------------------------------------------------- */

#include "pair_gpu_precision.h"
#include "pair_gpu_nbor.h"
#include "math.h"

#ifdef USE_OPENCL
#include "pair_gpu_nbor_cl.h"
#else
#include "pair_gpu_nbor_ptx.h"
#include "pair_gpu_build_ptx.h"
#endif

int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
  if (_gpu_nbor)
    return (max_nbors+2)*sizeof(int);
  else if (_use_packing)
    return ((max_nbors+2)*2)*sizeof(int);
  else
    return (max_nbors+3)*sizeof(int);
}

bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, 
                       const int maxspecial, UCL_Device &devi, 
                       const bool gpu_nbor, const int gpu_host, 
                       const bool pre_cut) {
  clear();

  dev=&devi;
  _gpu_nbor=gpu_nbor;
  if (gpu_host==0)
    _gpu_host=false;
  else if (gpu_host==1)
    _gpu_host=true;
  else 
    // Not yet implemented
    assert(0==1);
  
  if (pre_cut || gpu_nbor==false)
    _alloc_packed=true;
  else
    _alloc_packed=false;

  bool success=true;
    
  // Initialize timers for the selected GPU
  time_nbor.init(*dev);
  time_kernel.init(*dev);
  time_nbor.zero();
  time_kernel.zero();

  _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
  if (_max_atoms==0)
    _max_atoms=1000;
    
  _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
  _max_nbors=max_nbors;

  _maxspecial=maxspecial;
  if (gpu_nbor==false)
    _maxspecial=0;

  if (gpu_nbor==false)
    success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
                                          UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
  alloc(success);
  if (_use_packing==false)
    compile_kernels(devi);

  return success;
}

void PairGPUNbor::alloc(bool &success) { 
  dev_nbor.clear();
  host_acc.clear();
  if (_use_packing==false || _gpu_nbor) 
    success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
  else 
    success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
  success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev,
                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);

  _c_bytes=dev_nbor.row_bytes();
  if (_alloc_packed) {
    dev_packed.clear();
    success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
                                         UCL_READ_ONLY)==UCL_SUCCESS);
    _c_bytes+=dev_packed.row_bytes();                                         
  } 
  if (_max_host>0) {
    host_nbor.clear();
    dev_host_nbor.clear();
    success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev,
                                        UCL_RW_OPTIMIZED)==UCL_SUCCESS);
    success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host,
                                            *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
    _c_bytes+=dev_host_nbor.row_bytes();
  }
  if (_maxspecial>0) {
    dev_nspecial.clear();
    dev_special.clear();
    dev_special_t.clear();
    int at=_max_atoms+_max_host;
    success=success && (dev_nspecial.alloc(3*at,*dev,
                                           UCL_READ_ONLY)==UCL_SUCCESS);
    success=success && (dev_special.alloc(_maxspecial*at,*dev,
                                          UCL_READ_ONLY)==UCL_SUCCESS);
    success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
                                            UCL_READ_ONLY)==UCL_SUCCESS);
    _gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
                dev_special_t.row_bytes();
  }

  _allocated=true;
}
  
void PairGPUNbor::clear() {
  _gpu_bytes=0.0;
  _cell_bytes=0.0;
  _c_bytes=0.0;
  if (_allocated) {
    _allocated=false;

    host_packed.clear();
    host_acc.clear();
    dev_nbor.clear();
    dev_host_nbor.clear();
    dev_packed.clear();
    host_nbor.clear();
    dev_nspecial.clear();
    dev_special.clear();
    dev_special_t.clear();

    time_kernel.clear();
    time_nbor.clear();
  }

  if (_compiled) {
    if (_gpu_nbor) {
      k_cell_id.clear();
      k_cell_counts.clear();
      k_build_nbor.clear();
      k_transpose.clear();
      k_special.clear();
      delete build_program;
    } else {
      k_nbor.clear();
      delete nbor_program;
    }
    _compiled=false;
  }
}

double PairGPUNbor::host_memory_usage() const {
  if (_gpu_nbor) {
    if (_gpu_host)
      return host_nbor.row_bytes()*host_nbor.rows();
    else
      return 0;
  } else 
    return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+
           sizeof(PairGPUNbor);
}

void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
                           int **firstneigh, const int block_size) {  
  time_nbor.start();

  UCL_H_Vec<int> ilist_view;
  ilist_view.view(ilist,inum,*dev);
  ucl_copy(dev_nbor,ilist_view,true);

  UCL_D_Vec<int> nbor_offset;
  UCL_H_Vec<int> host_offset;

  int copy_count=0;
  int ij_count=0;
  int acc_count=0;
  int dev_count=0;
  int *h_ptr=host_packed.begin();
  _nbor_pitch=inum;
  
  for (int ii=0; ii<inum; ii++) {
    int i=ilist[ii];
    int nj=numj[i];
    host_acc[ii]=nj;
    host_acc[ii+inum]=acc_count;

    acc_count+=nj;
    
    int *jlist=firstneigh[i];
    for (int jj=0; jj<nj; jj++) {
      *h_ptr=jlist[jj];
      h_ptr++;
      ij_count++;
       
      if (ij_count==IJ_SIZE) {
        dev_nbor.sync();
        host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
        nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
        ucl_copy(nbor_offset,host_offset,true);
        copy_count++;
        ij_count=0;
        dev_count+=IJ_SIZE;
        h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
      }
    }
  }
  if (ij_count!=0) {
    dev_nbor.sync();
    host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
    nbor_offset.view_offset(dev_count,dev_packed,ij_count);
    ucl_copy(nbor_offset,host_offset,true);
  }
  UCL_D_Vec<int> acc_view;
  acc_view.view_offset(inum,dev_nbor,inum*2);
  ucl_copy(acc_view,host_acc,true);
  time_nbor.stop();
  
  if (_use_packing==false) {
    time_kernel.start();
    int GX=static_cast<int>(ceil(static_cast<double>(inum)/block_size));
    k_nbor.set_size(GX,block_size);
    k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
    time_kernel.stop();
  }
}

void PairGPUNbor::compile_kernels(UCL_Device &dev) {
  std::string flags="-cl-fast-relaxed-math -cl-mad-enable";

  if (_gpu_nbor==false) {
    nbor_program=new UCL_Program(dev);
    nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());
    k_nbor.set_function(*nbor_program,"kernel_unpack");
  } else {
    build_program=new UCL_Program(dev);
    #ifdef USE_OPENCL
    std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
    exit(1);
    #else
    build_program->load_string(pair_gpu_build_kernel,flags.c_str());
    #endif
    k_cell_id.set_function(*build_program,"calc_cell_id");
    k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
    k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
    k_transpose.set_function(*build_program,"transpose");
    k_special.set_function(*build_program,"kernel_special");
    neigh_tex.get_texture(*build_program,"neigh_tex");
  }
  _compiled=true;
}

template <class numtyp, class acctyp>
void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
                                  const int nall, 
                                  PairGPUAtom<numtyp,acctyp> &atom, 
                                  double *boxlo, double *boxhi, int *tag, 
                                  int **nspecial, int **special, bool &success,
                                  int &mn) {
  const int nt=inum+host_inum;

  if (_maxspecial>0) {
    time_nbor.start();
    UCL_H_Vec<int> view_nspecial, view_special, view_tag;
    view_nspecial.view(nspecial[0],nt*3,*dev);
    view_special.view(special[0],nt*_maxspecial,*dev);
    view_tag.view(tag,nall,*dev);
    ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
    ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
    ucl_copy(atom.dev_tag,view_tag,nall,false);
    time_nbor.stop();
    time_nbor.add_to_total();
    time_kernel.start();
    const int b2x=8;
    const int b2y=8;
    const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
    const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
    k_transpose.set_size(g2x,g2y,b2x,b2y);
    k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial,
                    &nt);        
  } else
    time_kernel.start();

  _nbor_pitch=inum;
  neigh_tex.bind_float(atom.dev_x,4);

  int ncellx, ncelly, ncellz, ncell_3d;
  ncellx = static_cast<int>(ceil(((boxhi[0] - boxlo[0]) +
                                  2.0*_cell_size)/_cell_size));
  ncelly = static_cast<int>(ceil(((boxhi[1] - boxlo[1]) +
                                  2.0*_cell_size)/_cell_size));
  ncellz = static_cast<int>(ceil(((boxhi[2] - boxlo[2]) +
                                  2.0*_cell_size)/_cell_size));
  ncell_3d = ncellx * ncelly * ncellz;
  UCL_D_Vec<int> cell_counts;
  cell_counts.alloc(ncell_3d+1,dev_nbor);
  _cell_bytes=cell_counts.row_bytes();

  /* build cell list on GPU */
  const int neigh_block=128;
  const int GX=(int)ceil((float)nall/neigh_block);
  const numtyp boxlo0=static_cast<numtyp>(boxlo[0]);
  const numtyp boxlo1=static_cast<numtyp>(boxlo[1]);
  const numtyp boxlo2=static_cast<numtyp>(boxlo[2]);
  const numtyp boxhi0=static_cast<numtyp>(boxhi[0]);
  const numtyp boxhi1=static_cast<numtyp>(boxhi[1]);
  const numtyp boxhi2=static_cast<numtyp>(boxhi[2]);
  const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
  k_cell_id.set_size(GX,neigh_block);
  k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), 
                &atom.dev_particle_id.begin(),
  				      &boxlo0, &boxlo1, &boxlo2, &boxhi0, &boxhi1, 
  				      &boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall);

  atom.sort_neighbor(nall);

  /* calculate cell count */
  k_cell_counts.set_size(GX,neigh_block);
  k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall, 
                    &ncell_3d);

  /* build the neighbor list */
  const int cell_block=64;
  k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
  k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
                   &cell_counts.begin(), &dev_nbor.begin(),
                   &dev_host_nbor.begin(), &_max_nbors, &cell_size_cast,
                   &ncellx, &ncelly, &ncellz, &inum, &nt, &nall);

  /* Get the maximum number of nbors and realloc if necessary */
  UCL_D_Vec<int> numj;
  numj.view_offset(inum,dev_nbor,inum);
  ucl_copy(host_acc,numj,inum,false);
  if (nt>inum) {
    UCL_H_Vec<int> host_offset;
    host_offset.view_offset(inum,host_acc,nt-inum);
    ucl_copy(host_offset,dev_host_nbor,nt-inum,false);
  }
  mn=host_acc[0];
  for (int i=1; i<nt; i++)
    mn=std::max(mn,host_acc[i]);

  if (mn>_max_nbors) {  
    mn=static_cast<int>(static_cast<double>(mn)*1.10);
    dev_nbor.clear();
    success=success && (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_cell_id,
                        UCL_READ_ONLY)==UCL_SUCCESS);
    _gpu_bytes=dev_nbor.row_bytes();
    if (_max_host>0) {
      host_nbor.clear();
      dev_host_nbor.clear();
      success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor,
                                          UCL_RW_OPTIMIZED)==UCL_SUCCESS);
      success=success && (dev_host_nbor.alloc((mn+1)*_max_host,
                                        dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
      _gpu_bytes+=dev_host_nbor.row_bytes();
    }
    if (_alloc_packed) {
      dev_packed.clear();
      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
                                           UCL_READ_ONLY)==UCL_SUCCESS);
      _gpu_bytes+=dev_packed.row_bytes();
    }
    if (!success)
      return;
    _max_nbors=mn;
    time_kernel.stop();
    time_kernel.add_to_total();
    build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial,
                    special, success, mn);
    return;
  }
  
  if (_maxspecial>0) {
    const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
    k_special.set_size(GX2,cell_block);
    k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), 
                  &atom.dev_tag.begin(), &dev_nspecial.begin(), 
                  &dev_special.begin(), &inum, &nt, &nall);
  }
  time_kernel.stop();

  time_nbor.start();
  if (_gpu_host)
    ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);
  time_nbor.stop();
}

template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>
     (const int inum, const int host_inum, const int nall, 
      PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *boxlo, double *boxhi,
      int *, int **, int **, bool &success, int &mn);
Changes from Mike Brown. git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5277 f3b2605a-c512-4ea7-a41b-209d697bcdaa 2010-11-23 08:40:35 +08:00			`/* ----------------------------------------------------------------------`
			`LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator`
			`http://lammps.sandia.gov, Sandia National Laboratories`
			`Steve Plimpton, sjplimp@sandia.gov`

			`Copyright (2003) Sandia Corporation. Under the terms of Contract`
			`DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains`
			`certain rights in this software. This software is distributed under`
			`the GNU General Public License.`

			`See the README file in the top-level LAMMPS directory.`
			`------------------------------------------------------------------------- */`
Getting rid of extra CR characters at ends of lines. git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5285 f3b2605a-c512-4ea7-a41b-209d697bcdaa 2010-11-24 03:52:03 +08:00
Changes from Mike Brown. git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5277 f3b2605a-c512-4ea7-a41b-209d697bcdaa 2010-11-23 08:40:35 +08:00			`/* ----------------------------------------------------------------------`
			`Contributing authors: Mike Brown (ORNL), brownw@ornl.gov`
			`Peng Wang (Nvidia), penwang@nvidia.com`
			`------------------------------------------------------------------------- */`

			`#include "pair_gpu_precision.h"`
			`#include "pair_gpu_nbor.h"`
			`#include "math.h"`

			`#ifdef USE_OPENCL`
			`#include "pair_gpu_nbor_cl.h"`
			`#else`
			`#include "pair_gpu_nbor_ptx.h"`
			`#include "pair_gpu_build_ptx.h"`
			`#endif`

			`int PairGPUNbor::bytes_per_atom(const int max_nbors) const {`
			`if (_gpu_nbor)`
			`return (max_nbors+2)*sizeof(int);`
			`else if (_use_packing)`
			`return ((max_nbors+2)2)sizeof(int);`
			`else`
			`return (max_nbors+3)*sizeof(int);`
			`}`

			`bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,`
			`const int maxspecial, UCL_Device &devi,`
			`const bool gpu_nbor, const int gpu_host,`
			`const bool pre_cut) {`
			`clear();`

			`dev=&devi;`
			`_gpu_nbor=gpu_nbor;`
			`if (gpu_host==0)`
			`_gpu_host=false;`
			`else if (gpu_host==1)`
			`_gpu_host=true;`
			`else`
			`// Not yet implemented`
			`assert(0==1);`

			`if (pre_cut \|\| gpu_nbor==false)`
			`_alloc_packed=true;`
			`else`
			`_alloc_packed=false;`

			`bool success=true;`

			`// Initialize timers for the selected GPU`
			`time_nbor.init(*dev);`
			`time_kernel.init(*dev);`
			`time_nbor.zero();`
			`time_kernel.zero();`

			`_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);`
			`if (_max_atoms==0)`
			`_max_atoms=1000;`

			`_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);`
			`_max_nbors=max_nbors;`

			`_maxspecial=maxspecial;`
			`if (gpu_nbor==false)`
			`_maxspecial=0;`

			`if (gpu_nbor==false)`
			`success=success && (host_packed.alloc(2IJ_SIZE,dev,`
			`UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);`
			`alloc(success);`
			`if (_use_packing==false)`
			`compile_kernels(devi);`

			`return success;`
			`}`

			`void PairGPUNbor::alloc(bool &success) {`
			`dev_nbor.clear();`
			`host_acc.clear();`
			`if (_use_packing==false \|\| _gpu_nbor)`
			`success=success && (dev_nbor.alloc((_max_nbors+2)_max_atoms,dev,`
			`UCL_READ_ONLY)==UCL_SUCCESS);`
			`else`
			`success=success && (dev_nbor.alloc(3_max_atoms,dev,`
			`UCL_READ_ONLY)==UCL_SUCCESS);`
			`success=success && (host_acc.alloc((_max_atoms+_max_host)2,dev,`
			`UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);`

			`_c_bytes=dev_nbor.row_bytes();`
			`if (_alloc_packed) {`
			`dev_packed.clear();`
			`success=success && (dev_packed.alloc((_max_nbors+2)_max_atoms,dev,`
			`UCL_READ_ONLY)==UCL_SUCCESS);`
			`_c_bytes+=dev_packed.row_bytes();`
			`}`
			`if (_max_host>0) {`
			`host_nbor.clear();`
			`dev_host_nbor.clear();`
			`success=success && (host_nbor.alloc((_max_nbors+1)_max_host,dev,`
			`UCL_RW_OPTIMIZED)==UCL_SUCCESS);`
			`success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host,`
			`*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);`
			`_c_bytes+=dev_host_nbor.row_bytes();`
			`}`
			`if (_maxspecial>0) {`
			`dev_nspecial.clear();`
			`dev_special.clear();`
			`dev_special_t.clear();`
			`int at=_max_atoms+_max_host;`
			`success=success && (dev_nspecial.alloc(3at,dev,`
			`UCL_READ_ONLY)==UCL_SUCCESS);`
			`success=success && (dev_special.alloc(_maxspecialat,dev,`
			`UCL_READ_ONLY)==UCL_SUCCESS);`
			`success=success && (dev_special_t.alloc(_maxspecialat,dev,`
			`UCL_READ_ONLY)==UCL_SUCCESS);`
			`_gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+`
			`dev_special_t.row_bytes();`
			`}`

			`_allocated=true;`
			`}`

			`void PairGPUNbor::clear() {`
			`_gpu_bytes=0.0;`
			`_cell_bytes=0.0;`
			`_c_bytes=0.0;`
			`if (_allocated) {`
			`_allocated=false;`

			`host_packed.clear();`
			`host_acc.clear();`
			`dev_nbor.clear();`
			`dev_host_nbor.clear();`
			`dev_packed.clear();`
			`host_nbor.clear();`
			`dev_nspecial.clear();`
			`dev_special.clear();`
			`dev_special_t.clear();`

			`time_kernel.clear();`
			`time_nbor.clear();`
			`}`

			`if (_compiled) {`
			`if (_gpu_nbor) {`
			`k_cell_id.clear();`
			`k_cell_counts.clear();`
			`k_build_nbor.clear();`
			`k_transpose.clear();`
			`k_special.clear();`
			`delete build_program;`
			`} else {`
			`k_nbor.clear();`
			`delete nbor_program;`
			`}`
			`_compiled=false;`
			`}`
			`}`

			`double PairGPUNbor::host_memory_usage() const {`
			`if (_gpu_nbor) {`
			`if (_gpu_host)`
			`return host_nbor.row_bytes()*host_nbor.rows();`
			`else`
			`return 0;`
			`} else`
			`return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+`
			`sizeof(PairGPUNbor);`
			`}`

			`void PairGPUNbor::get_host(const int inum, int ilist, int numj,`
			`int **firstneigh, const int block_size) {`
			`time_nbor.start();`

			`UCL_H_Vec<int> ilist_view;`
			`ilist_view.view(ilist,inum,*dev);`
			`ucl_copy(dev_nbor,ilist_view,true);`

			`UCL_D_Vec<int> nbor_offset;`
			`UCL_H_Vec<int> host_offset;`

			`int copy_count=0;`
			`int ij_count=0;`
			`int acc_count=0;`
			`int dev_count=0;`
			`int *h_ptr=host_packed.begin();`
			`_nbor_pitch=inum;`

			`for (int ii=0; ii<inum; ii++) {`
			`int i=ilist[ii];`
			`int nj=numj[i];`
			`host_acc[ii]=nj;`
			`host_acc[ii+inum]=acc_count;`

			`acc_count+=nj;`

			`int *jlist=firstneigh[i];`
			`for (int jj=0; jj<nj; jj++) {`
			`*h_ptr=jlist[jj];`
			`h_ptr++;`
			`ij_count++;`

			`if (ij_count==IJ_SIZE) {`
			`dev_nbor.sync();`
			`host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);`
			`nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);`
			`ucl_copy(nbor_offset,host_offset,true);`
			`copy_count++;`
			`ij_count=0;`
			`dev_count+=IJ_SIZE;`
			`h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));`
			`}`
			`}`
			`}`
			`if (ij_count!=0) {`
			`dev_nbor.sync();`
			`host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);`
			`nbor_offset.view_offset(dev_count,dev_packed,ij_count);`
			`ucl_copy(nbor_offset,host_offset,true);`
			`}`
			`UCL_D_Vec<int> acc_view;`
			`acc_view.view_offset(inum,dev_nbor,inum*2);`
			`ucl_copy(acc_view,host_acc,true);`
			`time_nbor.stop();`

			`if (_use_packing==false) {`
			`time_kernel.start();`
			`int GX=static_cast<int>(ceil(static_cast<double>(inum)/block_size));`
			`k_nbor.set_size(GX,block_size);`
			`k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);`
			`time_kernel.stop();`
			`}`
			`}`

			`void PairGPUNbor::compile_kernels(UCL_Device &dev) {`
			`std::string flags="-cl-fast-relaxed-math -cl-mad-enable";`

			`if (_gpu_nbor==false) {`
			`nbor_program=new UCL_Program(dev);`
			`nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());`
			`k_nbor.set_function(*nbor_program,"kernel_unpack");`
			`} else {`
			`build_program=new UCL_Program(dev);`
			`#ifdef USE_OPENCL`
			`std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";`
			`exit(1);`
			`#else`
			`build_program->load_string(pair_gpu_build_kernel,flags.c_str());`
			`#endif`
			`k_cell_id.set_function(*build_program,"calc_cell_id");`
			`k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");`
			`k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");`
			`k_transpose.set_function(*build_program,"transpose");`
			`k_special.set_function(*build_program,"kernel_special");`
			`neigh_tex.get_texture(*build_program,"neigh_tex");`
			`}`
			`_compiled=true;`
			`}`

			`template <class numtyp, class acctyp>`
			`void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,`
			`const int nall,`
			`PairGPUAtom<numtyp,acctyp> &atom,`
			`double boxlo, double boxhi, int *tag,`
			`int nspecial, int special, bool &success,`
			`int &mn) {`
			`const int nt=inum+host_inum;`

			`if (_maxspecial>0) {`
			`time_nbor.start();`
			`UCL_H_Vec<int> view_nspecial, view_special, view_tag;`
			`view_nspecial.view(nspecial[0],nt3,dev);`
			`view_special.view(special[0],nt_maxspecial,dev);`
			`view_tag.view(tag,nall,*dev);`
			`ucl_copy(dev_nspecial,view_nspecial,nt*3,false);`
			`ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);`
			`ucl_copy(atom.dev_tag,view_tag,nall,false);`
			`time_nbor.stop();`
			`time_nbor.add_to_total();`
			`time_kernel.start();`
			`const int b2x=8;`
			`const int b2y=8;`
			`const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));`
			`const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));`
			`k_transpose.set_size(g2x,g2y,b2x,b2y);`
			`k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial,`
			`&nt);`
			`} else`
			`time_kernel.start();`

			`_nbor_pitch=inum;`
			`neigh_tex.bind_float(atom.dev_x,4);`

			`int ncellx, ncelly, ncellz, ncell_3d;`
			`ncellx = static_cast<int>(ceil(((boxhi[0] - boxlo[0]) +`
			`2.0*_cell_size)/_cell_size));`
			`ncelly = static_cast<int>(ceil(((boxhi[1] - boxlo[1]) +`
			`2.0*_cell_size)/_cell_size));`
			`ncellz = static_cast<int>(ceil(((boxhi[2] - boxlo[2]) +`
			`2.0*_cell_size)/_cell_size));`
			`ncell_3d = ncellx * ncelly * ncellz;`
			`UCL_D_Vec<int> cell_counts;`
			`cell_counts.alloc(ncell_3d+1,dev_nbor);`
			`_cell_bytes=cell_counts.row_bytes();`

			`/* build cell list on GPU */`
			`const int neigh_block=128;`
			`const int GX=(int)ceil((float)nall/neigh_block);`
			`const numtyp boxlo0=static_cast<numtyp>(boxlo[0]);`
			`const numtyp boxlo1=static_cast<numtyp>(boxlo[1]);`
			`const numtyp boxlo2=static_cast<numtyp>(boxlo[2]);`
			`const numtyp boxhi0=static_cast<numtyp>(boxhi[0]);`
			`const numtyp boxhi1=static_cast<numtyp>(boxhi[1]);`
			`const numtyp boxhi2=static_cast<numtyp>(boxhi[2]);`
			`const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);`
			`k_cell_id.set_size(GX,neigh_block);`
			`k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),`
			`&atom.dev_particle_id.begin(),`
			`&boxlo0, &boxlo1, &boxlo2, &boxhi0, &boxhi1,`
			`&boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall);`

			`atom.sort_neighbor(nall);`

			`/* calculate cell count */`
			`k_cell_counts.set_size(GX,neigh_block);`
			`k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall,`
			`&ncell_3d);`

			`/* build the neighbor list */`
			`const int cell_block=64;`
			`k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);`
			`k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),`
			`&cell_counts.begin(), &dev_nbor.begin(),`
			`&dev_host_nbor.begin(), &_max_nbors, &cell_size_cast,`
			`&ncellx, &ncelly, &ncellz, &inum, &nt, &nall);`

			`/* Get the maximum number of nbors and realloc if necessary */`
			`UCL_D_Vec<int> numj;`
			`numj.view_offset(inum,dev_nbor,inum);`
			`ucl_copy(host_acc,numj,inum,false);`
			`if (nt>inum) {`
			`UCL_H_Vec<int> host_offset;`
			`host_offset.view_offset(inum,host_acc,nt-inum);`
			`ucl_copy(host_offset,dev_host_nbor,nt-inum,false);`
			`}`
			`mn=host_acc[0];`
			`for (int i=1; i<nt; i++)`
			`mn=std::max(mn,host_acc[i]);`

			`if (mn>_max_nbors) {`
			`mn=static_cast<int>(static_cast<double>(mn)*1.10);`
			`dev_nbor.clear();`
			`success=success && (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_cell_id,`
			`UCL_READ_ONLY)==UCL_SUCCESS);`
			`_gpu_bytes=dev_nbor.row_bytes();`
			`if (_max_host>0) {`
			`host_nbor.clear();`
			`dev_host_nbor.clear();`
			`success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor,`
			`UCL_RW_OPTIMIZED)==UCL_SUCCESS);`
			`success=success && (dev_host_nbor.alloc((mn+1)*_max_host,`
			`dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);`
			`_gpu_bytes+=dev_host_nbor.row_bytes();`
			`}`
			`if (_alloc_packed) {`
			`dev_packed.clear();`
			`success=success && (dev_packed.alloc((mn+2)_max_atoms,dev,`
			`UCL_READ_ONLY)==UCL_SUCCESS);`
			`_gpu_bytes+=dev_packed.row_bytes();`
			`}`
			`if (!success)`
			`return;`
			`_max_nbors=mn;`
			`time_kernel.stop();`
			`time_kernel.add_to_total();`
			`build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial,`
			`special, success, mn);`
			`return;`
			`}`

			`if (_maxspecial>0) {`
			`const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));`
			`k_special.set_size(GX2,cell_block);`
			`k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),`
			`&atom.dev_tag.begin(), &dev_nspecial.begin(),`
			`&dev_special.begin(), &inum, &nt, &nall);`
			`}`
			`time_kernel.stop();`

			`time_nbor.start();`
			`if (_gpu_host)`
			`ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);`
			`time_nbor.stop();`
			`}`

			`template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>`
			`(const int inum, const int host_inum, const int nall,`
			`PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double boxlo, double boxhi,`
			`int , int , int *, bool &success, int &mn);`