lammps/lib/gpu/pair_gpu_atom.cpp

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include "pair_gpu_atom.h"

#define PairGPUAtomT PairGPUAtom<numtyp,acctyp>

#ifdef WINDLL
#include <windows.h>
// Windows DLL builds: the sort routines are not called directly. They are
// looked up by name in gpu.dll at run time (see the PairGPUAtom constructor)
// and invoked through the two function pointers declared below.

// Signature of the sort-storage allocation routine. The bool result is
// presumably true on success -- TODO confirm against the gpu.dll sources.
typedef bool (*__win_sort_alloc)(const int max_atoms);
// Signature of the sort routine. sort_neighbor() passes the cell-id array as
// cell_begin and the particle-id array as particle_begin.
typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin,
                           int *particle_begin);
// File-scope pointers filled in by the constructor via GetProcAddress.
__win_sort_alloc _win_sort_alloc;
__win_sort _win_sort;
#endif

// Construct an empty, unallocated container: no kernels compiled, no buffers
// allocated, energy/virial flags off, no local atoms and no index list.
//
// In non-OpenCL builds the CUDPP sort configuration is filled in here. In
// WINDLL builds the sort entry points are additionally resolved from gpu.dll;
// the process exits with status 1 if the library or either entry point
// cannot be found.
template <class numtyp, class acctyp>
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
                              _vflag(false),_inum(0),_ilist(NULL) {
  #ifndef USE_OPENCL
  // Radix sort of unsigned keys carrying a value with each key
  sort_config.op = CUDPP_ADD;
  sort_config.datatype = CUDPP_UINT;
  sort_config.algorithm = CUDPP_SORT_RADIX;
  sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;

  #ifdef WINDLL
  HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
  if (hinstLib == NULL) {
    printf("\nUnable to load gpu.dll\n");
    exit(1);
  }
  _win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc");
  _win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort");
  // GetProcAddress returns NULL when the export is missing; fail here rather
  // than calling through a NULL pointer later in alloc()/sort_neighbor().
  if (_win_sort_alloc == NULL || _win_sort == NULL) {
    printf("\nUnable to find sort routines in gpu.dll\n");
    exit(1);
  }
  #endif

  #endif
}

// Estimate of the storage, in bytes, needed per atom for the current
// configuration (_gpu_nbor, _rot, _charge).
//
// The base cost is 4 numtyp values plus 11 acctyp values. Rotation adds
// 4 numtyp and 4 acctyp values, and charge adds one numtyp value. When
// neighbor lists are built on the GPU, one cell id and one particle id are
// stored per atom as well.
template <class numtyp, class acctyp>
int PairGPUAtomT::bytes_per_atom() const {
  int id_space=0;
  if (_gpu_nbor) {
    // Two ids per atom; count their size in bytes, not the number of ids.
    // sort_neighbor() treats the cell ids as unsigned and particle ids as int.
    id_space=static_cast<int>(sizeof(unsigned)+sizeof(int));
  }
  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
  if (_rot)
    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
  if (_charge)
    bytes+=sizeof(numtyp);
  return bytes;
}

// Allocate the sort storage and the host and device buffers for up to
// max_atoms atoms, and recompute _gpu_bytes.
//
// Reads _gpu_nbor, _bonds, _charge, _rot, _ev_fields and dev, which init()
// sets before calling this function.
//
// Returns false immediately if the sort storage cannot be set up. Otherwise
// returns true only if every buffer allocation reported UCL_SUCCESS; after
// the first failed allocation the remaining allocations are skipped.
template <class numtyp, class acctyp>
bool PairGPUAtomT::alloc(const int max_atoms) {
  bool success=true;

  // 4 answer values per atom, 4 more when rotation is enabled
  int ans_elements=4;
  if (_rot)
    ans_elements+=4;

  // Ignore host/device transfers?
  bool cpuview=false;
  if (dev->device_type()==UCL_CPU)
    cpuview=true;

  // Allocate storage for CUDPP sort
  #ifndef USE_OPENCL
  #ifdef WINDLL
  // Do not discard the result: report a failed sort allocation to the caller
  // exactly as the non-Windows path below does.
  if (!_win_sort_alloc(max_atoms))
    return false;
  #else
  if (_gpu_nbor) {
    CUDPPResult result = cudppPlan(&sort_plan, sort_config, max_atoms, 1, 0);
    if (CUDPP_SUCCESS != result)
      return false;
  }
  #endif
  #endif

  // --------------------------   Host allocations
  // Get a host write only buffer
  #ifdef GPU_CAST
  // 3 values per atom in the cast position buffer, types kept separately
  success=success && (host_x_cast.alloc(max_atoms*3,*dev,
                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
  success=success && (host_type_cast.alloc(max_atoms,*dev,
                                           UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
  #else
  // 4 values per atom in the position buffer
  success=success && (host_x.alloc(max_atoms*4,*dev,
                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
  #endif
  success=success && (host_ans.alloc(ans_elements*max_atoms,*dev)==UCL_SUCCESS);
  success=success && (host_engv.alloc(_ev_fields*max_atoms,*dev)==UCL_SUCCESS);
  // Charge buffer, one value per atom
  if (_charge)
    success=success && (host_q.alloc(max_atoms,*dev,
                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
  // Quaternion buffer, four values per atom
  if (_rot)
    success=success && (host_quat.alloc(max_atoms*4,*dev,
                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);


  // ---------------------------  Device allocations
  _gpu_bytes=0;
  if (cpuview) {
    // CPU device: device containers become views of the host buffers
    #ifdef GPU_CAST
    // The GPU_CAST path is not supported together with a CPU device
    assert(0==1);
    #else
    dev_x.view(host_x);
    #endif
    dev_engv.view(host_engv);
    dev_ans.view(host_ans);
    if (_rot)
      dev_quat.view(host_quat);
    if (_charge)
      dev_q.view(host_q);
  } else {
    #ifdef GPU_CAST
    success=success && (UCL_SUCCESS==dev_x.alloc(max_atoms*4,*dev));
    success=success && (UCL_SUCCESS==
                        dev_x_cast.alloc(max_atoms*3,*dev,UCL_READ_ONLY));
    success=success && (UCL_SUCCESS==
                        dev_type_cast.alloc(max_atoms,*dev,UCL_READ_ONLY));
    _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
    #else
    success=success && (UCL_SUCCESS==
                        dev_x.alloc(max_atoms*4,*dev,UCL_READ_ONLY));
    #endif
    success=success && (dev_engv.alloc(_ev_fields*max_atoms,*dev,
                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
    success=success && (dev_ans.alloc(ans_elements*max_atoms,
                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
    if (_charge) {
      success=success && (dev_q.alloc(max_atoms,*dev,
                                      UCL_READ_ONLY)==UCL_SUCCESS);
      _gpu_bytes+=dev_q.row_bytes();
    }
    if (_rot) {
      success=success && (dev_quat.alloc(max_atoms*4,*dev,
                                      UCL_READ_ONLY)==UCL_SUCCESS);
      _gpu_bytes+=dev_quat.row_bytes();
    }
  }
  // Id arrays used when the neighbor list is built on the device
  if (_gpu_nbor) {
    success=success && (dev_cell_id.alloc(max_atoms,*dev)==UCL_SUCCESS);
    success=success && (dev_particle_id.alloc(max_atoms,*dev)==UCL_SUCCESS);
    _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
    if (_bonds) {
      success=success && (dev_tag.alloc(max_atoms,*dev)==UCL_SUCCESS);
      _gpu_bytes+=dev_tag.row_bytes();
    }
  }

  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();

  return success;
}

// Release any previous state, record the configuration for this run, size
// the buffers from inum/nall, set up the timers and allocate storage.
//
//   inum     - number of local atoms used to size the buffers
//   nall     - total atom count used to size the buffers when it exceeds inum
//   charge   - store per-atom charges
//   rot      - store per-atom quaternions and torques
//   devi     - device used for all allocations and timers
//   gpu_nbor - neighbor lists are built on the device
//   bonds    - store atom tags (only used together with gpu_nbor)
//
// Returns the result of alloc() for the computed capacity.
template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
                        const bool bonds) {
  clear();

  // Configuration for this run
  dev=&devi;
  _gpu_nbor=gpu_nbor;
  _bonds=bonds;
  _charge=charge;
  _rot=rot;
  _other=_charge || _rot;

  // One energy field, a second one with charges; six virial fields follow
  _e_fields=(_charge ? 2 : 1);
  _ev_fields=6+_e_fields;

  // Capacity: 10% headroom over the requested counts, with a floor of 1000
  // local atoms when inum rounds down to nothing
  int local_cap=static_cast<int>(static_cast<double>(inum)*1.10);
  if (local_cap==0)
    local_cap=1000;
  if (nall>inum)
    _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
  else
    _max_atoms=local_cap*2;

  // Bind the timers to the selected device and reset them
  time_pos.init(*dev);
  time_other.init(*dev);
  time_answer.init(*dev);
  time_pos.zero();
  time_other.zero();
  time_answer.zero();
  _time_cast=0.0;

  #ifdef GPU_CAST
  compile_kernels(*dev);
  #endif

  // Marked allocated before alloc() so that a later clear() releases
  // whatever alloc() managed to create
  _allocated=true;
  return alloc(_max_atoms);
}

// Release every buffer created by alloc() and the CUDPP sort plan, leaving
// timers, flags and compiled kernels untouched. Does nothing when no
// allocation is outstanding.
template <class numtyp, class acctyp>
void PairGPUAtomT::clear_resize() {
  if (_allocated==false)
    return;
  _allocated=false;

  // Positions and the optional per-atom inputs
  dev_x.clear();
  if (_charge) {
    dev_q.clear();
    host_q.clear();
  }
  if (_rot) {
    dev_quat.clear();
    host_quat.clear();
  }

  // Device-side results
  dev_ans.clear();
  dev_engv.clear();

  // Host-side position staging
  #ifdef GPU_CAST
  host_x_cast.clear();
  host_type_cast.clear();
  #else
  host_x.clear();
  #endif

  // Host-side results
  host_ans.clear();
  host_engv.clear();

  // Neighbor-build id arrays
  dev_cell_id.clear();
  dev_particle_id.clear();
  dev_tag.clear();

  #ifdef GPU_CAST
  dev_x_cast.clear();
  dev_type_cast.clear();
  #endif

  // The sort plan only exists for CUDPP builds outside the Windows DLL
  #if !defined(USE_OPENCL) && !defined(WINDLL)
  if (_gpu_nbor)
    cudppDestroyPlan(sort_plan);
  #endif
}

// Return the object to its unallocated state: zero the device byte count
// and, if anything was allocated, release timers, buffers and (GPU_CAST
// builds) the compiled kernel program, and reset the per-step flags.
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
  _gpu_bytes=0;

  if (_allocated) {
    time_pos.clear();
    time_other.clear();
    time_answer.clear();

    clear_resize();

    _inum=0;
    _eflag=false;
    _vflag=false;

    #ifdef GPU_CAST
    if (_compiled) {
      k_cast_x.clear();
      delete atom_program;
      _compiled=false;
    }
    #endif
  }
}

template <class numtyp, class acctyp>
double PairGPUAtomT::host_memory_usage() const {
  int atom_bytes=4;
  if (_charge)
    atom_bytes+=1;
  if (_rot)
    atom_bytes+=4;
  int ans_bytes=atom_bytes+_ev_fields;
  return _max_atoms*atom_bytes*sizeof(numtyp)+
         ans_bytes*(_max_atoms)*sizeof(acctyp)+
         sizeof(PairGPUAtom<numtyp,acctyp>);
}

// Record the energy/virial flags for this step and copy the device results
// for the _inum local atoms into the host buffers, timed by time_answer.
//
//   eflag / vflag     - energy / virial fields are requested this step
//   ef_atom / vf_atom - stored for energy_virial(), which uses them to decide
//                       whether to fill the per-atom arrays
template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
                                const bool ef_atom, const bool vf_atom) {
  time_answer.start();

  _eflag=eflag;
  _vflag=vflag;
  _ef_atom=ef_atom;
  _vf_atom=vf_atom;

  // Number of energy/virial fields per atom that were actually requested
  const int csize=_ev_fields-(eflag ? 0 : _e_fields)-(vflag ? 0 : 6);
  if (csize>0)
    ucl_copy(host_engv,dev_engv,_inum*csize,true);

  // Four answer values per atom, doubled when rotation is enabled
  if (_rot)
    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
  else
    ucl_copy(host_ans,dev_ans,_inum*4,true);

  time_answer.stop();
}

template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
                                const bool ef_atom, const bool vf_atom,
                                int *ilist) {
  _ilist=ilist;
  copy_answers(eflag,vflag,ef_atom,vf_atom);
}

// Reduce the per-atom energy/virial results in host_engv.
//
// Field k of result slot i is read from host_engv[i + k*_inum]. The energy
// field comes first when _eflag is set, followed by six virial fields when
// _vflag is set. Slot i maps to atom i when neighbor lists are built on the
// device, otherwise to atom _ilist[i].
//
//   eatom  - per-atom energy; half of each atom's energy is added when
//            _eflag and _ef_atom are set
//   vatom  - per-atom virial; half of each component is added when _vflag
//            and _vf_atom are set
//   virial - six totals; half of the summed components is added when _vflag
//            is set. Existing contents are kept and not rescaled.
//
// Returns half of the summed energy, or 0.0 when neither flag is set.
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
                                   double *virial) {
  if (_eflag==false && _vflag==false)
    return 0.0;

  double evdwl=0.0;
  // Sum locally so the 0.5 factor applies to this call's contribution only
  double virial_acc[6]={0.0,0.0,0.0,0.0,0.0,0.0};

  for (int i=0; i<_inum; i++) {
    acctyp *ap=host_engv.begin()+i;
    const int ii=_gpu_nbor ? i : _ilist[i];
    if (_eflag) {
      evdwl+=*ap;
      if (_ef_atom)
        eatom[ii]+=*ap*0.5;
      ap+=_inum;
    }
    if (_vflag) {
      for (int j=0; j<6; j++) {
        if (_vf_atom)
          vatom[ii][j]+=*ap*0.5;
        virial_acc[j]+=*ap;
        ap+=_inum;
      }
    }
  }

  if (_vflag)
    for (int j=0; j<6; j++)
      virial[j]+=virial_acc[j]*0.5;

  evdwl*=0.5;
  return evdwl;
}

// Reduce the per-atom energy/virial results in host_engv, with the Coulomb
// energy reported separately.
//
// Field k of result slot i is read from host_engv[i + k*_inum]. With charges
// enabled and _eflag set, two energy fields come first (the one returned,
// then the Coulomb one), followed by six virial fields when _vflag is set.
// Slot i maps to atom i when neighbor lists are built on the device,
// otherwise to atom _ilist[i].
//
//   eatom  - per-atom energy; half of both energy fields is added when
//            _eflag and _ef_atom are set
//   vatom  - per-atom virial; half of each component is added when _vflag
//            and _vf_atom are set
//   virial - six totals; half of the summed components is added when _vflag
//            is set. Existing contents are kept and not rescaled.
//   ecoul  - set to 0.0 when neither flag is set; left untouched when
//            charges are disabled; otherwise half of the summed Coulomb
//            energy is added to it
//
// Returns half of the summed first energy field, or 0.0 when neither flag
// is set. Without charges this forwards to the three-argument overload.
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
                                   double *virial, double &ecoul) {
  if (_eflag==false && _vflag==false) {
    ecoul=0.0;
    return 0.0;
  }

  if (_charge==false)
    return energy_virial(eatom,vatom,virial);

  double evdwl=0.0;
  double _ecoul=0.0;
  // Sum locally so the 0.5 factor applies to this call's contribution only
  double virial_acc[6]={0.0,0.0,0.0,0.0,0.0,0.0};

  for (int i=0; i<_inum; i++) {
    acctyp *ap=host_engv.begin()+i;
    const int ii=_gpu_nbor ? i : _ilist[i];
    if (_eflag) {
      evdwl+=*ap;
      if (_ef_atom)
        eatom[ii]+=*ap*0.5;
      ap+=_inum;
      _ecoul+=*ap;
      if (_ef_atom)
        eatom[ii]+=*ap*0.5;
      ap+=_inum;
    }
    if (_vflag) {
      for (int j=0; j<6; j++) {
        if (_vf_atom)
          vatom[ii][j]+=*ap*0.5;
        virial_acc[j]+=*ap;
        ap+=_inum;
      }
    }
  }

  if (_vflag)
    for (int j=0; j<6; j++)
      virial[j]+=virial_acc[j]*0.5;

  evdwl*=0.5;
  ecoul+=_ecoul*0.5;
  return evdwl;
}

// Add the results in host_ans to the caller's force (and torque) arrays.
//
// host_ans holds four values per result slot, of which the first three are
// used: all _inum force entries come first, followed by _inum torque
// entries when rotation is enabled. Slot i maps to atom i when neighbor
// lists are built on the device, otherwise to atom _ilist[i].
//
//   f   - per-atom forces, incremented in place
//   tor - per-atom torques, incremented in place; only touched when _rot
template <class numtyp, class acctyp>
void PairGPUAtomT::get_answers(double **f, double **tor) {
  acctyp *ap=host_ans.begin();

  for (int slot=0; slot<_inum; slot++) {
    const int atom=_gpu_nbor ? slot : _ilist[slot];
    f[atom][0]+=ap[0];
    f[atom][1]+=ap[1];
    f[atom][2]+=ap[2];
    ap+=4;   // fourth value of each entry is skipped
  }

  if (!_rot)
    return;

  for (int slot=0; slot<_inum; slot++) {
    const int atom=_gpu_nbor ? slot : _ilist[slot];
    tor[atom][0]+=ap[0];
    tor[atom][1]+=ap[1];
    tor[atom][2]+=ap[2];
    ap+=4;
  }
}

// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void PairGPUAtomT::sort_neighbor(const int num_atoms) {
  #ifndef USE_OPENCL
  #ifdef WINDLL
  _win_sort(num_atoms,(unsigned *)dev_cell_id.begin(),
            (int *)dev_particle_id.begin());
  #else
  CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
                                 (int *)dev_particle_id.begin(),
                                 8*sizeof(unsigned), num_atoms);
  if (CUDPP_SUCCESS != result) {
    printf("Error in cudppSort\n");
    NVD_GERYON_EXIT;
  }
  #endif
  #endif
}

#ifdef GPU_CAST
#ifdef USE_OPENCL
#include "pair_gpu_atom_cl.h"
#else
#include "pair_gpu_atom_ptx.h"
#endif

// Build the atom kernel program from the embedded source string and bind
// the position-cast kernel. Only compiled in GPU_CAST builds.
// Note: the parameter dev hides the member of the same name in this function.
template <class numtyp, class acctyp>
void PairGPUAtomT::compile_kernels(UCL_Device &dev) {
  atom_program=new UCL_Program(dev);   // released in clear()
  UCL_Program &program=*atom_program;
  program.load_string(pair_gpu_atom_kernel,"");
  k_cast_x.set_function(program,"kernel_cast_x");
  _compiled=true;
}

#endif

// Explicit instantiation for the precision pair selected through the
// PRECISION / ACC_PRECISION macros, which are defined outside this file.
template class PairGPUAtom<PRECISION,ACC_PRECISION>;