/***************************************************************************
                               lj_gpu_memory.cu
                             -------------------
                               W. Michael Brown

  Global variables for GPU Lennard-Jones Library

 __________________________________________________________________________
    This file is part of the LAMMPS GPU Library
 __________________________________________________________________________

    begin                : Tue Aug 4 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : wmbrown@sandia.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   ----------------------------------------------------------------------- */

#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>

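// Device memory needed per atom: per-atom data plus neighbor storage
// for up to max_nbors neighbors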
template <class numtyp, class acctyp>
int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
  return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
}

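// Largest atom count that fits in PERCENT_GPU_MEMORY of the available
// device memory, capped at MAX_ATOMS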
template <class numtyp, class acctyp>
int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
  int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
                              bytes_per_atom(max_nbors));
  if (matoms>MAX_ATOMS)
    matoms=MAX_ATOMS;
  return matoms;
}

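// Select device me, allocate per-atom and neighbor storage, and copy the
// pair coefficients to the GPU; returns false if the device is unusable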
template <class numtyp, class acctyp>
bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                          double **host_cutsq, double **host_sigma,
                          double **host_epsilon, double **host_lj1,
                          double **host_lj2, double **host_lj3,
                          double **host_lj4, double **host_offset,
                          double *host_special_lj, const int max_nbors,
                          const int me) {
  if (allocated)
    clear();

  if (me>=gpu.num_devices())
    return false;
  gpu.set(me);
  if (gpu.revision()<1.0)
    return false;

  // Initialize timers for the selected GPU
  time_pair.init();

  // Initialize atom and nbor data
  max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
  atom.init(max_atoms);
  nbor.init(ij_size,max_atoms,max_nbors);

  // Get a stream for computing pair potentials
  CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));

  // Use the write buffer from atom for data initialization
  NVC_HostT &host_write=atom.host_write;
  assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);

  // Copy data for bonded interactions
  special_lj.safe_alloc(4);
  special_lj.cast_copy(host_special_lj,host_write);

  // Copy sigma, epsilon, and cutsq onto GPU
  sigma.safe_alloc(ntypes,ntypes,sigma_get_texture<numtyp>());
  sigma.cast_copy(host_sigma[0],host_write);
  epsilon.safe_alloc(ntypes,ntypes,epsilon_get_texture<numtyp>());
  epsilon.cast_copy(host_epsilon[0],host_write);
  cutsq.safe_alloc(ntypes,ntypes,cutsq_get_texture<numtyp>());
  cutsq.cast_copy(host_cutsq[0],host_write);

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
    shared_types=true;
  }
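  // Interleave lj1/lj2 (and below lj3/lj4) for each type pair so they are
  // stored as vec2 values on the device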
  offset.safe_alloc(lj_types,lj_types,offset_get_texture<numtyp>());
  offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes);
  double *t1=host_lj1[0];
  double *t2=host_lj2[0];
  for (int i=0; i<ntypes*ntypes; i++) {
    host_write[i*2]=t1[i];
    host_write[i*2+1]=t2[i];
  }
  lj1.safe_alloc(lj_types,lj_types,lj1_get_texture<numtyp>());
  lj1.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()),
                       ntypes,ntypes);
  t1=host_lj3[0];
  t2=host_lj4[0];
  for (int i=0; i<ntypes*ntypes; i++) {
    host_write[i*2]=t1[i];
    host_write[i*2+1]=t2[i];
  }
  lj3.safe_alloc(lj_types,lj_types,lj3_get_texture<numtyp>());
  lj3.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()),
                       ntypes,ntypes);

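  // Single flag on the device that kernels can set to report errors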
  dev_error.safe_alloc(1);
  dev_error.zero();

  allocated=true;
  return true;
}

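// Free all device and host resources acquired by init()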
template <class numtyp, class acctyp>
void LJ_GPU_MemoryT::clear() {
  if (!allocated)
    return;
  allocated=false;

  // Check for any pair style specific errors here
  int err_flag;
  dev_error.copy_to_host(&err_flag);

  atom.clear();
  nbor.clear();

  CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream));

  dev_error.clear();
  sigma.clear();
  epsilon.clear();
  special_lj.clear();
  cutsq.clear();
  offset.clear();
  lj1.clear();
  lj3.clear();
}

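// Approximate host memory (in bytes) used by this object and its buffers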
template <class numtyp, class acctyp>
double LJ_GPU_MemoryT::host_memory_usage() const {
  return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+
         sizeof(LJ_GPU_Memory<numtyp,acctyp>);
}

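// Explicit instantiation for the numeric and accumulator precisions
// selected at compile time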
template class LJ_GPU_Memory<PRECISION,ACC_PRECISION>;