lammps/lib/gpu/lal_pppm.h

/***************************************************************************
                                   pppm.h
                             -------------------
                            W. Michael Brown (ORNL)

  Class for PPPM acceleration

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                : 
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef LAL_PPPM_H
#define LAL_PPPM_H

#include "mpi.h"
#include "lal_device.h"

#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

namespace LAMMPS_AL {

template <class numtyp, class acctyp> class Device;

template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
class PPPM {
 public:
  PPPM();
  virtual ~PPPM();

  /// Clear any previous data and set up for a new LAMMPS run
  /** Success will be:
    * -  0 if successfull
    * - -1 if fix gpu not found
    * - -2 if GPU could not be found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order,
                const int nxlo_out, const int nylo_out, const int nzlo_out,
                const int nxhi_out, const int nyhi_out, const int nzhi_out,
                grdtyp **rho_coeff, grdtyp **vd_brick, 
                const double slab_volfactor, const int nx_pppm, 
                const int ny_pppm, const int nz_pppm, int &success);

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(nall, success)) {
      pos_tex.bind_float(atom->dev_x,4);
      q_tex.bind_float(atom->dev_q,1);
    }
    ans->resize(inum,success);
  }

  /// Check if there is enough storage for local atoms and realloc if not
  inline void resize_local(const int inum, bool &success) {
  }
  
  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear(const double cpu_time);

  /// Returns memory usage on device per atom
  int bytes_per_atom() const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  /// Accumulate timers
  inline void acc_timers() {
    if (device->time_device()) {
      ans->acc_timers();
      time_in.add_to_total();
      time_out.add_to_total();
      time_map.add_to_total();
      time_rho.add_to_total();
      time_interp.add_to_total();
    }
  }

  /// Zero timers
  inline void zero_timers() {
    atom->zero_timers();
    ans->zero_timers();
    time_in.zero();
    time_out.zero();
    time_map.zero();
    time_rho.zero();
    time_interp.zero();
  }

  /// Precomputations for charge assignment that can be done asynchronously
  inline void precompute(const int ago, const int nlocal, const int nall,
                         double **host_x, int *host_type, bool &success,
                         double *charge, double *boxlo, double *prd) {
    double delxinv=_nx_pppm/prd[0];
    double delyinv=_ny_pppm/prd[1];
    double delzinv=_nz_pppm/(prd[2]*_slab_volfactor);
    _precompute(ago,nlocal,nall,host_x,host_type,success,charge,boxlo,delxinv,
                delyinv,delzinv);
  }

  /// Returns non-zero if out of bounds atoms
  int spread(const int ago, const int nlocal, const int nall, double **host_x,
             int *host_type, bool &success, double *charge, double *boxlo,
             const double delxinv, const double delyinv, const double delzinv);

  void interp(const grdtyp qqrd2e_scale);

  // -------------------------- DEVICE DATA ------------------------- 

  /// Device Properties and Atom and Neighbor storage
  Device<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_in, time_out, time_map, time_rho, time_interp;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> *atom;


  // --------------------------- GRID DATA --------------------------

  UCL_H_Vec<grdtyp> h_brick, h_vd_brick;
  UCL_D_Vec<grdtyp> d_brick;
  
  // Count of number of atoms assigned to each grid point
  UCL_D_Vec<int> d_brick_counts;
  // Atoms assigned to each grid point
  UCL_D_Vec<grdtyp4> d_brick_atoms;
  
  // Error checking for out of bounds atoms
  UCL_D_Vec<int> d_error_flag;
  UCL_H_Vec<int> h_error_flag;
  
  // Number of grid points in brick (including ghost)
  int _npts_x, _npts_y, _npts_z, _npts_yx;
  
  // Number of local grid points in brick
  int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride;
  
  // -------------------------- SPLINE DATA -------------------------
  UCL_D_Vec<grdtyp> d_rho_coeff;
  int _order, _nlower, _nupper, _order_m_1, _order2;
  int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  Answer<numtyp,acctyp> *ans;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pppm_program;
  UCL_Kernel k_particle_map, k_make_rho, k_interp;
  inline int block_size() { return _block_size; }

  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex;
  UCL_Texture q_tex;

 protected:
  bool _allocated, _compiled, _precompute_done;
  int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms;
  double  _max_bytes, _max_an_bytes;
  double _cpu_idle_time;
  
  grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; 

  double _slab_volfactor;
  int _nx_pppm, _ny_pppm, _nz_pppm;
  
  void compile_kernels(UCL_Device &dev);
  void _precompute(const int ago, const int nlocal, const int nall,
                   double **host_x, int *host_type, bool &success,
                   double *charge, double *boxlo, const double delxinv,
                   const double delyinv, const double delzinv);
};

}

#endif
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7281 f3b2605a-c512-4ea7-a41b-209d697bcdaa 2011-12-03 00:02:36 +08:00			`/***************************************************************************`
			`pppm.h`
			`-------------------`
			`W. Michael Brown (ORNL)`

			`Class for PPPM acceleration`

			`__________________________________________________________________________`
			`This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)`
			`__________________________________________________________________________`

			`begin :`
			`email : brownw@ornl.gov`
			`***************************************************************************/`

			`#ifndef LAL_PPPM_H`
			`#define LAL_PPPM_H`

			`#include "mpi.h"`
			`#include "lal_device.h"`

			`#ifdef USE_OPENCL`
			`#include "geryon/ocl_texture.h"`
			`#else`
			`#include "geryon/nvd_texture.h"`
			`#endif`

			`namespace LAMMPS_AL {`

			`template <class numtyp, class acctyp> class Device;`

			`template <class numtyp, class acctyp, class grdtyp, class grdtyp4>`
			`class PPPM {`
			`public:`
			`PPPM();`
			`virtual ~PPPM();`

			`/// Clear any previous data and set up for a new LAMMPS run`
			`/** Success will be:`
			`* - 0 if successfull`
			`* - -1 if fix gpu not found`
			`* - -2 if GPU could not be found`
			`* - -3 if there is an out of memory error`
			`* - -4 if the GPU library was not compiled for GPU`
			`* - -5 Double precision is not supported on card **/`
			`grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order,`
			`const int nxlo_out, const int nylo_out, const int nzlo_out,`
			`const int nxhi_out, const int nyhi_out, const int nzhi_out,`
			`grdtyp rho_coeff, grdtyp vd_brick,`
			`const double slab_volfactor, const int nx_pppm,`
			`const int ny_pppm, const int nz_pppm, int &success);`

			`/// Check if there is enough storage for atom arrays and realloc if not`
			`/ \param success set to false if insufficient memory /`
			`inline void resize_atom(const int inum, const int nall, bool &success) {`
			`if (atom->resize(nall, success)) {`
			`pos_tex.bind_float(atom->dev_x,4);`
			`q_tex.bind_float(atom->dev_q,1);`
			`}`
			`ans->resize(inum,success);`
			`}`

			`/// Check if there is enough storage for local atoms and realloc if not`
			`inline void resize_local(const int inum, bool &success) {`
			`}`

			`/// Clear all host and device data`
			`/ \note This is called at the beginning of the init() routine /`
			`void clear(const double cpu_time);`

			`/// Returns memory usage on device per atom`
			`int bytes_per_atom() const;`

			`/// Total host memory used by library for pair style`
			`double host_memory_usage() const;`

			`/// Accumulate timers`
			`inline void acc_timers() {`
			`if (device->time_device()) {`
			`ans->acc_timers();`
			`time_in.add_to_total();`
			`time_out.add_to_total();`
			`time_map.add_to_total();`
			`time_rho.add_to_total();`
			`time_interp.add_to_total();`
			`}`
			`}`

			`/// Zero timers`
			`inline void zero_timers() {`
			`atom->zero_timers();`
			`ans->zero_timers();`
			`time_in.zero();`
			`time_out.zero();`
			`time_map.zero();`
			`time_rho.zero();`
			`time_interp.zero();`
			`}`

			`/// Precomputations for charge assignment that can be done asynchronously`
			`inline void precompute(const int ago, const int nlocal, const int nall,`
			`double *host_x, int host_type, bool &success,`
			`double charge, double boxlo, double *prd) {`
			`double delxinv=_nx_pppm/prd[0];`
			`double delyinv=_ny_pppm/prd[1];`
			`double delzinv=_nz_pppm/(prd[2]*_slab_volfactor);`
			`_precompute(ago,nlocal,nall,host_x,host_type,success,charge,boxlo,delxinv,`
			`delyinv,delzinv);`
			`}`

			`/// Returns non-zero if out of bounds atoms`
			`int spread(const int ago, const int nlocal, const int nall, double **host_x,`
			`int host_type, bool &success, double charge, double *boxlo,`
			`const double delxinv, const double delyinv, const double delzinv);`

			`void interp(const grdtyp qqrd2e_scale);`

			`// -------------------------- DEVICE DATA -------------------------`

			`/// Device Properties and Atom and Neighbor storage`
			`Device<numtyp,acctyp> *device;`

			`/// Geryon device`
			`UCL_Device *ucl_device;`

			`/// Device Timers`
			`UCL_Timer time_in, time_out, time_map, time_rho, time_interp;`

			`/// LAMMPS pointer for screen output`
			`FILE *screen;`

			`// --------------------------- ATOM DATA --------------------------`

			`/// Atom Data`
			`Atom<numtyp,acctyp> *atom;`


			`// --------------------------- GRID DATA --------------------------`

			`UCL_H_Vec<grdtyp> h_brick, h_vd_brick;`
			`UCL_D_Vec<grdtyp> d_brick;`

			`// Count of number of atoms assigned to each grid point`
			`UCL_D_Vec<int> d_brick_counts;`
			`// Atoms assigned to each grid point`
			`UCL_D_Vec<grdtyp4> d_brick_atoms;`

			`// Error checking for out of bounds atoms`
			`UCL_D_Vec<int> d_error_flag;`
			`UCL_H_Vec<int> h_error_flag;`

			`// Number of grid points in brick (including ghost)`
			`int _npts_x, _npts_y, _npts_z, _npts_yx;`

			`// Number of local grid points in brick`
			`int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride;`

			`// -------------------------- SPLINE DATA -------------------------`
			`UCL_D_Vec<grdtyp> d_rho_coeff;`
			`int _order, _nlower, _nupper, _order_m_1, _order2;`
			`int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out;`

			`// ------------------------ FORCE/ENERGY DATA -----------------------`

			`Answer<numtyp,acctyp> *ans;`

			`// ------------------------- DEVICE KERNELS -------------------------`
			`UCL_Program *pppm_program;`
			`UCL_Kernel k_particle_map, k_make_rho, k_interp;`
			`inline int block_size() { return _block_size; }`

			`// --------------------------- TEXTURES -----------------------------`
			`UCL_Texture pos_tex;`
			`UCL_Texture q_tex;`

			`protected:`
			`bool _allocated, _compiled, _precompute_done;`
			`int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms;`
			`double _max_bytes, _max_an_bytes;`
			`double _cpu_idle_time;`

			`grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv;`

			`double _slab_volfactor;`
			`int _nx_pppm, _ny_pppm, _nz_pppm;`

			`void compile_kernels(UCL_Device &dev);`
			`void _precompute(const int ago, const int nlocal, const int nall,`
			`double *host_x, int host_type, bool &success,`
			`double charge, double boxlo, const double delxinv,`
			`const double delyinv, const double delzinv);`
			`};`

			`}`

			`#endif`