lammps/lib/gpu/pair_gpu_device.h

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef PAIR_GPU_DEVICE_H
#define PAIR_GPU_DEVICE_H

#include "pair_gpu_atom.h"
#include "pair_gpu_nbor.h"
#include "mpi.h"
#include <sstream>
#include <stdio.h>
#include <string>

template <class numtyp, class acctyp>
class PairGPUDevice {
 public:
  PairGPUDevice();
  ~PairGPUDevice();

  /// Initialize the device for use by this process
  /** Sets up a per-device MPI communicator for load balancing and initializes
    * the device (>=first_gpu and <=last_gpu) that this proc will be using **/
  bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                   const int last_gpu, const int gpu_mode,
                   const double particle_split, const int nthreads);

  /// Initialize the device for Atom and Neighbor storage
  /** \param charge True if charges need to be stored
    * \param rot True if quaternions need to be stored
    * \param nlocal Total number of local particles to allocate memory for
    * \param host_nlocal Initial number of host particles to allocate memory for
    * \param nall Total number of local+ghost particles
    * \param maxspecial Maximum number of special bonded neighbors per atom
    * \param gpu_nbor True if neighboring is performed on the device
    * \param gpu_host 0 if host will not perform force calculations,
    *        1 if gpu_nbor is true and the host needs a half nbor list,
    *        2 if gpu_nbor is true and the host needs a full nbor list
    * \param max_nbors Initial number of rows in the neighbor matrix
    * \param cell_size cutoff+skin
    * \param pre_cut True if the cutoff test will be performed in a separate
    *        kernel from the force kernel **/
  bool init(const bool charge, const bool rot, const int nlocal,
            const int host_nlocal, const int nall, const int maxspecial,
            const bool gpu_nbor, const int gpu_host, const int max_nbors,
            const double cell_size, const bool pre_cut);
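
  /* Typical startup sequence (a minimal sketch, not part of the library):
   * a host pair style would normally call init_device(), then init(), then
   * init_message(). The communicators, device indices, counts, and the
   * 'nlocal', 'nall', 'cutoff', 'skin', and 'screen' variables below are
   * placeholder values for illustration only.
   *
   *   PairGPUDevice<float,double> device;
   *   device.init_device(MPI_COMM_WORLD, MPI_COMM_WORLD, 0, 0,
   *                      device.GPU_FORCE, 1.0, 1);
   *   device.init(false, false, nlocal, 0, nall, 0,
   *               false, 0, 300, cutoff+skin, false);
   *   device.init_message(screen, "lj/cut", 0, 0);
   */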

  /// Output a message for pair_style acceleration with device stats
  void init_message(FILE *screen, const char *name,
                    const int first_gpu, const int last_gpu);

  /// Output a message with timing information
  void output_times(UCL_Timer &time_pair, const double avg_split,
                    const double max_bytes, FILE *screen);

  /// Clear all memory on host and device associated with atom and nbor data
  void clear();

  /// Clear all memory on host and device
  void clear_device();

  /// Start timer on host
  inline void start_host_timer() { _cpu_full=MPI_Wtime(); }

  /// Stop timer on host
  inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }

  /// Return host time
  inline double host_time() { return _cpu_full; }

  /// Return host memory usage in bytes
  double host_memory_usage() const;
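
  /* Timing example (a minimal sketch, not part of the library): the host
   * timer brackets a block of host-side work; MPI_Wtime() is in seconds.
   *
   *   device.start_host_timer();
   *   // ... host-side force or neighbor work ...
   *   device.stop_host_timer();
   *   double seconds = device.host_time();
   */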

  /// Return the number of procs sharing a device (size of device communicator)
  inline int procs_per_gpu() const { return _procs_per_gpu; }

  /// Return the number of threads per proc
  inline int num_threads() const { return _nthreads; }

  /// My rank within all processes
  inline int world_me() const { return _world_me; }

  /// Total number of processes
  inline int world_size() const { return _world_size; }

  /// MPI Barrier for world
  inline void world_barrier() { MPI_Barrier(_comm_world); }

  /// Return the replica MPI communicator
  inline MPI_Comm & replica() { return _comm_replica; }

  /// My rank within replica communicator
  inline int replica_me() const { return _replica_me; }

  /// Number of procs in replica communicator
  inline int replica_size() const { return _replica_size; }

  /// Return the per-GPU MPI communicator
  inline MPI_Comm & gpu_comm() { return _comm_gpu; }

  /// Return my rank in the device communicator
  inline int gpu_rank() const { return _gpu_rank; }

  /// MPI Barrier for gpu
  inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }

  /// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
  inline int gpu_mode() const { return _gpu_mode; }

  /// Index of first device used by a node
  inline int first_device() const { return _first_device; }

  /// Index of last device used by a node
  inline int last_device() const { return _last_device; }

  /// Particle split defined in fix
  inline double particle_split() const { return _particle_split; }

  /// Return the initialization count for the device
  inline int init_count() const { return _init_count; }
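
  /* Work-splitting example (a sketch only): procs attached to the same
   * device can partition host-side work with the accessors above; 'n' is
   * a placeholder item count.
   *
   *   int per_proc = n / device.procs_per_gpu();
   *   int first    = device.gpu_rank() * per_proc;
   *   device.gpu_barrier();   // wait for all procs sharing this GPU
   */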

  // -------------------------- DEVICE DATA -------------------------

  /// Geryon Device
  UCL_Device *gpu;
  enum{GPU_FORCE, GPU_NEIGH};

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  PairGPUAtom<numtyp,acctyp> atom;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor Data
  PairGPUNbor nbor;

 private:
  int _init_count;
  bool _device_init;
  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
      _replica_size;
  int _gpu_mode, _first_device, _last_device, _nthreads;
  double _particle_split;
  double _cpu_full;
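
  /// Format a value as a string with 2 digits of precision (helper used
  /// when assembling screen messages)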
  template <class t>
  inline std::string toa(const t& in) {
    std::ostringstream o;
    o.precision(2);
    o << in;
    return o.str();
  }
};
#endif