lammps/lib/cuda/cuda_shared.h

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"

#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int)

// Descriptor for an array living on the CUDA device: a raw address plus
// up to three extents.
struct dev_array
{
  void* dev_data;       // pointer to memory address on the CUDA device
  unsigned int dim[3];  // array dimensions
};

// Relevant data from the atom class.
// Member order and types define the binary layout of this struct; do not
// reorder members.
struct cuda_shared_atom
{
  // ---- per-atom / per-type arrays ----
  dev_array dx;        // cumulated distance for binning settings
  dev_array x;         // position
  dev_array v;         // velocity
  dev_array f;         // force
  dev_array tag;       // NOTE(review): the "global ID number" remark in the
                       // original sat on `type`; presumably it describes
                       // `tag` -- confirm
  dev_array type;      // original note: there are ghosttype = ntypes
                       // (ntypescuda = ntypes + 1)
  dev_array mask;
  dev_array image;
  dev_array q;         // charges
  dev_array mass;      // per-type masses
  dev_array rmass;     // per-atom masses
  dev_array radius;    // per-atom radius
  dev_array density;
  dev_array omega;
  dev_array torque;
  dev_array molecule;

  // ---- molecular / special-neighbor data ----
  dev_array special;
  int maxspecial;
  dev_array nspecial;
  int* special_flag;
  int molecular;

  // ---- per-atom energy and virial ----
  dev_array eatom;     // per-atom energy
  dev_array vatom;     // per-atom virial
  int need_eatom;
  int need_vatom;

  // ---- packed arrays ----
  dev_array x_type;       // position + type in X_FLOAT4 struct
  dev_array v_radius;     // velocity + radius in V_FLOAT4 struct; currently
                          // only used for granular atom_style
  dev_array omega_rmass;  // NOTE(review): the original comment duplicated the
                          // v_radius text; by its name this presumably packs
                          // omega + rmass -- confirm

  // ---- counts and flags ----
  double* mass_host;   // remember per-type host pointer to masses
  // int natoms;       // (disabled) total # of atoms in system, could be 0
  int nghost;          // # of ghost atoms on this proc
  int nlocal;          // # of owned atoms
  int nall;            // total # of atoms in this proc
  int nmax;            // max # of owned+ghost in arrays on this proc
  int ntypes;
  int q_flag;          // do we have charges?
  int rmass_flag;      // do we have per-atom masses?
  int firstgroup;
  int nfirst;

  int update_nlocal;
  int update_nmax;
  int update_neigh;

  // ---- reneighboring and binning ----
  dev_array xhold;         // position at last neighboring
  X_FLOAT triggerneighsq;  // maximum square movement before reneighboring
  int reneigh_flag;        // is reneighboring necessary
  int maxhold;             // size of xhold
  int dist_check;          // perform distance check for reneighboring
  dev_array binned_id;     // id of each binned atom (not tag!!)
  dev_array binned_idnew;  // new id of each binned atom, for sorting:
                           // basically setting atom[binned_id[k]] at
                           // atom[binned_newid[k]]
  float bin_extraspace;
  int bin_dim[3];
  int bin_nmax;
  dev_array map_array;
};

// Relevant data from the pair class.
// Member order and types define the binary layout of this struct; do not
// reorder members.
struct cuda_shared_pair
{
  char cudable_force;   // check for (cudable_force != 0)

  // ---- global cutoffs ----
  X_FLOAT cut_global;
  X_FLOAT cut_inner_global;
  X_FLOAT cut_coul_global;

  // ---- type-type tables ----
  double** cut;         // type-type cutoff
  double** cutsq;       // type-type cutoff (squared, judging by the name
                        // -- confirm)
  double** cut_inner;   // original comment: "type-type cutoff for coul";
                        // NOTE(review): possibly copied from cut_coul
  double** cut_coul;    // type-type cutoff for coul
  double** coeff1;      // type-type pair parameters
  double** coeff2;
  double** coeff3;
  double** coeff4;
  double** coeff5;
  double** coeff6;
  double** coeff7;
  double** coeff8;
  double** coeff9;
  double** coeff10;
  double** offset;
  double* special_lj;
  double* special_coul;

  // ---- accumulators ----
  dev_array virial;     // ENERGY_FLOAT
  dev_array eng_vdwl;   // ENERGY_FLOAT
  dev_array eng_coul;   // ENERGY_FLOAT

  X_FLOAT cut_coulsq_global;
  F_FLOAT g_ewald;
  F_FLOAT kappa;
  int freeze_group_bit;

  // ---- *_gm counterparts of coeff1..coeff10, held as dev_array ----
  dev_array coeff1_gm;
  dev_array coeff2_gm;
  dev_array coeff3_gm;
  dev_array coeff4_gm;
  dev_array coeff5_gm;
  dev_array coeff6_gm;
  dev_array coeff7_gm;
  dev_array coeff8_gm;
  dev_array coeff9_gm;
  dev_array coeff10_gm;

  // ---- bookkeeping ----
  int lastgridsize;
  int n_energy_virial;
  int collect_forces_later;
  int use_block_per_atom;
  int override_block_per_atom;
  bool neighall;
};

// Relevant data from the domain class.
// Member order and types define the binary layout of this struct; do not
// reorder members.
struct cuda_shared_domain
{
  // orthogonal box -> sub-box bounds on this proc
  X_FLOAT sublo[3];
  X_FLOAT subhi[3];

  X_FLOAT boxlo[3];
  X_FLOAT boxhi[3];
  X_FLOAT prd[3];
  int periodicity[3];   // xyz periodicity as array

  // triclinic-related members
  int triclinic;
  X_FLOAT xy;
  X_FLOAT xz;
  X_FLOAT yz;
  X_FLOAT boxlo_lamda[3];
  X_FLOAT boxhi_lamda[3];
  X_FLOAT prd_lamda[3];
  X_FLOAT h[6];
  X_FLOAT h_inv[6];
  V_FLOAT h_rate[6];

  int update;
};

// Data shared for PPPM.
// NOTE: the members inside the FFT_CUFFT section exist only when that macro
// is defined, so the layout of this struct depends on it. Member order and
// types must not be changed.
struct cuda_shared_pppm
{
  char cudable_force;

#ifdef FFT_CUFFT
  FFT_FLOAT* work1;
  FFT_FLOAT* work2;
  FFT_FLOAT* work3;
  PPPM_FLOAT* greensfn;
  PPPM_FLOAT* fkx;
  PPPM_FLOAT* fky;
  PPPM_FLOAT* fkz;
  PPPM_FLOAT* vg;
#endif

  // ---- grid data ----
  int* part2grid;
  PPPM_FLOAT* density_brick;
  int* density_brick_int;
  PPPM_FLOAT density_intScale;
  PPPM_FLOAT* vdx_brick;
  PPPM_FLOAT* vdy_brick;
  PPPM_FLOAT* vdz_brick;
  PPPM_FLOAT* density_fft;
  ENERGY_FLOAT* energy;
  ENERGY_FLOAT* virial;

  // ---- index bounds per axis (lo/hi, in/out) ----
  int nxlo_in;
  int nxhi_in;
  int nxlo_out;
  int nxhi_out;
  int nylo_in;
  int nyhi_in;
  int nylo_out;
  int nyhi_out;
  int nzlo_in;
  int nzhi_in;
  int nzlo_out;
  int nzhi_out;
  int nx_pppm;
  int ny_pppm;
  int nz_pppm;

  // ---- remaining parameters ----
  PPPM_FLOAT qqrd2e;
  int order;
  // float3 sublo;      // (disabled)
  PPPM_FLOAT* rho_coeff;
  int nmax;
  int nlocal;
  PPPM_FLOAT* debugdata;
  PPPM_FLOAT delxinv;
  PPPM_FLOAT delyinv;
  PPPM_FLOAT delzinv;
  int nlower;
  int nupper;
  PPPM_FLOAT shiftone;
  PPPM_FLOAT3* fH;
};

// Data shared for communication.
// Member order and types define the binary layout of this struct; do not
// reorder members.
struct cuda_shared_comm
{
  int maxswap;
  int maxlistlength;

  dev_array pbc;
  dev_array slablo;
  dev_array slabhi;
  dev_array multilo;
  dev_array multihi;
  dev_array sendlist;

  int grow_flag;
  int comm_phase;

  // ---- send / receive bookkeeping and buffers ----
  int nsend;
  int* nsend_swap;
  int* send_size;
  int* recv_size;
  double** buf_send;
  void** buf_send_dev;
  double** buf_recv;
  void** buf_recv_dev;
  void* buffer;
  int buffer_size;
  double overlap_split_ratio;
};

// Neighbor-list data. Member of CudaNeighList; has no instance in
// cuda_shared_data.
// Member order and types define the binary layout of this struct; do not
// reorder members.
struct cuda_shared_neighlist
{
  int maxlocal;
  int inum;                    // # of I atoms neighbors are stored for
  int inum_border2;
  dev_array inum_border;       // # of atoms which interact with border atoms
  dev_array ilist;             // NOTE(review): the original inum comment
                               // trailed off with "local indices of I atoms",
                               // which presumably belongs here -- confirm
  dev_array ilist_border;
  dev_array numneigh;
  dev_array numneigh_inner;
  dev_array numneigh_border;
  dev_array firstneigh;
  dev_array neighbors;
  dev_array neighbors_border;
  dev_array neighbors_inner;

  // ---- paging ----
  int maxpage;
  dev_array page_pointers;
  dev_array* pages;
  int maxneighbors;
  int neigh_lists_per_page;

  // ---- cutoffs and binning ----
  double** cutneighsq;
  CUDA_FLOAT* cu_cutneighsq;
  int* binned_id;
  int* bin_dim;
  int bin_nmax;
  float bin_extraspace;
  double maxcut;

  // ---- exclusion data (ex_*) with their counts (nex_*) ----
  dev_array ex_type;
  int nex_type;
  dev_array ex1_bit;
  dev_array ex2_bit;
  int nex_group;
  dev_array ex_mol_bit;
  int nex_mol;
};

// Used to compare the compile settings (i.e. precision) of the .cu files
// against those of the .cpp files.
struct cuda_compile_settings
{
  int prec_glob;
  int prec_x;
  int prec_v;
  int prec_f;
  int prec_pppm;
  int prec_fft;
  int cufft;
  int arch;
};

// Collection of timing values, grouped by the part of the code they belong
// to. All members are doubles.
struct cuda_timings_struct
{
  // ---- debug ----
  double test1;
  double test2;

  // ---- transfers ----
  double transfer_upload_tmp_constr;
  double transfer_download_tmp_deconstr;

  // ---- communication: forward ----
  double comm_forward_total;
  double comm_forward_mpi_upper;
  double comm_forward_mpi_lower;
  double comm_forward_kernel_pack;
  double comm_forward_kernel_unpack;
  double comm_forward_kernel_self;
  double comm_forward_upload;
  double comm_forward_download;

  // ---- communication: exchange ----
  double comm_exchange_total;
  double comm_exchange_mpi;
  double comm_exchange_kernel_pack;
  double comm_exchange_kernel_unpack;
  double comm_exchange_kernel_fill;
  double comm_exchange_cpu_pack;
  double comm_exchange_upload;
  double comm_exchange_download;

  // ---- communication: border ----
  double comm_border_total;
  double comm_border_mpi;
  double comm_border_kernel_pack;
  double comm_border_kernel_unpack;
  double comm_border_kernel_self;
  double comm_border_kernel_buildlist;
  double comm_border_upload;
  double comm_border_download;

  // ---- pair forces ----
  double pair_xtype_conversion;
  double pair_kernel;
  double pair_virial;
  double pair_force_collection;

  // ---- neighbor ----
  double neigh_bin;
  double neigh_build;
  double neigh_special;

  // ---- PPPM ----
  double pppm_particle_map;
  double pppm_make_rho;
  double pppm_brick2fft;
  double pppm_poisson;
  double pppm_fillbrick;
  double pppm_fieldforce;
  double pppm_compute;
};

// Holds space for all relevant data from the different classes.
// Member order and types define the binary layout of this struct; do not
// reorder members.
struct cuda_shared_data
{
  void* buffer;      // holds temporary GPU data [data used in subroutines
                     // which does not have to be consistent outside of
                     // that routine]
  int buffersize;    // max size of buffer
  int buffer_new;    // should be 1 if the pointer to buffer has changed
  void* flag;
  void* debugdata;   // array for easily collecting debug data from the
                     // device; class cuda contains the corresponding
                     // cu_debugdata and host array

  // ---- per-class shared sections ----
  cuda_shared_atom atom;
  cuda_shared_pair pair;
  cuda_shared_domain domain;
  cuda_shared_pppm pppm;
  cuda_shared_comm comm;
  cuda_compile_settings compile_settings;
  cuda_timings_struct cuda_timings;

  int exchange_dim;
  int me;            // MPI rank
  unsigned int datamask;
  int overlap_comm;
};


#endif // #ifndef _CUDA_SHARED_H_