lammps/lib/cuda/cuda_shared.h

381 lines
9.5 KiB
C

/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"
// Size of the debugdata array: room for this many doubles, or twice as many ints.
#define CUDA_MAX_DEBUG_SIZE 1000
// Handle describing one array kept on the CUDA device.
struct dev_array
{
  void*        dev_data;  // memory address of the data on the CUDA device
  unsigned int dim[3];    // dimensions of the array
};
// Relevant data taken from the atom class.
struct cuda_shared_atom
{
  dev_array dx;            // accumulated distance, used for the binning settings
  dev_array x;             // positions
  dev_array v;             // velocities
  dev_array f;             // forces
  dev_array tag;
  // Atom types; a ghost type equal to ntypes exists (ntypescuda = ntypes+1).
  // NOTE(review): the earlier comment here also read "global ID number", which
  // looks like it was meant for tag -- confirm.
  dev_array type;
  dev_array mask;
  dev_array image;
  dev_array q;             // charges
  dev_array mass;          // masses, one per type
  dev_array rmass;         // masses, one per atom
  dev_array radius;        // radii, one per atom
  dev_array density;
  dev_array omega;
  dev_array torque;
  dev_array molecule;
  dev_array special;
  int       maxspecial;
  dev_array nspecial;
  int*      special_flag;
  int       molecular;

  dev_array eatom;         // energy per atom
  dev_array vatom;         // virial per atom
  int       need_eatom;
  int       need_vatom;

  dev_array x_type;        // position + type packed in an X_FLOAT4 struct
  dev_array v_radius;      // velocity + radius packed in a V_FLOAT4 struct;
                           // currently only used for the granular atom_style
  // NOTE(review): the earlier comment was a verbatim copy of the v_radius one;
  // presumably this packs omega + rmass in a V_FLOAT4 struct -- confirm.
  dev_array omega_rmass;

  double*   mass_host;     // remembered host pointer to the per-type masses

  //int natoms;            // total # of atoms in system, could be 0
  int nghost;              // # of ghost atoms on this proc
  int nlocal;              // # of owned atoms
  int nall;                // total # of atoms on this proc
  int nmax;                // max # of owned+ghost in arrays on this proc
  int ntypes;
  int q_flag;              // non-zero if charges are present
  int rmass_flag;          // non-zero if per-atom masses are present
  int firstgroup;
  int nfirst;

  int update_nlocal;
  int update_nmax;
  int update_neigh;

  dev_array xhold;           // positions at the last neighboring
  X_FLOAT   triggerneighsq;  // maximum squared movement before reneighboring
  int       reneigh_flag;    // whether reneighboring is necessary
  int       maxhold;         // size of xhold
  int       dist_check;      // whether to perform the distance check for reneighboring

  dev_array binned_id;       // id of each binned atom (not the tag!)
  dev_array binned_idnew;    // new id of each binned atom, used for sorting:
                             // essentially puts atom[binned_id[k]] at atom[binned_idnew[k]]
  float     bin_extraspace;
  int       bin_dim[3];
  int       bin_nmax;

  dev_array map_array;
};
// Relevant data taken from the pair class.
struct cuda_shared_pair
{
  char    cudable_force;     // flag; test with (cudable_force != 0)
  X_FLOAT cut_global;
  X_FLOAT cut_inner_global;
  X_FLOAT cut_coul_global;

  double** cut;              // type-type cutoff
  double** cutsq;            // type-type cutoff (squared, going by the name -- TODO confirm)
  double** cut_inner;        // type-type cutoff (earlier comment said "for coul" -- TODO confirm)
  double** cut_coul;         // type-type cutoff for coul

  // type-type pair parameters
  double** coeff1;
  double** coeff2;
  double** coeff3;
  double** coeff4;
  double** coeff5;
  double** coeff6;
  double** coeff7;
  double** coeff8;
  double** coeff9;
  double** coeff10;

  double** offset;
  double*  special_lj;
  double*  special_coul;

  dev_array virial;          // element type: ENERGY_FLOAT
  dev_array eng_vdwl;        // element type: ENERGY_FLOAT
  dev_array eng_coul;        // element type: ENERGY_FLOAT

  X_FLOAT cut_coulsq_global;
  F_FLOAT g_ewald;
  F_FLOAT kappa;
  int     freeze_group_bit;

  dev_array coeff1_gm;
  dev_array coeff2_gm;
  dev_array coeff3_gm;
  dev_array coeff4_gm;
  dev_array coeff5_gm;
  dev_array coeff6_gm;
  dev_array coeff7_gm;
  dev_array coeff8_gm;
  dev_array coeff9_gm;
  dev_array coeff10_gm;

  int  lastgridsize;
  int  n_energy_virial;
  int  collect_forces_later;
  int  use_block_per_atom;
  int  override_block_per_atom;
  bool neighall;
};
// Relevant data taken from the domain class.
struct cuda_shared_domain
{
  // orthogonal box -> sub-box bounds on this proc
  X_FLOAT sublo[3];
  X_FLOAT subhi[3];

  X_FLOAT boxlo[3];
  X_FLOAT boxhi[3];
  X_FLOAT prd[3];

  int periodicity[3];   // periodicity in x, y, z stored as an array

  int     triclinic;
  X_FLOAT xy, xz, yz;

  X_FLOAT boxlo_lamda[3];
  X_FLOAT boxhi_lamda[3];
  X_FLOAT prd_lamda[3];

  X_FLOAT h[6];
  X_FLOAT h_inv[6];
  V_FLOAT h_rate[6];

  int update;
};
// Data shared for PPPM; the FFT work buffers and k-space arrays exist
// only when FFT_CUFFT is defined.
struct cuda_shared_pppm
{
  char cudable_force;

#ifdef FFT_CUFFT
  FFT_FLOAT  *work1, *work2, *work3;
  PPPM_FLOAT *greensfn;
  PPPM_FLOAT *fkx, *fky, *fkz;
  PPPM_FLOAT *vg;
#endif

  int*        part2grid;
  PPPM_FLOAT* density_brick;
  int*        density_brick_int;
  PPPM_FLOAT  density_intScale;
  PPPM_FLOAT *vdx_brick, *vdy_brick, *vdz_brick;
  PPPM_FLOAT* density_fft;

  ENERGY_FLOAT* energy;
  ENERGY_FLOAT* virial;

  // grid index bounds, declared in x, y, z order
  int nxlo_in,  nxhi_in;
  int nxlo_out, nxhi_out;
  int nylo_in,  nyhi_in;
  int nylo_out, nyhi_out;
  int nzlo_in,  nzhi_in;
  int nzlo_out, nzhi_out;

  int nx_pppm, ny_pppm, nz_pppm;

  PPPM_FLOAT qqrd2e;
  int        order;
  // float3 sublo;
  PPPM_FLOAT* rho_coeff;

  int nmax;
  int nlocal;

  PPPM_FLOAT* debugdata;
  PPPM_FLOAT  delxinv, delyinv, delzinv;
  int         nlower;
  int         nupper;
  PPPM_FLOAT  shiftone;
  PPPM_FLOAT3* fH;
};
// Data shared for communication: swap bookkeeping, device-side lists
// and the host/device send and receive buffers.
struct cuda_shared_comm
{
  int maxswap;
  int maxlistlength;

  dev_array pbc;
  dev_array slablo;
  dev_array slabhi;
  dev_array multilo;
  dev_array multihi;
  dev_array sendlist;

  int  grow_flag;
  int  comm_phase;
  int  nsend;
  int* nsend_swap;
  int* send_size;
  int* recv_size;

  double** buf_send;
  void**   buf_send_dev;
  double** buf_recv;
  void**   buf_recv_dev;

  void*  buffer;
  int    buffer_size;
  double overlap_split_ratio;
};
// Member of CudaNeighList; there is no instance of this in cuda_shared_data.
struct cuda_shared_neighlist
{
  int maxlocal;
  int inum;                    // # of I atoms neighbors are stored for
  int inum_border2;
  dev_array inum_border;       // # of atoms which interact with border atoms
  // NOTE(review): "local indices of I atoms" was fused into the inum comment;
  // presumably it describes ilist -- confirm.
  dev_array ilist;
  dev_array ilist_border;

  dev_array numneigh;
  dev_array numneigh_inner;
  dev_array numneigh_border;
  dev_array firstneigh;
  dev_array neighbors;
  dev_array neighbors_border;
  dev_array neighbors_inner;

  int        maxpage;
  dev_array  page_pointers;
  dev_array* pages;
  int        maxneighbors;
  int        neigh_lists_per_page;

  double**    cutneighsq;
  CUDA_FLOAT* cu_cutneighsq;

  int*   binned_id;
  int*   bin_dim;
  int    bin_nmax;
  float  bin_extraspace;
  double maxcut;

  dev_array ex_type;
  int       nex_type;
  dev_array ex1_bit;
  dev_array ex2_bit;
  int       nex_group;
  dev_array ex_mol_bit;
  int       nex_mol;
};
// Used to compare the compile settings (i.e. precision) of the .cu files
// against those of the .cpp files.
struct cuda_compile_settings
{
  int prec_glob;
  int prec_x;
  int prec_v;
  int prec_f;
  int prec_pppm;
  int prec_fft;
  int cufft;
  int arch;
};
// Collection of timing accumulators, grouped by the area they measure.
struct cuda_timings_struct
{
  // ---- debug ----
  double test1;
  double test2;

  // ---- transfers ----
  double transfer_upload_tmp_constr;
  double transfer_download_tmp_deconstr;

  // ---- communication: forward ----
  double comm_forward_total;
  double comm_forward_mpi_upper;
  double comm_forward_mpi_lower;
  double comm_forward_kernel_pack;
  double comm_forward_kernel_unpack;
  double comm_forward_kernel_self;
  double comm_forward_upload;
  double comm_forward_download;

  // ---- communication: exchange ----
  double comm_exchange_total;
  double comm_exchange_mpi;
  double comm_exchange_kernel_pack;
  double comm_exchange_kernel_unpack;
  double comm_exchange_kernel_fill;
  double comm_exchange_cpu_pack;
  double comm_exchange_upload;
  double comm_exchange_download;

  // ---- communication: border ----
  double comm_border_total;
  double comm_border_mpi;
  double comm_border_kernel_pack;
  double comm_border_kernel_unpack;
  double comm_border_kernel_self;
  double comm_border_kernel_buildlist;
  double comm_border_upload;
  double comm_border_download;

  // ---- pair forces ----
  double pair_xtype_conversion;
  double pair_kernel;
  double pair_virial;
  double pair_force_collection;

  // ---- neighbor ----
  double neigh_bin;
  double neigh_build;
  double neigh_special;

  // ---- PPPM ----
  double pppm_particle_map;
  double pppm_make_rho;
  double pppm_brick2fft;
  double pppm_poisson;
  double pppm_fillbrick;
  double pppm_fieldforce;
  double pppm_compute;
};
// Holds space for all relevant data from the different classes.
struct cuda_shared_data
{
  // Temporary GPU data: used inside subroutines and not required to stay
  // consistent outside of the routine that uses it.
  void* buffer;
  int   buffersize;   // maximum size of buffer
  int   buffer_new;   // should be 1 if the pointer to buffer has changed
  void* flag;
  // Array for easily collecting debug data from the device; class cuda
  // contains the corresponding cu_debugdata and host array.
  void* debugdata;

  cuda_shared_atom      atom;
  cuda_shared_pair      pair;
  cuda_shared_domain    domain;
  cuda_shared_pppm      pppm;
  cuda_shared_comm      comm;
  cuda_compile_settings compile_settings;
  cuda_timings_struct   cuda_timings;

  int          exchange_dim;
  int          me;            // MPI rank
  unsigned int datamask;
  int          overlap_comm;
};
#endif // #ifndef _CUDA_SHARED_H_