/* Source listing: forked from lijiext/lammps (371 lines, 9.6 KiB, C). */
/* ----------------------------------------------------------------------
|
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
|
|
|
Original Version:
|
|
http://lammps.sandia.gov, Sandia National Laboratories
|
|
Steve Plimpton, sjplimp@sandia.gov
|
|
|
|
See the README file in the top-level LAMMPS directory.
|
|
|
|
-----------------------------------------------------------------------
|
|
|
|
USER-CUDA Package and associated modifications:
|
|
https://sourceforge.net/projects/lammpscuda/
|
|
|
|
Christian Trott, christian.trott@tu-ilmenau.de
|
|
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
|
Theoretical Physics II, University of Technology Ilmenau, Germany
|
|
|
|
See the README file in the USER-CUDA directory.
|
|
|
|
This software is distributed under the GNU General Public License.
|
|
------------------------------------------------------------------------- */
|
|
|
|
#ifndef _CUDA_SHARED_H_
|
|
#define _CUDA_SHARED_H_
|
|
#include "cuda_precision.h"
|
|
|
|
#define CUDA_MAX_DEBUG_SIZE 1000 // size of the debugdata array (room for this many doubles, or twice as many ints)
|
|
|
|
// Descriptor of an array living on the CUDA device: a raw device address
// plus up to three dimension extents. Plain aggregate; member order is part
// of the layout and must not change.
struct dev_array {
  void* dev_data;   // pointer to memory address on cuda device
  unsigned dim[3];  // array dimensions
};
|
|
|
|
struct cuda_shared_atom { // relevent data from atom class
|
|
dev_array dx; // cumulated distance for binning settings
|
|
dev_array x; // position
|
|
dev_array v; // velocity
|
|
dev_array f; // force
|
|
dev_array tag;
|
|
dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1)
|
|
dev_array mask;
|
|
dev_array image;
|
|
dev_array q; // charges
|
|
dev_array mass; // per-type masses
|
|
dev_array rmass; // per-atom masses
|
|
dev_array radius; // per-atom radius
|
|
dev_array density;
|
|
dev_array omega;
|
|
dev_array torque;
|
|
dev_array molecule;
|
|
|
|
dev_array special;
|
|
int maxspecial;
|
|
dev_array nspecial;
|
|
int* special_flag;
|
|
int molecular;
|
|
|
|
dev_array eatom; // per-atom energy
|
|
dev_array vatom; // per-atom virial
|
|
int need_eatom;
|
|
int need_vatom;
|
|
|
|
dev_array x_type; // position + type in X_FLOAT4 struct
|
|
dev_array v_radius; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style
|
|
dev_array omega_rmass; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style
|
|
|
|
double* mass_host; // remember per-type host pointer to masses
|
|
//int natoms; // total # of atoms in system, could be 0
|
|
int nghost; // and ghost atoms on this proc
|
|
int nlocal; // # of owned
|
|
int nall; // total # of atoms in this proc
|
|
int nmax; // max # of owned+ghost in arrays on this proc
|
|
int ntypes;
|
|
int q_flag; // do we have charges?
|
|
int rmass_flag; // do we have per-atom masses?
|
|
int firstgroup;
|
|
int nfirst;
|
|
|
|
int update_nlocal;
|
|
int update_nmax;
|
|
int update_neigh;
|
|
|
|
dev_array xhold; // position at last neighboring
|
|
X_FLOAT triggerneighsq; // maximum square movement before reneighboring
|
|
int reneigh_flag; // is reneighboring necessary
|
|
int maxhold; // size of xhold
|
|
int dist_check; //perform distance check for reneighboring
|
|
dev_array binned_id; //id of each binned atom (not tag!!)
|
|
dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]]
|
|
float bin_extraspace;
|
|
int bin_dim[3];
|
|
int bin_nmax;
|
|
dev_array map_array;
|
|
};
|
|
|
|
struct cuda_shared_pair { // relevent data from pair class
|
|
char cudable_force; // check for (cudable_force!=0)
|
|
X_FLOAT cut_global;
|
|
X_FLOAT cut_inner_global;
|
|
X_FLOAT cut_coul_global;
|
|
double** cut; // type-type cutoff
|
|
double** cutsq; // type-type cutoff
|
|
double** cut_inner; // type-type cutoff for coul
|
|
double** cut_coul; // type-type cutoff for coul
|
|
double** coeff1; // tpye-type pair parameters
|
|
double** coeff2;
|
|
double** coeff3;
|
|
double** coeff4;
|
|
double** coeff5;
|
|
double** coeff6;
|
|
double** coeff7;
|
|
double** coeff8;
|
|
double** coeff9;
|
|
double** coeff10;
|
|
double** offset;
|
|
double* special_lj;
|
|
double* special_coul;
|
|
dev_array virial; // ENERGY_FLOAT
|
|
dev_array eng_vdwl; // ENERGY_FLOAT
|
|
dev_array eng_coul; // ENERGY_FLOAT
|
|
X_FLOAT cut_coulsq_global;
|
|
F_FLOAT g_ewald, kappa;
|
|
int freeze_group_bit;
|
|
|
|
dev_array coeff1_gm;
|
|
dev_array coeff2_gm;
|
|
dev_array coeff3_gm;
|
|
dev_array coeff4_gm;
|
|
dev_array coeff5_gm;
|
|
dev_array coeff6_gm;
|
|
dev_array coeff7_gm;
|
|
dev_array coeff8_gm;
|
|
dev_array coeff9_gm;
|
|
dev_array coeff10_gm;
|
|
|
|
int lastgridsize;
|
|
int n_energy_virial;
|
|
int collect_forces_later;
|
|
int use_block_per_atom;
|
|
int override_block_per_atom;
|
|
bool neighall;
|
|
|
|
};
|
|
|
|
struct cuda_shared_domain { // relevent data from domain class
|
|
X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
|
|
X_FLOAT subhi[3];
|
|
X_FLOAT boxlo[3];
|
|
X_FLOAT boxhi[3];
|
|
X_FLOAT prd[3];
|
|
int periodicity[3]; // xyz periodicity as array
|
|
|
|
int triclinic;
|
|
X_FLOAT xy;
|
|
X_FLOAT xz;
|
|
X_FLOAT yz;
|
|
X_FLOAT boxlo_lamda[3];
|
|
X_FLOAT boxhi_lamda[3];
|
|
X_FLOAT prd_lamda[3];
|
|
X_FLOAT h[6];
|
|
X_FLOAT h_inv[6];
|
|
V_FLOAT h_rate[6];
|
|
int update;
|
|
};
|
|
|
|
struct cuda_shared_pppm {
|
|
char cudable_force;
|
|
#ifdef FFT_CUFFT
|
|
FFT_FLOAT* work1;
|
|
FFT_FLOAT* work2;
|
|
FFT_FLOAT* work3;
|
|
PPPM_FLOAT* greensfn;
|
|
PPPM_FLOAT* fkx;
|
|
PPPM_FLOAT* fky;
|
|
PPPM_FLOAT* fkz;
|
|
PPPM_FLOAT* vg;
|
|
#endif
|
|
int* part2grid;
|
|
PPPM_FLOAT* density_brick;
|
|
int* density_brick_int;
|
|
PPPM_FLOAT density_intScale;
|
|
PPPM_FLOAT* vdx_brick;
|
|
PPPM_FLOAT* vdy_brick;
|
|
PPPM_FLOAT* vdz_brick;
|
|
PPPM_FLOAT* density_fft;
|
|
ENERGY_FLOAT* energy;
|
|
ENERGY_FLOAT* virial;
|
|
int nxlo_in;
|
|
int nxhi_in;
|
|
int nxlo_out;
|
|
int nxhi_out;
|
|
int nylo_in;
|
|
int nyhi_in;
|
|
int nylo_out;
|
|
int nyhi_out;
|
|
int nzlo_in;
|
|
int nzhi_in;
|
|
int nzlo_out;
|
|
int nzhi_out;
|
|
int nx_pppm;
|
|
int ny_pppm;
|
|
int nz_pppm;
|
|
PPPM_FLOAT qqrd2e;
|
|
int order;
|
|
// float3 sublo;
|
|
PPPM_FLOAT* rho_coeff;
|
|
int nmax;
|
|
int nlocal;
|
|
PPPM_FLOAT* debugdata;
|
|
PPPM_FLOAT delxinv;
|
|
PPPM_FLOAT delyinv;
|
|
PPPM_FLOAT delzinv;
|
|
int nlower;
|
|
int nupper;
|
|
PPPM_FLOAT shiftone;
|
|
PPPM_FLOAT3* fH;
|
|
};
|
|
|
|
struct cuda_shared_comm {
|
|
int maxswap;
|
|
int maxlistlength;
|
|
dev_array pbc;
|
|
dev_array slablo;
|
|
dev_array slabhi;
|
|
dev_array multilo;
|
|
dev_array multihi;
|
|
dev_array sendlist;
|
|
int grow_flag;
|
|
int comm_phase;
|
|
|
|
int nsend;
|
|
int* nsend_swap;
|
|
int* send_size;
|
|
int* recv_size;
|
|
double** buf_send;
|
|
void** buf_send_dev;
|
|
double** buf_recv;
|
|
void** buf_recv_dev;
|
|
void* buffer;
|
|
int buffer_size;
|
|
double overlap_split_ratio;
|
|
};
|
|
|
|
struct cuda_shared_neighlist { // member of CudaNeighList, has no instance in cuda_shared_data
|
|
int maxlocal;
|
|
int inum; // # of I atoms neighbors are stored for local indices of I atoms
|
|
int inum_border2;
|
|
dev_array inum_border; // # of atoms which interact with border atoms
|
|
dev_array ilist;
|
|
dev_array ilist_border;
|
|
dev_array numneigh;
|
|
dev_array numneigh_inner;
|
|
dev_array numneigh_border;
|
|
dev_array firstneigh;
|
|
dev_array neighbors;
|
|
dev_array neighbors_border;
|
|
dev_array neighbors_inner;
|
|
int maxpage;
|
|
dev_array page_pointers;
|
|
dev_array* pages;
|
|
int maxneighbors;
|
|
int neigh_lists_per_page;
|
|
double** cutneighsq;
|
|
CUDA_FLOAT* cu_cutneighsq;
|
|
int* binned_id;
|
|
int* bin_dim;
|
|
int bin_nmax;
|
|
float bin_extraspace;
|
|
double maxcut;
|
|
dev_array ex_type;
|
|
int nex_type;
|
|
dev_array ex1_bit;
|
|
dev_array ex2_bit;
|
|
int nex_group;
|
|
dev_array ex_mol_bit;
|
|
int nex_mol;
|
|
|
|
};
|
|
|
|
// Used to compare compile settings (i.e. precision) of the cu files and the
// cpp files. Plain aggregate of ints; member order is part of the layout and
// must not change.
struct cuda_compile_settings {
  int prec_glob;
  int prec_x;
  int prec_v;
  int prec_f;
  int prec_pppm;
  int prec_fft;
  int cufft;
  int arch;
};
|
|
|
|
// Collection of timing accumulators, grouped by the section comments below.
// Plain aggregate of doubles; member order is part of the layout and must
// not change.
struct cuda_timings_struct {
  //Debug:
  double test1;
  double test2;

  //transfers
  double transfer_upload_tmp_constr;
  double transfer_download_tmp_deconstr;

  //communication
  double comm_forward_total;
  double comm_forward_mpi_upper;
  double comm_forward_mpi_lower;
  double comm_forward_kernel_pack;
  double comm_forward_kernel_unpack;
  double comm_forward_kernel_self;
  double comm_forward_upload;
  double comm_forward_download;

  double comm_exchange_total;
  double comm_exchange_mpi;
  double comm_exchange_kernel_pack;
  double comm_exchange_kernel_unpack;
  double comm_exchange_kernel_fill;
  double comm_exchange_cpu_pack;
  double comm_exchange_upload;
  double comm_exchange_download;

  double comm_border_total;
  double comm_border_mpi;
  double comm_border_kernel_pack;
  double comm_border_kernel_unpack;
  double comm_border_kernel_self;
  double comm_border_kernel_buildlist;
  double comm_border_upload;
  double comm_border_download;

  //pair forces
  double pair_xtype_conversion;
  double pair_kernel;
  double pair_virial;
  double pair_force_collection;

  //neighbor
  double neigh_bin;
  double neigh_build;
  double neigh_special;

  //PPPM
  double pppm_particle_map;
  double pppm_make_rho;
  double pppm_brick2fft;
  double pppm_poisson;
  double pppm_fillbrick;
  double pppm_fieldforce;
  double pppm_compute;
};
|
|
|
|
struct cuda_shared_data { // holds space for all relevent data from the different classes
|
|
void* buffer; //holds temporary GPU data [data used in subroutines, which has not to be consistend outside of that routine]
|
|
int buffersize; //maxsize of buffer
|
|
int buffer_new; //should be 1 if the pointer to buffer has changed
|
|
void* flag;
|
|
void* debugdata; //array for easily collecting debugdata from device class cuda contains the corresponding cu_debugdata and host array
|
|
cuda_shared_atom atom;
|
|
cuda_shared_pair pair;
|
|
cuda_shared_domain domain;
|
|
cuda_shared_pppm pppm;
|
|
cuda_shared_comm comm;
|
|
cuda_compile_settings compile_settings;
|
|
cuda_timings_struct cuda_timings;
|
|
int exchange_dim;
|
|
int me; //mpi rank
|
|
unsigned int datamask;
|
|
int overlap_comm;
|
|
};
|
|
|
|
|
|
#endif // #ifndef _CUDA_SHARED_H_
|