git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7143 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2011-10-20 14:35:39 +00:00
parent 10a6791ea2
commit a053fd4788
8 changed files with 29 additions and 1003 deletions

View File

@ -82,6 +82,12 @@ if (test $1 = 1) then
cp pair_eam_alloy_cuda.h ..
cp pair_eam_cuda.h ..
cp pair_eam_fs_cuda.h ..
cp pair_sw_cuda.h ..
cp pair_sw_cuda.cpp ..
cp pair_tersoff_cuda.h ..
cp pair_tersoff_cuda.cpp ..
cp pair_tersoff_zbl_cuda.h ..
cp pair_tersoff_zbl_cuda.cpp ..
fi
if (test -e ../pair_gran_hooke.cpp) then
@ -193,12 +199,9 @@ if (test $1 = 1) then
cp verlet_cuda.h ..
cp cuda.h ..
cp cuda_common.h ..
cp cuda_data.h ..
cp cuda_modify_flags.h ..
cp cuda_neigh_list.h ..
cp cuda_precision.h ..
cp cuda_shared.h ..
elif (test $1 = 0) then
@ -341,12 +344,15 @@ elif (test $1 = 0) then
rm -f ../pppm_cuda.h
rm -f ../verlet_cuda.h
rm -f ../pair_sw_cuda.h
rm -f ../pair_sw_cuda.cpp
rm -f ../pair_tersoff_cuda.h
rm -f ../pair_tersoff_cuda.cpp
rm -f ../pair_tersoff_zbl_cuda.h
rm -f ../pair_tersoff_zbl_cuda.cpp
rm -f ../cuda.h
rm -f ../cuda_common.h
rm -f ../cuda_data.h
rm -f ../cuda_modify_flags.h
rm -f ../cuda_neigh_list.h
rm -f ../cuda_precision.h
rm -f ../cuda_shared.h
fi

View File

@ -41,6 +41,9 @@ using namespace LAMMPS_NS;
#define BUFFACTOR 1.5
#define BUFMIN 1000
#define BUFEXTRA 1000
#define BIG 1.0e20
enum{SINGLE,MULTI};
@ -137,6 +140,7 @@ void CommCuda::init()
void CommCuda::setup()
{
if(cuda->shared_data.pair.neighall) cutghostuser = MAX(2.0*neighbor->cutneighmax,cutghostuser);
Comm::setup();
//upload changed geometry to device

View File

@ -46,6 +46,8 @@
using namespace LAMMPS_NS;
Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp)
{
cuda_exists=true;
@ -309,6 +311,7 @@ void Cuda::setSharedDataZero()
shared_data.pair.special_lj = 0;
shared_data.pair.special_coul = 0;
shared_data.pair.neighall = false;
shared_data.pppm.cudable_force = 0;

View File

@ -1,344 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_
//#include "cutil.h"
#include "cuda_precision.h"
#include "cuda_wrapper_cu.h"
#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
//this can not be arbitrarly large, since constant space is limited.
//in principle one could alter potentials to use global memory for parameters, some du that already since the first examples I encountered had a high number (20+) of atom types
//Christian
#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
#define CUDA_MAX_NSPECIAL 25
// define some easy-to-use debug and emulation macros
#ifdef _DEBUG
#define MYDBG(a) a
#else
#define MYDBG(a)
#endif
#if __DEVICE_EMULATION__
#define MYEMU(a) a
#else
#define MYEMU(a)
#endif
#define MYEMUDBG(a) MYEMU(MYDBG(a))
// Add Prefix (needed as workaround, same constant's names in different files causes conflict)
#define MY_ADD_PREFIX(prefix, var) prefix##_##var
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
#define MY_VAR_TO_STR(var) #var
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
#define CUDA_USE_TEXTURE
#define CUDA_USE_FLOAT4
//constants used by many classes
//domain
#define _boxhi MY_AP(boxhi)
#define _boxlo MY_AP(boxlo)
#define _subhi MY_AP(subhi)
#define _sublo MY_AP(sublo)
#define _box_size MY_AP(box_size)
#define _prd MY_AP(prd)
#define _periodicity MY_AP(periodicity)
#define _triclinic MY_AP(triclinic)
#define _boxhi_lamda MY_AP(boxhi_lamda)
#define _boxlo_lamda MY_AP(boxlo_lamda)
#define _prd_lamda MY_AP(prd_lamda)
#define _h MY_AP(h)
#define _h_inv MY_AP(h_inv)
#define _h_rate MY_AP(h_rate)
__device__ __constant__ X_FLOAT _boxhi[3];
__device__ __constant__ X_FLOAT _boxlo[3];
__device__ __constant__ X_FLOAT _subhi[3];
__device__ __constant__ X_FLOAT _sublo[3];
__device__ __constant__ X_FLOAT _box_size[3];
__device__ __constant__ X_FLOAT _prd[3];
__device__ __constant__ int _periodicity[3];
__device__ __constant__ int _triclinic;
__device__ __constant__ X_FLOAT _boxhi_lamda[3];
__device__ __constant__ X_FLOAT _boxlo_lamda[3];
__device__ __constant__ X_FLOAT _prd_lamda[3];
__device__ __constant__ X_FLOAT _h[6];
__device__ __constant__ X_FLOAT _h_inv[6];
__device__ __constant__ V_FLOAT _h_rate[6];
//atom properties
#define _x MY_AP(x)
#define _v MY_AP(v)
#define _f MY_AP(f)
#define _tag MY_AP(tag)
#define _type MY_AP(type)
#define _mask MY_AP(mask)
#define _image MY_AP(image)
#define _q MY_AP(q)
#define _mass MY_AP(mass)
#define _rmass MY_AP(rmass)
#define _rmass_flag MY_AP(rmass_flag)
#define _eatom MY_AP(eatom)
#define _vatom MY_AP(vatom)
#define _x_type MY_AP(x_type)
#define _radius MY_AP(radius)
#define _density MY_AP(density)
#define _omega MY_AP(omega)
#define _torque MY_AP(torque)
#define _special MY_AP(special)
#define _maxspecial MY_AP(maxspecial)
#define _nspecial MY_AP(nspecial)
#define _special_flag MY_AP(special_flag)
#define _molecule MY_AP(molecule)
#define _v_radius MY_AP(v_radius)
#define _omega_rmass MY_AP(omega_rmass)
#define _freeze_group_bit MY_AP(freeze_group_bit)
#define _map_array MY_AP(map_array)
__device__ __constant__ X_FLOAT* _x; //holds pointer to positions
__device__ __constant__ V_FLOAT* _v;
__device__ __constant__ F_FLOAT* _f;
__device__ __constant__ int* _tag;
__device__ __constant__ int* _type;
__device__ __constant__ int* _mask;
__device__ __constant__ int* _image;
__device__ __constant__ V_FLOAT* _mass;
__device__ __constant__ F_FLOAT* _q;
__device__ __constant__ V_FLOAT* _rmass;
__device__ __constant__ int _rmass_flag;
__device__ __constant__ ENERGY_FLOAT* _eatom;
__device__ __constant__ ENERGY_FLOAT* _vatom;
__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to positions
__device__ __constant__ X_FLOAT* _radius;
__device__ __constant__ F_FLOAT* _density;
__device__ __constant__ V_FLOAT* _omega;
__device__ __constant__ F_FLOAT* _torque;
__device__ __constant__ int* _special;
__device__ __constant__ int _maxspecial;
__device__ __constant__ int* _nspecial;
__device__ __constant__ int _special_flag[4];
__device__ __constant__ int* _molecule;
__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to positions
__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to positions
__device__ __constant__ int _freeze_group_bit;
__device__ __constant__ int* _map_array;
#ifdef CUDA_USE_TEXTURE
#define _x_tex MY_AP(x_tex)
#if X_PRECISION == 1
texture<float> _x_tex;
#else
texture<int2,1> _x_tex;
#endif
#define _type_tex MY_AP(type_tex)
texture<int> _type_tex;
#define _x_type_tex MY_AP(x_type_tex)
#if X_PRECISION == 1
texture<float4,1> _x_type_tex;
#else
texture<int4,1> _x_type_tex;
#endif
#define _v_radius_tex MY_AP(v_radius_tex)
#if V_PRECISION == 1
texture<float4,1> _v_radius_tex;
#else
texture<int4,1> _v_radius_tex;
#endif
#define _omega_rmass_tex MY_AP(omega_rmass_tex)
#if V_PRECISION == 1
texture<float4,1> _omega_rmass_tex;
#else
texture<int4,1> _omega_rmass_tex;
#endif
#define _q_tex MY_AP(q_tex)
#if F_PRECISION == 1
texture<float> _q_tex;
#else
texture<int2,1> _q_tex;
#endif
#endif
//neighbor
#ifdef IncludeCommonNeigh
#define _inum MY_AP(inum)
#define _inum_border MY_AP(inum_border)
#define _ilist MY_AP(ilist)
#define _ilist_border MY_AP(ilist_border)
#define _numneigh MY_AP(numneigh)
#define _numneigh_border MY_AP(numneigh_border)
#define _numneigh_inner MY_AP(numneigh_inner)
#define _firstneigh MY_AP(firstneigh)
#define _neighbors MY_AP(neighbors)
#define _neighbors_border MY_AP(neighbors_border)
#define _neighbors_inner MY_AP(neighbors_inner)
#define _reneigh_flag MY_AP(reneigh_flag)
#define _triggerneighsq MY_AP(triggerneighsq)
#define _xhold MY_AP(xhold)
#define _maxhold MY_AP(maxhold)
#define _dist_check MY_AP(dist_check)
#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
#define _maxneighbors MY_AP(maxneighbors)
#define _overlap_comm MY_AP(overlap_comm)
__device__ __constant__ int _inum;
__device__ __constant__ int* _inum_border;
__device__ __constant__ int* _ilist;
__device__ __constant__ int* _ilist_border;
__device__ __constant__ int* _numneigh;
__device__ __constant__ int* _numneigh_border;
__device__ __constant__ int* _numneigh_inner;
__device__ __constant__ int** _firstneigh;
__device__ __constant__ int* _neighbors;
__device__ __constant__ int* _neighbors_border;
__device__ __constant__ int* _neighbors_inner;
__device__ __constant__ int* _reneigh_flag;
__device__ __constant__ X_FLOAT _triggerneighsq;
__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions
__device__ __constant__ int _maxhold;
__device__ __constant__ int _dist_check;
__device__ __constant__ int _neighbor_maxlocal;
__device__ __constant__ int _maxneighbors;
__device__ __constant__ int _overlap_comm;
#endif
//system properties
#define _nall MY_AP(nall)
#define _nghost MY_AP(nghost)
#define _nlocal MY_AP(nlocal)
#define _nmax MY_AP(nmax)
#define _cuda_ntypes MY_AP(cuda_ntypes)
#define _dtf MY_AP(dtf)
#define _dtv MY_AP(dtv)
#define _factor MY_AP(factor)
#define _virial MY_AP(virial)
#define _eng_vdwl MY_AP(eng_vdwl)
#define _eng_coul MY_AP(eng_coul)
#define _molecular MY_AP(molecular)
__device__ __constant__ unsigned _nall;
__device__ __constant__ unsigned _nghost;
__device__ __constant__ unsigned _nlocal;
__device__ __constant__ unsigned _nmax;
__device__ __constant__ unsigned _cuda_ntypes;
__device__ __constant__ V_FLOAT _dtf;
__device__ __constant__ X_FLOAT _dtv;
__device__ __constant__ V_FLOAT _factor;
__device__ __constant__ ENERGY_FLOAT* _virial;
__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
__device__ __constant__ ENERGY_FLOAT* _eng_coul;
__device__ __constant__ int _molecular;
//other general constants
#define _buffer MY_AP(buffer)
#define _flag MY_AP(flag)
#define _debugdata MY_AP(debugdata)
__device__ __constant__ void* _buffer;
__device__ __constant__ int* _flag;
__device__ __constant__ int* _debugdata;
// pointers to data fields on GPU are hold in constant space
// -> reduces register usage and number of parameters for kernelcalls
// will be variables of file scope in cuda files
// maybe used to output cudaError_t
#define MY_OUTPUT_RESULT(result) \
switch(result) \
{ \
case cudaSuccess: printf(" => cudaSuccess\n"); break; \
case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
default: printf(" => unknown\n"); break; \
}
#ifdef _DEBUG
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#else
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#endif
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
#define X_MASK 1
#define V_MASK 2
#define F_MASK 4
#define TAG_MASK 8
#define TYPE_MASK 16
#define MASK_MASK 32
#define IMAGE_MASK 64
#define Q_MASK 128
#define MOLECULE_MASK 256
#define RMASS_MASK 512
#define RADIUS_MASK 1024
#define DENSITY_MASK 2048
#define OMEGA_MASK 4096
#define TORQUE_MASK 8192
#endif // #ifdef _CUDA_COMMON_H_

View File

@ -1,269 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_PRECISION_H_
#define CUDA_PRECISION_H_
/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA.
* Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation).
* ***_FLOAT: type definition of given property
* ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F)
*/
#ifdef CUDA_USE_BINNING
#define CUDA_IF_BINNING(a) a
#else
#define CUDA_IF_BINNING(a)
#endif
//GLOBAL
#ifdef CUDA_PRECISION
#if CUDA_PRECISION == 1
#define CUDA_FLOAT float
#define CUDA_F(x) x##f
#endif
#if CUDA_PRECISION == 2
#define CUDA_FLOAT double
#define CUDA_F(x) x
#endif
#endif
#ifndef CUDA_PRECISION
#define CUDA_FLOAT double
#define CUDA_F(x) x
#define CUDA_PRECISION 2
#endif
//--------------------------------
//-----------FFT-----------------
//--------------------------------
#ifdef FFT_PRECISION_CU
#if FFT_PRECISION_CU == 1
#define FFT_FLOAT float
#define FFT_F(x) x##f
#endif
#if FFT_PRECISION_CU == 2
#define FFT_FLOAT double
#define FFT_F(x) x
#endif
#endif
#ifndef FFT_PRECISION_CU
#define FFT_FLOAT CUDA_FLOAT
#define FFT_F(x) CUDA_F(x)
#define FFT_PRECISION_CU CUDA_PRECISION
#endif
//--------------------------------
//-----------PPPM-----------------
//--------------------------------
#ifdef PPPM_PRECISION
#if PPPM_PRECISION == 1
#define PPPM_FLOAT float
#define PPPM_F(x) x##f
#endif
#if PPPM_PRECISION == 2
#define PPPM_FLOAT double
#define PPPM_F(x) x
#endif
#endif
#ifndef PPPM_PRECISION
#define PPPM_FLOAT CUDA_FLOAT
#define PPPM_F(x) CUDA_F(x)
#define PPPM_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------FORCE-----------------
//--------------------------------
#ifdef F_PRECISION
#if F_PRECISION == 1
#define F_FLOAT float
#define F_F(x) x##f
#endif
#if F_PRECISION == 2
#define F_FLOAT double
#define F_F(x) x
#endif
#endif
#ifndef F_PRECISION
#define F_FLOAT CUDA_FLOAT
#define F_F(x) CUDA_F(x)
#define F_PRECISION CUDA_PRECISION
#endif
#if F_PRECISION == 1
#define _SQRT_ sqrtf
#define _RSQRT_ rsqrtf
#define _EXP_ expf
#else
#define _SQRT_ sqrt
#define _RSQRT_ rsqrt
#define _EXP_ exp
#endif
#if F_PRECISION == 2
struct F_FLOAT2
{
F_FLOAT x;
F_FLOAT y;
};
struct F_FLOAT3
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
};
struct F_FLOAT4
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
F_FLOAT w;
};
#else
#define F_FLOAT2 float2
#define F_FLOAT3 float3
#define F_FLOAT4 float4
#endif
//--------------------------------
//-----------ENERGY-----------------
//--------------------------------
#ifndef ENERGY_PRECISION
#define ENERGY_FLOAT CUDA_FLOAT
#define ENERGY_F(x) CUDA_F(x)
#endif
#ifdef ENERGY_PRECISION
#if ENERGY_PRECISION == 1
#define ENERGY_FLOAT float
#define ENERGY_F(x) x##f
#endif
#if ENERGY_PRECISION == 2
#define ENERGY_FLOAT double
#define ENERGY_F(x) x
#endif
#endif
#ifndef ENERGY_PRECISION
#define ENERGY_FLOAT CUDA_FLOAT
#define ENERGY_F(x) CUDA_F(x)
#define ENERGY_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------POSITIONS------------
//--------------------------------
#ifdef X_PRECISION
#if X_PRECISION == 1
#define X_FLOAT float
#define X_F(x) x##f
#endif
#if X_PRECISION == 2
#define X_FLOAT double
#define X_F(x) x
#endif
#endif
#ifndef X_PRECISION
#define X_FLOAT CUDA_FLOAT
#define X_F(x) CUDA_F(x)
#define X_PRECISION CUDA_PRECISION
#endif
#if X_PRECISION == 2
struct X_FLOAT2
{
X_FLOAT x;
X_FLOAT y;
};
struct X_FLOAT3
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
};
struct X_FLOAT4
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
X_FLOAT w;
};
#else
#define X_FLOAT2 float2
#define X_FLOAT3 float3
#define X_FLOAT4 float4
#endif
//--------------------------------
//-----------velocities-----------
//--------------------------------
#ifdef V_PRECISION
#if V_PRECISION == 1
#define V_FLOAT float
#define V_F(x) x##f
#endif
#if V_PRECISION == 2
#define V_FLOAT double
#define V_F(x) x
#endif
#endif
#ifndef V_PRECISION
#define V_FLOAT CUDA_FLOAT
#define V_F(x) CUDA_F(x)
#define V_PRECISION CUDA_PRECISION
#endif
#if V_PRECISION == 2
struct V_FLOAT4
{
V_FLOAT x;
V_FLOAT y;
V_FLOAT z;
V_FLOAT w;
};
#else
#define V_FLOAT4 float4
#endif
#ifdef NO_PREC_TIMING
struct timespec_2
{
unsigned int tv_sec;
unsigned int tv_nsec;
};
#define timespec timespec_2
#define clock_gettime(a,b)
#endif
#endif /*CUDA_PRECISION_H_*/

View File

@ -1,378 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"
#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int)
struct dev_array
{
void* dev_data; // pointer to memory address on cuda device
unsigned dim[3]; // array dimensions
};
struct cuda_shared_atom // relevent data from atom class
{
dev_array dx; // cumulated distance for binning settings
dev_array x; // position
dev_array v; // velocity
dev_array f; // force
dev_array tag;
dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1)
dev_array mask;
dev_array image;
dev_array q; // charges
dev_array mass; // per-type masses
dev_array rmass; // per-atom masses
dev_array radius; // per-atom radius
dev_array density;
dev_array omega;
dev_array torque;
dev_array molecule;
dev_array special;
int maxspecial;
dev_array nspecial;
int* special_flag;
int molecular;
dev_array eatom; // per-atom energy
dev_array vatom; // per-atom virial
int need_eatom;
int need_vatom;
dev_array x_type; // position + type in X_FLOAT4 struct
dev_array v_radius; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style
dev_array omega_rmass; // velociyt + radius in V_FLOAT4 struct currently only used for granular atom_style
double* mass_host; // remember per-type host pointer to masses
//int natoms; // total # of atoms in system, could be 0
int nghost; // and ghost atoms on this proc
int nlocal; // # of owned
int nall; // total # of atoms in this proc
int nmax; // max # of owned+ghost in arrays on this proc
int ntypes;
int q_flag; // do we have charges?
int rmass_flag; // do we have per-atom masses?
int firstgroup;
int nfirst;
int update_nlocal;
int update_nmax;
dev_array xhold; // position at last neighboring
X_FLOAT triggerneighsq; // maximum square movement before reneighboring
int reneigh_flag; // is reneighboring necessary
int maxhold; // size of xhold
int dist_check; //perform distance check for reneighboring
dev_array binned_id; //id of each binned atom (not tag!!)
dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]]
float bin_extraspace;
int bin_dim[3];
int bin_nmax;
dev_array map_array;
};
struct cuda_shared_pair // relevent data from pair class
{
char cudable_force; // check for (cudable_force!=0)
X_FLOAT cut_global;
X_FLOAT cut_inner_global;
X_FLOAT cut_coul_global;
double** cut; // type-type cutoff
double** cutsq; // type-type cutoff
double** cut_inner; // type-type cutoff for coul
double** cut_coul; // type-type cutoff for coul
double** coeff1; // tpye-type pair parameters
double** coeff2;
double** coeff3;
double** coeff4;
double** coeff5;
double** coeff6;
double** coeff7;
double** coeff8;
double** coeff9;
double** coeff10;
double** offset;
double* special_lj;
double* special_coul;
dev_array virial; // ENERGY_FLOAT
dev_array eng_vdwl; // ENERGY_FLOAT
dev_array eng_coul; // ENERGY_FLOAT
X_FLOAT cut_coulsq_global;
F_FLOAT g_ewald,kappa;
int freeze_group_bit;
dev_array coeff1_gm;
dev_array coeff2_gm;
dev_array coeff3_gm;
dev_array coeff4_gm;
dev_array coeff5_gm;
dev_array coeff6_gm;
dev_array coeff7_gm;
dev_array coeff8_gm;
dev_array coeff9_gm;
dev_array coeff10_gm;
int lastgridsize;
int n_energy_virial;
int collect_forces_later;
int use_block_per_atom;
int override_block_per_atom;
};
struct cuda_shared_domain // relevent data from domain class
{
X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
X_FLOAT subhi[3];
X_FLOAT boxlo[3];
X_FLOAT boxhi[3];
X_FLOAT prd[3];
int periodicity[3]; // xyz periodicity as array
int triclinic;
X_FLOAT xy;
X_FLOAT xz;
X_FLOAT yz;
X_FLOAT boxlo_lamda[3];
X_FLOAT boxhi_lamda[3];
X_FLOAT prd_lamda[3];
X_FLOAT h[6];
X_FLOAT h_inv[6];
V_FLOAT h_rate[6];
int update;
};
struct cuda_shared_pppm
{
char cudable_force;
#ifdef FFT_CUFFT
FFT_FLOAT* work1;
FFT_FLOAT* work2;
FFT_FLOAT* work3;
PPPM_FLOAT* greensfn;
PPPM_FLOAT* fkx;
PPPM_FLOAT* fky;
PPPM_FLOAT* fkz;
PPPM_FLOAT* vg;
#endif
int* part2grid;
PPPM_FLOAT* density_brick;
int* density_brick_int;
PPPM_FLOAT density_intScale;
PPPM_FLOAT* vdx_brick;
PPPM_FLOAT* vdy_brick;
PPPM_FLOAT* vdz_brick;
PPPM_FLOAT* density_fft;
ENERGY_FLOAT* energy;
ENERGY_FLOAT* virial;
int nxlo_in;
int nxhi_in;
int nxlo_out;
int nxhi_out;
int nylo_in;
int nyhi_in;
int nylo_out;
int nyhi_out;
int nzlo_in;
int nzhi_in;
int nzlo_out;
int nzhi_out;
int nx_pppm;
int ny_pppm;
int nz_pppm;
PPPM_FLOAT qqrd2e;
int order;
// float3 sublo;
PPPM_FLOAT* rho_coeff;
int nmax;
int nlocal;
PPPM_FLOAT* debugdata;
PPPM_FLOAT delxinv;
PPPM_FLOAT delyinv;
PPPM_FLOAT delzinv;
int nlower;
int nupper;
PPPM_FLOAT shiftone;
};
struct cuda_shared_comm
{
int maxswap;
int maxlistlength;
dev_array pbc;
dev_array slablo;
dev_array slabhi;
dev_array multilo;
dev_array multihi;
dev_array sendlist;
int grow_flag;
int comm_phase;
int nsend;
int* nsend_swap;
int* send_size;
int* recv_size;
double** buf_send;
void** buf_send_dev;
double** buf_recv;
void** buf_recv_dev;
void* buffer;
int buffer_size;
double overlap_split_ratio;
};
struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data
{
int maxlocal;
int inum; // # of I atoms neighbors are stored for local indices of I atoms
int inum_border2;
dev_array inum_border; // # of atoms which interact with border atoms
dev_array ilist;
dev_array ilist_border;
dev_array numneigh;
dev_array numneigh_inner;
dev_array numneigh_border;
dev_array firstneigh;
dev_array neighbors;
dev_array neighbors_border;
dev_array neighbors_inner;
int maxpage;
dev_array page_pointers;
dev_array* pages;
int maxneighbors;
int neigh_lists_per_page;
double** cutneighsq;
CUDA_FLOAT* cu_cutneighsq;
int* binned_id;
int* bin_dim;
int bin_nmax;
float bin_extraspace;
double maxcut;
dev_array ex_type;
int nex_type;
dev_array ex1_bit;
dev_array ex2_bit;
int nex_group;
dev_array ex_mol_bit;
int nex_mol;
};
struct cuda_compile_settings // this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files
{
int prec_glob;
int prec_x;
int prec_v;
int prec_f;
int prec_pppm;
int prec_fft;
int cufft;
int arch;
};
struct cuda_timings_struct
{
//Debug:
double test1;
double test2;
//transfers
double transfer_upload_tmp_constr;
double transfer_download_tmp_deconstr;
//communication
double comm_forward_total;
double comm_forward_mpi_upper;
double comm_forward_mpi_lower;
double comm_forward_kernel_pack;
double comm_forward_kernel_unpack;
double comm_forward_kernel_self;
double comm_forward_upload;
double comm_forward_download;
double comm_exchange_total;
double comm_exchange_mpi;
double comm_exchange_kernel_pack;
double comm_exchange_kernel_unpack;
double comm_exchange_kernel_fill;
double comm_exchange_cpu_pack;
double comm_exchange_upload;
double comm_exchange_download;
double comm_border_total;
double comm_border_mpi;
double comm_border_kernel_pack;
double comm_border_kernel_unpack;
double comm_border_kernel_self;
double comm_border_kernel_buildlist;
double comm_border_upload;
double comm_border_download;
//pair forces
double pair_xtype_conversion;
double pair_kernel;
double pair_virial;
double pair_force_collection;
//neighbor
double neigh_bin;
double neigh_build;
double neigh_special;
//PPPM
double pppm_particle_map;
double pppm_make_rho;
double pppm_brick2fft;
double pppm_poisson;
double pppm_fillbrick;
double pppm_fieldforce;
double pppm_compute;
};
struct cuda_shared_data // holds space for all relevent data from the different classes
{
void* buffer; //holds temporary GPU data [data used in subroutines, which has not to be consistend outside of that routine]
int buffersize; //maxsize of buffer
int buffer_new; //should be 1 if the pointer to buffer has changed
void* flag;
void* debugdata; //array for easily collecting debugdata from device class cuda contains the corresponding cu_debugdata and host array
cuda_shared_atom atom;
cuda_shared_pair pair;
cuda_shared_domain domain;
cuda_shared_pppm pppm;
cuda_shared_comm comm;
cuda_compile_settings compile_settings;
cuda_timings_struct cuda_timings;
int exchange_dim;
int me; //mpi rank
unsigned int datamask;
int overlap_comm;
};
#endif // #ifndef _CUDA_SHARED_H_

View File

@ -26,6 +26,9 @@
using namespace LAMMPS_NS;
enum{NSQ,BIN,MULTI}; // also in neigh_list.cpp
/* ---------------------------------------------------------------------- */
@ -56,9 +59,9 @@ void NeighborCuda::choose_build(int index, NeighRequest *rq)
{
Neighbor::choose_build(index,rq);
if (rq->full && style == NSQ && rq->ghost == 0 && rq->cudable)
if (rq->full && style == NSQ && rq->cudable)
pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_nsq_cuda;
else if (rq->full && style == BIN && rq->ghost == 0 && rq->cudable)
else if (rq->full && style == BIN && rq->cudable)
pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_bin_cuda;
}

View File

@ -21,6 +21,7 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <cstdlib>
#include <cstdio>
#include <cstring>
@ -56,6 +57,7 @@ using namespace LAMMPS_NS;
#define MAKETIMEING
VerletCuda::VerletCuda(LAMMPS *lmp, int narg, char **arg) : Verlet(lmp, narg, arg) {
cuda = lmp->cuda;
if(cuda == NULL)
@ -132,20 +134,19 @@ void VerletCuda::setup()
cuda->uploadAll();
neighbor->build();
neighbor->ncalls = 0;
cuda->uploadAllNeighborLists();
if(atom->mass)
cuda->cu_mass->upload();
if(cuda->cu_map_array)
cuda->cu_map_array->upload();
// compute all forces
ev_set(update->ntimestep);
if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
if(elist_atom||vlist_atom) cuda->checkResize();
int test_BpA_vs_TpA = true;
timespec starttime;