Adding changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3883 f3b2605a-c512-4ea7-a41b-209d697bcdaa
pscrozi 2010-03-04 22:35:29 +00:00
parent 780ba6e377
commit c0e0b65619
20 changed files with 578 additions and 611 deletions

View File

@ -74,13 +74,3 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
package has been installed before installing the GPU package in LAMMPS.
GPU MEMORY
Upon initialization of the gayberne/gpu pair style, the library will reserve
memory for 64K atoms per GPU or 70% of each card's GPU memory, whichever value
is limiting. The value of 70% can be changed by editing the
PERCENT_GPU_MEMORY definition in the source file. For gayberne/gpu, the value
of 64K cannot be increased and is the maximum number of atoms allowed per
GPU. Using the 'neigh_modify one' modifier in your LAMMPS input script
can help to increase the maximum number of atoms per GPU for cards with
limited memory.
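For reference, a minimal sketch of the sizing rule described above; it mirrors
the LJ_GPU_MemoryT::get_max_atoms() routine that this commit deletes from
lj_gpu_memory later in this diff, with PERCENT_GPU_MEMORY=0.7 and
MAX_ATOMS=65536:

// Old per-GPU cap: 70% of card memory or 64K atoms, whichever is smaller.
int get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
  int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
                              bytes_per_atom(max_nbors));
  if (matoms>MAX_ATOMS)
    matoms=MAX_ATOMS;
  return matoms;
}

The commit replaces this fixed reservation with allocations sized from
nlocal/nall at init time plus resize_atom()/resize_local() growth on demand.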

View File

@ -36,7 +36,7 @@ static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
// -- Only pack neighbors within cutoff
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
@ -56,25 +56,24 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
int *nbor_newj=nbor;
nbor+=nbor_pitch;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
vec4 ix=x_[i];
int itype=ix.w;
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
if (j>=nall)
j%=nall;
int jtype=_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
// Compute r12;
numtyp rsq=_x_<numtyp>(j,0)-ix;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=_x_<numtyp>(j,1)-iy;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=_x_<numtyp>(j,2)-iz;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq< _cutsq_<numtyp>(itype,jtype)) {
@ -95,7 +94,7 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
@ -124,25 +123,25 @@ __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
int *nbor_newj=nbor;
nbor+=nbor_pitch;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
vec4 ix=x_[i];
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
if (j>=nall)
j%=nall;
int mtype=itype+_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12;
numtyp rsq=_x_<numtyp>(j,0)-ix;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=_x_<numtyp>(j,1)-iy;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=_x_<numtyp>(j,2)-iz;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
@ -159,13 +158,15 @@ __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
template<class numtyp, class acctyp>
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high) {
if (gbm.shared_types)
if (gbm.shared_types) {
kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
gbm.atom.inum(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
else
} else
kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
gbm.atom.inum(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
}
@ -188,9 +189,7 @@ EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
string sname=GBMF[0].gpu.name(id)+", "+
gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+
gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
max_nbors))+" Atoms";
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
strcpy(name,sname.c_str());
}
@ -203,6 +202,7 @@ EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall,
const int max_nbors, const int thread, const int gpu_id) {
assert(thread<MAX_GPU_THREADS);
@ -215,7 +215,8 @@ EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
well, cutsq, sigma, epsilon, host_lshape, form,
host_lj1, host_lj2, host_lj3, host_lj4, offset,
special_lj, max_nbors, false, gpu_id);
special_lj, nlocal, nall, max_nbors, false,
gpu_id);
}
// ---------------------------------------------------------------------------
@ -230,34 +231,23 @@ EXTERN void gb_gpu_clear(const int thread) {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
double **host_quat, const int *host_type,
const bool rebuild, cudaStream_t &stream) {
atom.time_atom.start();
atom.reset_write_buffer();
// Rows 1-3 of dev_x are position; rows 4-7 are quaternion
atom.add_atom_data(host_x[0],3);
atom.add_atom_data(host_x[0]+1,3);
atom.add_atom_data(host_x[0]+2,3);
atom.add_atom_data(host_quat[0],4);
atom.add_atom_data(host_quat[0]+1,4);
atom.add_atom_data(host_quat[0]+2,4);
atom.add_atom_data(host_quat[0]+3,4);
int csize=7;
// If a rebuild occurred, copy type data
if (rebuild) {
atom.add_atom_data(host_type);
csize++;
}
atom.copy_atom_data(csize,stream);
atom.add_x_data(host_x,host_type);
atom.add_q_data(host_quat[0]);
atom.copy_x_data(stream);
atom.copy_q_data(stream);
atom.time_atom.stop();
}
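A minimal sketch of the new packed layout, assuming the single-precision build
where vec4 aliases CUDA's float4 (the names match add_x_data()/add_q_data() in
pair_gpu_atom.h later in this diff):

// One vec4 per atom for position+type and one for the quaternion:
//   dev_x[i] = { x, y, z, (numtyp)type }  -- kernels read the type as ix.w
//   dev_q[i] = { qw, qi, qj, qk }         -- consumed by gpu_quat_to_mat_trans
// Both are staged through the same host_write buffer: 4*nall values of
// positions+types first, then 4*nall quaternion components.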
EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
const int *host_type, const bool rebuild,
const int thread) {
_gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
GBMF[thread].pair_stream);
}
@ -269,46 +259,111 @@ template <class gbmtyp>
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
const int inum, int *ilist, const int *numj,
const int *type, bool &success) {
if (nall>gbm.max_atoms) {
success=false;
return 0;
}
success=true;
gbm.nbor.time_nbor.start();
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[i]);
if (nall>gbm.max_atoms)
gbm.resize_atom(nall,success);
if (nlocal>gbm.max_local || mn>gbm._max_nbors)
gbm.resize_local(nlocal,mn,success);
if (!success)
return false;
gbm.atom.nall(nall);
gbm.atom.inum(inum);
if (gbm.multiple_forms) {
int p=0, acc=0;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
int ij_size=gbm.nbor.host_ij.numel();
if (inum*2<ij_size) {
int p=0, acc=0;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
}
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
gbm.nbor.ij_total=0;
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum,
2*inum,gbm.pair_stream);
} else {
int p=0, acc=0;
int offset=0;
int half=ij_size/2;
int hi=0;
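// Same split-buffer staging as PairGPUNbor::reset() later in this diff:
// fill half of host_ij, flush it asynchronously, synchronize, then reuse.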
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[hi]=numj[ilist[i]];
gbm.nbor.host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
p++;
hi++;
if (hi==half) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
half,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
half,gbm.pair_stream);
hi=0;
offset+=half;
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
}
}
}
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[hi]=numj[ilist[i]];
gbm.nbor.host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
p++;
hi++;
if (hi==half) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
half,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
half,gbm.pair_stream);
hi=0;
offset+=half;
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
}
}
}
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
if (hi>0) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
hi,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
hi,gbm.pair_stream);
}
gbm.nbor.ij_total=0;
}
gbm.nbor.ij_total=0;
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
gbm.nbor.dev_nbor.row_size(),
gbm.nbor.dev_nbor.row_size(),2,inum,
gbm.pair_stream);
} else {
gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
gbm.last_ellipse=inum;
@ -349,6 +404,14 @@ EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
_gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
}
template<class numtyp, class acctyp>
void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
gbm.atom.time_answer.start();
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
gbm.atom.time_answer.stop();
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques for all ij interactions
// ---------------------------------------------------------------------------
@ -357,7 +420,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
const bool rebuild) {
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
int ans_pitch=6;
if (eflag)
ans_pitch++;
if (vflag)
ans_pitch+=6;
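// ans_pitch counts answer fields per atom in the new interleaved layout:
// fx,fy,fz,torx,tory,torz (6), +1 for energy if eflag, +6 for the virial
// if vflag; kernels now index ans+ii*ans_pitch and advance with ap1++
// rather than striding by ans.row_size().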
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
if (gbm.multiple_forms) {
@ -371,9 +439,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
gbm.time_gayberne.stop();
@ -393,14 +462,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
gbm.last_ellipse)/BX));
pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
ELLIPSE_SPHERE);
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne2.stop();
} else {
@ -419,16 +489,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
if (gbm.last_ellipse<gbm.atom.inum()) {
if (gbm.shared_types)
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
gbm.atom.inum(), gbm.atom.nall());
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
else
kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
}
gbm.time_pair.stop();
@ -437,19 +506,21 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(),
eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne.stop();
}
}
EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
const int thread) {
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
_gb_gpu_enqueue<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag);
}
// ---------------------------------------------------------------------------
@ -462,9 +533,6 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
double *virial) {
double evdw;
gbm.atom.time_answer.start();
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
gbm.atom.time_atom.add_to_total();
gbm.nbor.time_nbor.add_to_total();
gbm.time_kernel.add_to_total();
@ -475,11 +543,19 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
gbm.time_pair.add_to_total();
}
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
gbm.atom.add_forces(ilist,f);
gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
gbm.atom.time_answer.stop();
if (gbm.last_ellipse>gbm.atom.inum()) {
if (eflag || vflag)
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
f,tor,gbm.atom.inum());
else
gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
} else {
if (eflag || vflag)
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
f,tor,gbm.last_ellipse);
else
gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
}
gbm.atom.time_answer.add_to_total();
return evdw;
}
@ -516,3 +592,4 @@ EXTERN int gb_gpu_num_devices() {
EXTERN double gb_gpu_bytes() {
return GBMF[0].host_memory_usage();
}

View File

@ -298,24 +298,22 @@ static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi,
static __inline__ __device__ void gpu_quat_to_mat_trans(const vec4 *qif,
const int qi,
numtyp mat[9])
{
numtyp qi3=_x_<numtyp>(qi,3);
numtyp qi4=_x_<numtyp>(qi,4);
numtyp qi5=_x_<numtyp>(qi,5);
numtyp qi6=_x_<numtyp>(qi,6);
numtyp w2 = qi3*qi3;
numtyp i2 = qi4*qi4;
numtyp j2 = qi5*qi5;
numtyp k2 = qi6*qi6;
numtyp twoij = (numtyp)2.0*qi4*qi5;
numtyp twoik = (numtyp)2.0*qi4*qi6;
numtyp twojk = (numtyp)2.0*qi5*qi6;
numtyp twoiw = (numtyp)2.0*qi4*qi3;
numtyp twojw = (numtyp)2.0*qi5*qi3;
numtyp twokw = (numtyp)2.0*qi6*qi3;
vec4 q=qif[qi];
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;
numtyp j2 = q.z*q.z;
numtyp k2 = q.w*q.w;
numtyp twoij = (numtyp)2.0*q.y*q.z;
numtyp twoik = (numtyp)2.0*q.y*q.w;
numtyp twojk = (numtyp)2.0*q.z*q.w;
numtyp twoiw = (numtyp)2.0*q.y*q.x;
numtyp twojw = (numtyp)2.0*q.z*q.x;
numtyp twokw = (numtyp)2.0*q.w*q.x;
mat[0] = w2+i2-j2-k2;
mat[3] = twoij-twokw;
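// The vec4 packs the quaternion as (q.x,q.y,q.z,q.w) = (w,i,j,k), the same
// order the old code read from rows 3-6 of dev_x; mat[] is filled row-major
// with the transpose of the rotation matrix R(q).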

View File

@ -92,16 +92,19 @@ static __inline__ __device__ void compute_eta_torque(numtyp m[9],
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
#include "gb_gpu_kernel.h"
template<class numtyp, class acctyp>
__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const int nbor_pitch,
__global__ void kernel_gayberne(const vec4* x_, const vec4 *q,
const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const size_t nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int inum, const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
@ -109,7 +112,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
@ -128,16 +131,14 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
const int *nbor_end=nbor+nbor_pitch*numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
vec4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
{
numtyp t[9];
gpu_quat_to_mat_trans(i,a1);
gpu_quat_to_mat_trans(q,i,a1);
gpu_shape_times3(itype,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_well_times3(itype,a1,t);
@ -146,7 +147,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
@ -154,20 +155,21 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = _x_<numtyp>(j,0)-ix;
r12[1] = _x_<numtyp>(j,1)-iy;
r12[2] = _x_<numtyp>(j,2)-iz;
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(j,a2);
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
@ -366,35 +368,37 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
ap1++;
}
}
*ap1=fx;
ap1+=ans_pitch;
ap1++;
*ap1=fy;
ap1+=ans_pitch;
ap1++;
*ap1=fz;
ap1+=ans_pitch;
ap1++;
*ap1=torx;
ap1+=ans_pitch;
ap1++;
*ap1=tory;
ap1+=ans_pitch;
ap1++;
*ap1=torz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const int nbor_pitch,
__global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q,
const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const size_t nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum,
@ -425,12 +429,10 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
const int *nbor_end=nbor+nbor_pitch*numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
vec4 ix=x_[i];
int itype=ix.w;
numtyp oner=_shape_<numtyp>(itype,0);
numtyp one_well=_well_<numtyp>(itype,0);
@ -445,13 +447,14 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = _x_<numtyp>(j,0)-ix;
r12[1] = _x_<numtyp>(j,1)-iy;
r12[2] = _x_<numtyp>(j,2)-iz;
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
@ -463,7 +466,7 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(j,a2);
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, eta, and teta
@ -611,28 +614,29 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
ap1++;
}
}
*ap1=fx;
ap1+=ans_pitch;
ap1++;
*ap1=fy;
ap1+=ans_pitch;
ap1++;
*ap1=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch, acctyp *ans,
__global__ void kernel_lj(const vec4 *x_,
const numtyp *special_lj, const int *dev_nbor,
const size_t nbor_pitch, const int *dev_ij, acctyp *ans,
size_t ans_pitch, int *err_flag, const bool eflag,
const bool vflag, const int start, const int inum,
const int nall) {
@ -663,10 +667,8 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
vec4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; list<list_end; list++) {
@ -678,12 +680,13 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
@ -716,29 +719,30 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1+=energy;
ap1+=ans_pitch;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=ans_pitch;
ap1++;
}
}
*ap1+=fx;
ap1+=ans_pitch;
ap1++;
*ap1+=fy;
ap1+=ans_pitch;
ap1++;
*ap1+=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch,
__global__ void kernel_lj_fast(const vec4 *x_,
const numtyp *special_lj, const int *dev_nbor,
const size_t nbor_pitch, const int *dev_ij,
acctyp *ans, size_t ans_pitch,int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum, const int nall){
@ -788,10 +792,8 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
vec4 ix=x_[i];
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
numtyp factor_lj;
for ( ; list<list_end; list++) {
@ -803,12 +805,13 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
factor_lj = sp_lj[j/nall];
j %= nall;
}
int mtype=itype+_x_<numtyp>(j,7);
vec4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
@ -837,21 +840,21 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1+=energy;
ap1+=ans_pitch;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=ans_pitch;
ap1++;
}
}
*ap1+=fx;
ap1+=ans_pitch;
ap1++;
*ap1+=fy;
ap1+=ans_pitch;
ap1++;
*ap1+=fz;
} // if ii

View File

@ -41,14 +41,17 @@ bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
double *host_lshape, int **h_form, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int max_nbors,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const bool force_d, const int me) {
_max_nbors=max_nbors;
if (this->allocated)
clear();
bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
host_lj1, host_lj2, host_lj3, host_lj4,
host_offset, host_special_lj, max_nbors, me);
host_offset, host_special_lj, max_nbors, me,
nlocal, nall);
if (!p)
return false;
@ -96,11 +99,28 @@ bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
multiple_forms=true;
// Memory for ilist ordered by particle type
host_olist.safe_alloc_rw(this->max_atoms);
return true;
return host_olist.alloc_rw(this->max_local);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_atom(const int nall, bool &success) {
this->max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
this->atom.resize(this->max_atoms, success);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_local(const int nlocal, const int max_nbors,
bool &success) {
if (nlocal>this->max_local) {
this->max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
host_olist.clear();
success=success && host_olist.alloc_rw(this->max_local);
}
if (max_nbors>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbors)*1.10);
this->nbor.resize(this->max_local,_max_nbors,success);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
if (!this->allocated)

View File

@ -37,8 +37,11 @@ class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const bool force_d, const int me);
const int max_nbors, const int nlocal, const int nall,
const bool force_d, const int me);
void resize_atom(const int nall, bool &success);
void resize_local(const int nlocal, const int max_nbors, bool &success);
void clear();
double host_memory_usage();
@ -61,6 +64,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
bool multiple_forms;
int **host_form;
int last_ellipse;
int _max_nbors;
private:
};

View File

@ -74,7 +74,8 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
ij_size=IJ_SIZE;
bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id,
0,0);
ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
@ -101,37 +102,6 @@ EXTERN void lj_gpu_clear() {
}
// ---------------------------------------------------------------------------
// Calculate energies and forces for all ij interactions
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));
ljm.time_pair.start();
if (ljm.shared_types)
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
vflag, ljm.atom.inum(), ljm.atom.nall());
else
kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
vflag, ljm.atom.inum(), ljm.atom.nall());
ljm.time_pair.stop();
}
EXTERN void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
_lj_gpu<PRECISION,ACC_PRECISION>(LJMF, eflag,vflag,rebuild);
}
template <class numtyp, class acctyp>
double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
double **host_x, int *host_type, const int inum,

View File

@ -217,229 +217,4 @@ __global__ void kernel_lj_cell(float3 *force3,
}
/* Neighbor list version of LJ kernel */
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch, acctyp *ans,
size_t ans_pitch, const bool eflag,
const bool vflag, const int inum, const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x);
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,3);
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,3);
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<_cutsq_<numtyp>(itype,jtype)) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv =r2inv*r2inv*r2inv;
numtyp force =factor_lj*r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
_lj1_<numtyp>(itype,jtype).y);
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
_lj3_<numtyp>(itype,jtype).y);
energy+=factor_lj*(e-_offset_<numtyp>(1,1));
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
}
}
*ap1=fx;
ap1+=ans_pitch;
*ap1=fy;
ap1+=ans_pitch;
*ap1=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch,
acctyp *ans, size_t ans_pitch,const bool eflag,
const bool vflag, const int inum,
const int nall) {
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
__shared__ numtyp sp_lj[4];
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=special_lj[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=ii/MAX_SHARED_TYPES;
int jtype=ii%MAX_SHARED_TYPES;
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
if (eflag) {
lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
offset[ii]=_offset_<numtyp>(itype,jtype);
}
}
ii+=INT_MUL(blockIdx.x,blockDim.x);
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j= *list;
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int mtype=itype+_x_<numtyp>(j,3);
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq[mtype]) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
energy+=factor_lj*(e-offset[mtype]);
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
}
}
*ap1=fx;
ap1+=ans_pitch;
*ap1=fy;
ap1+=ans_pitch;
*ap1=fz;
} // if ii
}
#endif

View File

@ -25,15 +25,6 @@ int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
bytes_per_atom(max_nbors));
if (matoms>MAX_ATOMS)
matoms=MAX_ATOMS;
return matoms;
}
template <class numtyp, class acctyp>
bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
double **host_cutsq, double **host_sigma,
@ -41,7 +32,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int max_nbors,
const int me) {
const int me, const int nlocal, const int nall) {
if (allocated)
clear();
@ -55,9 +46,18 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
time_pair.init();
// Initialize atom and nbor data
max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
atom.init(max_atoms);
nbor.init(ij_size,max_atoms,max_nbors);
max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
if (max_local==0)
max_local=1000;
if (nall<=nlocal)
max_atoms=max_local*2;
else
max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
if (!atom.init(max_atoms))
return false;
if (!nbor.init(ij_size,max_local,max_nbors))
return false;
// Get a stream for computing pair potentials
CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
@ -112,7 +112,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
allocated=true;
return true;
}
template <class numtyp, class acctyp>
void LJ_GPU_MemoryT::clear() {
if (!allocated)

View File

@ -28,7 +28,6 @@
#define BLOCK_1D 64 // max value = 256
#define CELL_SIZE BLOCK_1D
#define MAX_SHARED_TYPES 8
#define PERCENT_GPU_MEMORY 0.7
#define BIG_NUMBER 100000000
template <class numtyp, class acctyp>
@ -42,14 +41,13 @@ class LJ_GPU_Memory {
double **host_sigma, double **host_epsilon,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const int me);
const int max_nbors, const int me, const int nlocal,
const int nall);
/// Free any memory on host and device
void clear();
/// Returns memory usage on GPU per atom
int bytes_per_atom(const int max_nbors) const;
/// Maximum number of atoms that can be stored on GPU
int get_max_atoms(const size_t gpu_bytes, const int max_nbors);
/// Total host memory used by library
double host_memory_usage() const;
@ -72,7 +70,7 @@ class LJ_GPU_Memory {
NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3;
NVC_VecT special_lj;
size_t max_atoms;
size_t max_atoms, max_local;
// Timing for pair calculation
NVCTimer time_pair;

View File

@ -8,7 +8,6 @@
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
@ -17,6 +16,12 @@
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include "nvc_device.h"
int main(int argc, char** argv) {

View File

@ -20,6 +20,12 @@
#ifndef NVC_MACROS_H
#define NVC_MACROS_H
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include <stdio.h>
#include "math_constants.h"
#define INT_MUL(x,y) (__mul24(x,y))

View File

@ -96,6 +96,17 @@ class NVC_Host {
_end=_array+cols;
}
// Allocate page-locked memory with fast write/slow read on host
inline bool alloc_w(const size_t cols) {
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
if (cudaHostAlloc((void **)&_array,_row_bytes,cudaHostAllocWriteCombined)!=
cudaSuccess)
return false;
_end=_array+cols;
return true;
}
// Allocate page-locked memory with fast read/write on host
inline void safe_alloc_rw(const size_t cols) {
_cols=cols;
@ -104,6 +115,16 @@ class NVC_Host {
_end=_array+cols;
}
// Allocate page-locked memory with fast read/write on host
inline bool alloc_rw(const size_t cols) {
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
if (cudaMallocHost((void **)&_array,_row_bytes)!=cudaSuccess)
return false;
_end=_array+cols;
return true;
}
/// Free any memory associated with device
inline void clear()
{ if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }
@ -163,6 +184,13 @@ class NVC_Host {
cudaMemcpyHostToDevice,stream));
}
/// Asynchronous copy to device (numel is not bytes)
inline void copy_to_device(size_t offset, numtyp *device_p, size_t numel,
cudaStream_t &stream) {
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array+offset,numel*sizeof(numtyp),
cudaMemcpyHostToDevice,stream));
}
/// Asynchronous copy to 2D matrix on device (numel is not bytes)
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
const size_t rows, const size_t cols,
@ -194,6 +222,16 @@ class NVC_Vec {
_end=_array+cols;
}
// Row vector on device
inline bool alloc(const size_t cols) {
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
if (cudaMalloc((void **)&_array,_row_bytes)!=cudaSuccess)
return false;
_end=_array+cols;
return true;
}
// Row vector on device (allocate and assign texture and bind)
inline void safe_alloc(const size_t cols, textureReference *t)
{ safe_alloc(cols); assign_texture(t); bind(); }
@ -221,11 +259,22 @@ class NVC_Vec {
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(),
cudaMemcpyHostToDevice)); }
/// Copy from host (n elements)
inline void copy_from_host(const numtyp *host_p, const size_t n)
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,n*sizeof(numtyp),
cudaMemcpyHostToDevice)); }
/// Asynchronous copy from host
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
{ CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
cudaMemcpyHostToDevice, stream)); }
/// Asynchronous copy from host (n elements)
inline void copy_from_host(const numtyp *host_p, const size_t n,
cudaStream_t &stream)
{ CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,n*sizeof(numtyp),
cudaMemcpyHostToDevice, stream)); }
/// Copy to host
inline void copy_to_host(numtyp *host_p)
{ CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),

View File

@ -28,7 +28,9 @@ int PairGPUAtomT::bytes_per_atom() const {
}
template <class numtyp, class acctyp>
void PairGPUAtomT::init(const int max_atoms) {
bool PairGPUAtomT::init(const int max_atoms) {
bool success=true;
if (allocated)
clear();
@ -39,28 +41,48 @@ void PairGPUAtomT::init(const int max_atoms) {
time_answer.init();
// Device matrices for atom and force data
dev_x.safe_alloc(atom_fields(),max_atoms,x_get_texture<numtyp>());
ans.safe_alloc(ans_fields(),max_atoms);
success=success && dev_x.alloc(max_atoms*sizeof(vec4));
success=success && dev_q.alloc(max_atoms*sizeof(vec4));
success=success && ans.alloc(ans_fields()*max_atoms);
// Get a host read/write buffer
success=success && host_read.alloc_rw(max_atoms*ans_fields());
// Get a host write only buffer
host_write.safe_alloc_w(max_atoms*atom_fields());
// Get a host read/write buffer
host_read.safe_alloc_rw(ans.row_size()*ans_fields());
success=success && host_write.alloc_w(max_atoms*atom_fields());
allocated=true;
return success;
}
template <class numtyp, class acctyp>
void PairGPUAtomT::resize(const int max_atoms, bool &success) {
ans.clear();
dev_x.clear();
dev_q.clear();
host_write.clear();
host_read.clear();
_max_atoms=max_atoms;
success = success && dev_x.alloc(_max_atoms*sizeof(vec4));
success = success && dev_q.alloc(_max_atoms*sizeof(vec4));
success = success && ans.alloc(ans_fields()*_max_atoms);
success = success && host_read.alloc_rw(_max_atoms*ans_fields());
success = success && host_write.alloc_w(_max_atoms*atom_fields());
}
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
if (!allocated)
return;
allocated=false;
dev_x.unbind();
ans.clear();
dev_x.clear();
dev_q.clear();
host_write.clear();
host_read.clear();
}
template <class numtyp, class acctyp>
@ -82,88 +104,88 @@ void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
if (!vflag)
csize-=6;
host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s);
host_read.copy_from_device(ans.begin(),_inum*csize,s);
}
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom,
const bool vflag_atom, double *eatom,
double **vatom, double *virial) {
double **vatom, double *virial,
double **f, double **tor, const int n) {
double evdwl=0.0;
int gap=ans.row_size()-_inum;
acctyp *ap=host_read.begin();
if (_eflag) {
if (eflag_atom) {
for (int i=0; i<_inum; i++) {
int ii=ilist[i];
if (_eflag) {
if (eflag_atom) {
evdwl+=*ap;
eatom[ilist[i]]+=*ap*0.5;
eatom[ii]+=*ap*0.5;
ap++;
}
} else
for (int i=0; i<_inum; i++) {
} else {
evdwl+=*ap;
ap++;
}
ap+=gap;
evdwl*=0.5;
}
_read_loc=ap;
gap=ans.row_size();
if (_vflag) {
if (vflag_atom) {
for (int ii=0; ii<_inum; ii++) {
int i=ilist[ii];
ap=_read_loc+ii;
}
if (_vflag) {
if (vflag_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
vatom[ii][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=gap;
ap++;
}
}
} else {
for (int ii=0; ii<_inum; ii++) {
ap=_read_loc+ii;
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=gap;
ap++;
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
_read_loc+=gap*6;
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap++;
if (i<n) {
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap++;
} else {
ap+=3;
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
evdwl*=0.5;
return evdwl;
}
template <class numtyp, class acctyp>
void PairGPUAtomT::add_forces(const int *ilist, double **f) {
int gap=ans.row_size();
for (int ii=0; ii<_inum; ii++) {
acctyp *ap=_read_loc+ii;
int i=ilist[ii];
f[i][0]+=*ap;
ap+=gap;
f[i][1]+=*ap;
ap+=gap;
f[i][2]+=*ap;
}
}
template <class numtyp, class acctyp>
void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) {
int gap=ans.row_size();
_read_loc+=gap*3;
for (int ii=0; ii<n; ii++) {
acctyp *ap=_read_loc+ii;
int i=ilist[ii];
tor[i][0]+=*ap;
ap+=gap;
tor[i][1]+=*ap;
ap+=gap;
tor[i][2]+=*ap;
void PairGPUAtomT::copy_asphere(const int *ilist, double **f, double **tor,
const int n) {
acctyp *ap=host_read.begin();
for (int i=0; i<_inum; i++) {
int ii=ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap++;
if (i<n) {
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap++;
} else {
ap+=3;
}
}
}

View File

@ -26,18 +26,21 @@
#define PRECISION float
#define ACC_PRECISION double
#define MAX_ATOMS 65536
#define vec4 float4
#endif
#ifdef _DOUBLE_DOUBLE
#define PRECISION double
#define ACC_PRECISION double
#define MAX_ATOMS 32768
struct vec4 { double x; double y; double z; double w; };
#endif
#ifndef PRECISION
#define PRECISION float
#define ACC_PRECISION float
#define MAX_ATOMS 65536
#define vec4 float4
#endif
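// Note: in the single-precision builds vec4 aliases CUDA's built-in float4,
// so position+type (or a quaternion) is fetched in one 16-byte load; the
// double build substitutes a plain struct with the same member names.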
#include "nvc_timer.h"
@ -71,7 +74,8 @@ class PairGPUAtom {
/// Must be called once to allocate host and device memory
/** \note atom_fields and ans_fields should be set first if not default **/
void init(const int max_atoms);
bool init(const int max_atoms);
void resize(const int max_atoms, bool &success);
/// Free all memory on host and device
void clear();
@ -99,11 +103,35 @@ class PairGPUAtom {
for (int i=0; i<t; i+=stride) { *_write_loc=hostptr[i]; _write_loc++; }
}
/// Copy num_rows x nall() write buffer to x in GPU
/** num_rows<=atom_fields() **/
inline void copy_atom_data(const int num_rows, cudaStream_t &stream)
{ dev_x.copy_2Dfrom_host(host_write.begin(),num_rows,nall(),stream); }
/// Add positions to write buffer
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, const int *host_type) {
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
}
}
/// Add quaternions to write buffer
/** Copies nall() elements **/
template<class cpytyp>
inline void add_q_data(const cpytyp *host_ptr) {
const int end=_nall*4;
for (int i=0; i<end; i++) { *_write_loc=host_ptr[i]; _write_loc++; }
}
/// Copy packed positions+types (and, separately, quaternions) to the GPU
inline void copy_x_data(cudaStream_t &stream)
{ dev_x.copy_from_host(host_write.begin(),_nall*4,stream); }
inline void copy_q_data(cudaStream_t &stream)
{ dev_q.copy_from_host(host_write.begin()+_nall*4,_nall*4,stream); }
// -------------------------COPY FROM GPU -------------------------------
@ -113,22 +141,19 @@ class PairGPUAtom {
/// Copy energy and virial data into LAMMPS memory
double energy_virial(const int *ilist, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial);
/// Add forces from the GPU into a LAMMPS pointer
void add_forces(const int *ilist, double **f);
/// Add torques from the GPU into a LAMMPS pointer
void add_torques(const int *ilist, double **tor, const int n);
double *virial, double **f, double **tor, const int);
/// Add forces and torques from the GPU into a LAMMPS pointer
void copy_asphere(const int *ilist, double **f, double **tor, const int n);
// ------------------------------ DATA ----------------------------------
// atom_fields() x n (example: rows 1-3 position, 4 is type)
NVC_ConstMatT dev_x;
// ans_fields() x n Storage for Forces, etc.
// example: if (eflag and vflag) 1 is energy, 2-7 is virial, 8-10 is force
// example: if (!eflag) 1-3 is force
NVC_Mat<acctyp> ans;
// atom coordinates
NVC_Vec<numtyp> dev_x;
// quaternions
NVC_Vec<numtyp> dev_q;
// ans_fields()
// example: if (eflag and vflag) 1 is energy, 2-7 is virial, 8-13 is
// force+torque
NVC_Vec<acctyp> ans;
// Buffer for moving floating point data to GPU
NVC_HostT host_write;

View File

@ -17,6 +17,12 @@
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include <assert.h>
#include "lj_gpu_memory.h"
#include "pair_gpu_cell.h"

View File

@ -26,8 +26,9 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
return (max_nbors+3)*sizeof(int);
}
void PairGPUNbor::init(const int ij_size, const int max_atoms,
bool PairGPUNbor::init(const int ij_size, const int max_atoms,
const int max_nbors) {
bool success=true;
if (allocated)
clear();
@ -35,16 +36,29 @@ void PairGPUNbor::init(const int ij_size, const int max_atoms,
time_nbor.init();
if (_use_packing)
dev_nbor.safe_alloc(max_nbors+4,max_atoms);
success=success && dev_nbor.alloc((max_nbors+4)*max_atoms);
else
dev_nbor.safe_alloc(3,max_atoms);
success=success && dev_nbor.alloc(3*max_atoms);
ij.safe_alloc(max_nbors*max_atoms);
host_ij.safe_alloc_w(ij_size);
success=success && ij.alloc(max_nbors*max_atoms);
success=success && host_ij.alloc_w(ij_size);
allocated=true;
return success;
}
void PairGPUNbor::resize(const int nlocal, const int max_nbor, bool &success) {
dev_nbor.clear();
ij.clear();
if (_use_packing)
success=success && dev_nbor.alloc((max_nbor+4)*nlocal);
else
success=success && dev_nbor.alloc(3*nlocal);
success=success && ij.alloc(max_nbor*nlocal);
allocated=true;
}
void PairGPUNbor::clear() {
if (!allocated)
return;
@ -54,7 +68,7 @@ void PairGPUNbor::clear() {
host_ij.clear();
dev_nbor.clear();
}
double PairGPUNbor::host_memory_usage() const {
return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor);
}
@ -65,12 +79,35 @@ void PairGPUNbor::reset(const int inum, int *ilist, const int *numj,
dev_nbor.copy_from_host(ilist,inum);
int acc=0;
for (int i=0; i<inum; i++) {
host_ij[i]=numj[ilist[i]];
host_ij[i+inum]=acc;
acc+=numj[ilist[i]];
}
host_ij.copy_to_2Ddevice(dev_nbor.begin()+dev_nbor.row_size(),
dev_nbor.row_size(),2,inum, s);
int ij_size=host_ij.numel();
if (inum*2<ij_size) {
for (int i=0; i<inum; i++) {
host_ij[i]=numj[ilist[i]];
host_ij[i+inum]=acc;
acc+=numj[ilist[i]];
}
host_ij.copy_to_device(dev_nbor.begin()+inum,2*inum, s);
} else {
int offset=0;
int half=ij_size/2;
int hi=0;
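// host_ij is a fixed-size pinned staging buffer. When 2*inum entries do not
// fit, it is split in half: neighbor counts go in the lower half and prefix
// offsets in the upper, each full half is flushed with an async copy, and
// the stream is synchronized before the buffer is reused.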
for (int i=0; i<inum; i++) {
host_ij[hi]=numj[ilist[i]];
host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
hi++;
if (hi==half) {
host_ij.copy_to_device(dev_nbor.begin()+inum+offset,half,s);
host_ij.copy_to_device(half,dev_nbor.begin()+2*inum+offset,half,s);
offset+=half;
hi=0;
CUDA_SAFE_CALL(cudaStreamSynchronize(s));
}
}
if (hi>0) {
host_ij.copy_to_device(dev_nbor.begin()+inum+offset,hi,s);
host_ij.copy_to_device(half,dev_nbor.begin()+2*inum+offset,hi,s);
}
}
}

View File

@ -40,8 +40,10 @@ class PairGPUNbor {
void packing(const bool use_packing) { _use_packing=use_packing; }
/// Called once to allocate memory
void init(const int ij_size, const int max_atoms, const int max_nbors);
bool init(const int ij_size, const int max_atoms, const int max_nbors);
void resize(const int nlocal, const int max_nbor, bool &success);
/// Free all memory on host and device
void clear();
@ -73,7 +75,7 @@ class PairGPUNbor {
// - 1st row is i
// - 2nd row is numj (number of neighbors)
// - 3rd row is starting address in host_ij of neighbors
NVC_MatI dev_nbor;
NVC_VecI dev_nbor;
// --------------- Timing Stuff
NVCTimer time_nbor;

View File

@ -31,32 +31,6 @@
#define GB_GPU_DOUBLE
#endif
// ------------------------------- x ------------------------------------
static texture<float, 2, cudaReadModeElementType> x_float_tex;
static texture<int2, 2, cudaReadModeElementType> x_double_tex;
template <class numtyp> inline textureReference * x_get_texture() {
const textureReference *ptr;
cudaGetTextureReference(&ptr,"x_float_tex");
return const_cast<textureReference *>(ptr);
}
template <> inline textureReference * x_get_texture<double>() {
const textureReference *ptr;
cudaGetTextureReference(&ptr,"x_double_tex");
return const_cast<textureReference *>(ptr);
}
template <class numtyp>
static __inline__ __device__ numtyp _x_(const int i, const int j) {
return tex2D(x_float_tex,i,j);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _x_<double>(const int i,const int j) {
int2 t=tex2D(x_double_tex,i,j);
return __hiloint2double(t.y, t.x);
}
#endif
// ------------------------------- form ------------------------------------
static texture<int, 2, cudaReadModeElementType> form_tex;

View File

@ -17,6 +17,12 @@
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include "pair_gpu_atom.cu"
#include "lj_gpu.cu"
#include "gb_gpu.cu"