forked from lijiext/lammps
Adding changes from Mike Brown.
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3883 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
780ba6e377
commit
c0e0b65619
|
@ -74,13 +74,3 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,
|
|||
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
|
||||
package has been installed before installing the GPU package in LAMMPS.
|
||||
|
||||
GPU MEMORY
|
||||
|
||||
Upon initialization of the gayberne/gpu pair style, the library will reserve
|
||||
memory for 64K atoms per GPU or 70% of each cards GPU memory, whichever value
|
||||
is limiting. The value of 70% can be changed by editing the
|
||||
PERCENT_GPU_MEMORY definition in the source file. For gayberne/gpu, the value
|
||||
of 64K cannot be increased and is the maximum number of atoms allowed per
|
||||
GPU. Using the 'neigh_modify one' modifier in your LAMMPS input script
|
||||
can help to increase maximum number of atoms per GPU for cards with
|
||||
limited memory.
|
||||
|
|
|
@ -36,7 +36,7 @@ static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
|
|||
// -- Only pack neighbors within cutoff
|
||||
// ---------------------------------------------------------------------------
|
||||
template<class numtyp>
|
||||
__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
|
||||
__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
|
||||
const int start, const int inum,
|
||||
const int *dev_ij, const int form_low,
|
||||
const int form_high, const int nall) {
|
||||
|
@ -56,25 +56,24 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
|
|||
int *nbor_newj=nbor;
|
||||
nbor+=nbor_pitch;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=_x_<numtyp>(i,7);
|
||||
vec4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
int newj=0;
|
||||
for ( ; list<list_end; list++) {
|
||||
int j=*list;
|
||||
if (j>=nall)
|
||||
j%=nall;
|
||||
int jtype=_x_<numtyp>(j,7);
|
||||
vec4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
|
||||
// Compute r12;
|
||||
numtyp rsq=_x_<numtyp>(j,0)-ix;
|
||||
numtyp rsq=jx.x-ix.x;
|
||||
rsq*=rsq;
|
||||
numtyp t=_x_<numtyp>(j,1)-iy;
|
||||
numtyp t=jx.y-ix.y;
|
||||
rsq+=t*t;
|
||||
t=_x_<numtyp>(j,2)-iz;
|
||||
t=jx.z-ix.z;
|
||||
rsq+=t*t;
|
||||
|
||||
if (rsq< _cutsq_<numtyp>(itype,jtype)) {
|
||||
|
@ -95,7 +94,7 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
|
|||
// -- Fast version of routine that uses shared memory for LJ constants
|
||||
// ---------------------------------------------------------------------------
|
||||
template<class numtyp>
|
||||
__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
|
||||
__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
|
||||
const int start, const int inum,
|
||||
const int *dev_ij, const int form_low,
|
||||
const int form_high, const int nall) {
|
||||
|
@ -124,25 +123,25 @@ __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
|
|||
int *nbor_newj=nbor;
|
||||
nbor+=nbor_pitch;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
|
||||
vec4 ix=x_[i];
|
||||
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
|
||||
|
||||
int newj=0;
|
||||
for ( ; list<list_end; list++) {
|
||||
int j=*list;
|
||||
if (j>=nall)
|
||||
j%=nall;
|
||||
int mtype=itype+_x_<numtyp>(j,7);
|
||||
vec4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
int mtype=itype+jtype;
|
||||
|
||||
if (form[mtype]>=form_low && form[mtype]<=form_high) {
|
||||
// Compute r12;
|
||||
numtyp rsq=_x_<numtyp>(j,0)-ix;
|
||||
numtyp rsq=jx.x-ix.x;
|
||||
rsq*=rsq;
|
||||
numtyp t=_x_<numtyp>(j,1)-iy;
|
||||
numtyp t=jx.y-ix.y;
|
||||
rsq+=t*t;
|
||||
t=_x_<numtyp>(j,2)-iz;
|
||||
t=jx.z-ix.z;
|
||||
rsq+=t*t;
|
||||
|
||||
if (rsq<cutsq[mtype]) {
|
||||
|
@ -159,13 +158,15 @@ __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
|
|||
template<class numtyp, class acctyp>
|
||||
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
|
||||
const int inum, const int form_low, const int form_high) {
|
||||
if (gbm.shared_types)
|
||||
if (gbm.shared_types) {
|
||||
kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
(gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
|
||||
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
|
||||
gbm.atom.inum(), start, inum,
|
||||
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
|
||||
else
|
||||
} else
|
||||
kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
(gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
|
||||
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
|
||||
gbm.atom.inum(), start, inum,
|
||||
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
|
||||
}
|
||||
|
||||
|
@ -188,9 +189,7 @@ EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
|
|||
string sname=GBMF[0].gpu.name(id)+", "+
|
||||
gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
|
||||
gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
|
||||
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+
|
||||
gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
|
||||
max_nbors))+" Atoms";
|
||||
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
|
||||
strcpy(name,sname.c_str());
|
||||
}
|
||||
|
||||
|
@ -203,6 +202,7 @@ EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
|
|||
double **epsilon, double *host_lshape, int **form,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **offset, double *special_lj,
|
||||
const int nlocal, const int nall,
|
||||
const int max_nbors, const int thread, const int gpu_id) {
|
||||
assert(thread<MAX_GPU_THREADS);
|
||||
|
||||
|
@ -215,7 +215,8 @@ EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
|
|||
return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
|
||||
well, cutsq, sigma, epsilon, host_lshape, form,
|
||||
host_lj1, host_lj2, host_lj3, host_lj4, offset,
|
||||
special_lj, max_nbors, false, gpu_id);
|
||||
special_lj, nlocal, nall, max_nbors, false,
|
||||
gpu_id);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -230,34 +231,23 @@ EXTERN void gb_gpu_clear(const int thread) {
|
|||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
|
||||
double **host_quat, const int *host_type,
|
||||
const bool rebuild, cudaStream_t &stream) {
|
||||
double **host_quat, const int *host_type,
|
||||
const bool rebuild, cudaStream_t &stream) {
|
||||
atom.time_atom.start();
|
||||
atom.reset_write_buffer();
|
||||
|
||||
// Rows 1-3 of dev_x are position; rows 4-7 are quaternion
|
||||
atom.add_atom_data(host_x[0],3);
|
||||
atom.add_atom_data(host_x[0]+1,3);
|
||||
atom.add_atom_data(host_x[0]+2,3);
|
||||
atom.add_atom_data(host_quat[0],4);
|
||||
atom.add_atom_data(host_quat[0]+1,4);
|
||||
atom.add_atom_data(host_quat[0]+2,4);
|
||||
atom.add_atom_data(host_quat[0]+3,4);
|
||||
|
||||
int csize=7;
|
||||
|
||||
// If a rebuild occured, copy type data
|
||||
if (rebuild) {
|
||||
atom.add_atom_data(host_type);
|
||||
csize++;
|
||||
}
|
||||
|
||||
atom.copy_atom_data(csize,stream);
|
||||
atom.add_x_data(host_x,host_type);
|
||||
atom.add_q_data(host_quat[0]);
|
||||
|
||||
atom.copy_x_data(stream);
|
||||
atom.copy_q_data(stream);
|
||||
atom.time_atom.stop();
|
||||
}
|
||||
|
||||
EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
|
||||
const int *host_type, const bool rebuild, const int thread) {
|
||||
const int *host_type, const bool rebuild,
|
||||
const int thread) {
|
||||
_gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
|
||||
GBMF[thread].pair_stream);
|
||||
}
|
||||
|
@ -269,46 +259,111 @@ template <class gbmtyp>
|
|||
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
|
||||
const int inum, int *ilist, const int *numj,
|
||||
const int *type, bool &success) {
|
||||
if (nall>gbm.max_atoms) {
|
||||
success=false;
|
||||
return 0;
|
||||
}
|
||||
success=true;
|
||||
|
||||
gbm.nbor.time_nbor.start();
|
||||
|
||||
int mn=0;
|
||||
for (int i=0; i<inum; i++)
|
||||
mn=std::max(mn,numj[i]);
|
||||
|
||||
if (nall>gbm.max_atoms)
|
||||
gbm.resize_atom(nall,success);
|
||||
if (nlocal>gbm.max_local || mn>gbm._max_nbors)
|
||||
gbm.resize_local(nlocal,mn,success);
|
||||
if (!success)
|
||||
return false;
|
||||
|
||||
gbm.atom.nall(nall);
|
||||
gbm.atom.inum(inum);
|
||||
|
||||
if (gbm.multiple_forms) {
|
||||
int p=0, acc=0;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[p]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[p+inum]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
int ij_size=gbm.nbor.host_ij.numel();
|
||||
if (inum*2<ij_size) {
|
||||
int p=0, acc=0;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[p]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[p+inum]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
}
|
||||
}
|
||||
}
|
||||
gbm.last_ellipse=p;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[p]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[p+inum]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
gbm.last_ellipse=p;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[p]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[p+inum]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
}
|
||||
}
|
||||
gbm.nbor.ij_total=0;
|
||||
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
|
||||
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum,
|
||||
2*inum,gbm.pair_stream);
|
||||
} else {
|
||||
int p=0, acc=0;
|
||||
int offset=0;
|
||||
int half=ij_size/2;
|
||||
int hi=0;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[hi]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[hi+half]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
hi++;
|
||||
if (hi==half) {
|
||||
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
|
||||
half,gbm.pair_stream);
|
||||
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
|
||||
inum*2+offset,
|
||||
half,gbm.pair_stream);
|
||||
hi=0;
|
||||
offset+=half;
|
||||
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
|
||||
}
|
||||
}
|
||||
}
|
||||
gbm.last_ellipse=p;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[hi]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[hi+half]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
hi++;
|
||||
if (hi==half) {
|
||||
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
|
||||
half,gbm.pair_stream);
|
||||
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
|
||||
inum*2+offset,
|
||||
half,gbm.pair_stream);
|
||||
hi=0;
|
||||
offset+=half;
|
||||
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
|
||||
}
|
||||
}
|
||||
}
|
||||
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
|
||||
if (hi>0) {
|
||||
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
|
||||
hi,gbm.pair_stream);
|
||||
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
|
||||
inum*2+offset,
|
||||
hi,gbm.pair_stream);
|
||||
}
|
||||
gbm.nbor.ij_total=0;
|
||||
}
|
||||
gbm.nbor.ij_total=0;
|
||||
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
|
||||
gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
|
||||
gbm.nbor.dev_nbor.row_size(),
|
||||
gbm.nbor.dev_nbor.row_size(),2,inum,
|
||||
gbm.pair_stream);
|
||||
} else {
|
||||
gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
|
||||
gbm.last_ellipse=inum;
|
||||
|
@ -349,6 +404,14 @@ EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
|
|||
_gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
|
||||
}
|
||||
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
|
||||
gbm.atom.time_answer.start();
|
||||
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
|
||||
gbm.atom.time_answer.stop();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques for all ij interactions
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -357,7 +420,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
|
|||
const bool rebuild) {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=BLOCK_1D;
|
||||
|
||||
int ans_pitch=6;
|
||||
if (eflag)
|
||||
ans_pitch++;
|
||||
if (vflag)
|
||||
ans_pitch+=6;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
|
||||
|
||||
if (gbm.multiple_forms) {
|
||||
|
@ -371,9 +439,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
|
|||
|
||||
gbm.time_gayberne.start();
|
||||
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
|
||||
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
|
||||
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
|
||||
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
|
||||
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
|
||||
eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
|
||||
gbm.time_gayberne.stop();
|
||||
|
||||
|
@ -393,14 +462,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
|
|||
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
|
||||
gbm.last_ellipse)/BX));
|
||||
pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
|
||||
ELLIPSE_SPHERE);
|
||||
ELLIPSE_SPHERE);
|
||||
gbm.time_kernel2.stop();
|
||||
|
||||
gbm.time_gayberne2.start();
|
||||
kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
|
||||
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
|
||||
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
|
||||
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
|
||||
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
|
||||
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
|
||||
gbm.time_gayberne2.stop();
|
||||
} else {
|
||||
|
@ -419,16 +489,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
|
|||
if (gbm.last_ellipse<gbm.atom.inum()) {
|
||||
if (gbm.shared_types)
|
||||
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
(gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
|
||||
gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
|
||||
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
|
||||
gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
|
||||
gbm.atom.inum(), gbm.atom.nall());
|
||||
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
|
||||
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
|
||||
vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
|
||||
else
|
||||
kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
(gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
|
||||
gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
|
||||
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
|
||||
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
|
||||
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
|
||||
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
|
||||
}
|
||||
gbm.time_pair.stop();
|
||||
|
@ -437,19 +506,21 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
|
|||
pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
|
||||
gbm.time_kernel.stop();
|
||||
|
||||
gbm.time_gayberne.start();
|
||||
gbm.time_gayberne.start();
|
||||
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
|
||||
gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
|
||||
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
|
||||
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
|
||||
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(),
|
||||
eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
|
||||
gbm.time_gayberne.stop();
|
||||
}
|
||||
}
|
||||
|
||||
EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
|
||||
const int thread) {
|
||||
const int thread) {
|
||||
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
|
||||
_gb_gpu_enqueue<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -462,9 +533,6 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
|
|||
double *virial) {
|
||||
double evdw;
|
||||
|
||||
gbm.atom.time_answer.start();
|
||||
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
|
||||
|
||||
gbm.atom.time_atom.add_to_total();
|
||||
gbm.nbor.time_nbor.add_to_total();
|
||||
gbm.time_kernel.add_to_total();
|
||||
|
@ -475,11 +543,19 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
|
|||
gbm.time_pair.add_to_total();
|
||||
}
|
||||
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
|
||||
|
||||
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
|
||||
gbm.atom.add_forces(ilist,f);
|
||||
gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
|
||||
gbm.atom.time_answer.stop();
|
||||
if (gbm.last_ellipse>gbm.atom.inum()) {
|
||||
if (eflag || vflag)
|
||||
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
|
||||
f,tor,gbm.atom.inum());
|
||||
else
|
||||
gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
|
||||
} else {
|
||||
if (eflag || vflag)
|
||||
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
|
||||
f,tor,gbm.last_ellipse);
|
||||
else
|
||||
gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
|
||||
}
|
||||
gbm.atom.time_answer.add_to_total();
|
||||
return evdw;
|
||||
}
|
||||
|
@ -516,3 +592,4 @@ EXTERN int gb_gpu_num_devices() {
|
|||
EXTERN double gb_gpu_bytes() {
|
||||
return GBMF[0].host_memory_usage();
|
||||
}
|
||||
|
||||
|
|
|
@ -298,24 +298,22 @@ static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
|
|||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi,
|
||||
static __inline__ __device__ void gpu_quat_to_mat_trans(const vec4 *qif,
|
||||
const int qi,
|
||||
numtyp mat[9])
|
||||
{
|
||||
numtyp qi3=_x_<numtyp>(qi,3);
|
||||
numtyp qi4=_x_<numtyp>(qi,4);
|
||||
numtyp qi5=_x_<numtyp>(qi,5);
|
||||
numtyp qi6=_x_<numtyp>(qi,6);
|
||||
|
||||
numtyp w2 = qi3*qi3;
|
||||
numtyp i2 = qi4*qi4;
|
||||
numtyp j2 = qi5*qi5;
|
||||
numtyp k2 = qi6*qi6;
|
||||
numtyp twoij = (numtyp)2.0*qi4*qi5;
|
||||
numtyp twoik = (numtyp)2.0*qi4*qi6;
|
||||
numtyp twojk = (numtyp)2.0*qi5*qi6;
|
||||
numtyp twoiw = (numtyp)2.0*qi4*qi3;
|
||||
numtyp twojw = (numtyp)2.0*qi5*qi3;
|
||||
numtyp twokw = (numtyp)2.0*qi6*qi3;
|
||||
vec4 q=qif[qi];
|
||||
|
||||
numtyp w2 = q.x*q.x;
|
||||
numtyp i2 = q.y*q.y;
|
||||
numtyp j2 = q.z*q.z;
|
||||
numtyp k2 = q.w*q.w;
|
||||
numtyp twoij = (numtyp)2.0*q.y*q.z;
|
||||
numtyp twoik = (numtyp)2.0*q.y*q.w;
|
||||
numtyp twojk = (numtyp)2.0*q.z*q.w;
|
||||
numtyp twoiw = (numtyp)2.0*q.y*q.x;
|
||||
numtyp twojw = (numtyp)2.0*q.z*q.x;
|
||||
numtyp twokw = (numtyp)2.0*q.w*q.x;
|
||||
|
||||
mat[0] = w2+i2-j2-k2;
|
||||
mat[3] = twoij-twokw;
|
||||
|
|
|
@ -92,16 +92,19 @@ static __inline__ __device__ void compute_eta_torque(numtyp m[9],
|
|||
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
|
||||
}
|
||||
|
||||
#include "gb_gpu_kernel.h"
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
|
||||
const int *dev_nbor, const int nbor_pitch,
|
||||
__global__ void kernel_gayberne(const vec4* x_, const vec4 *q,
|
||||
const numtyp *gum, const numtyp *special_lj,
|
||||
const int *dev_nbor, const size_t nbor_pitch,
|
||||
acctyp *ans, size_t ans_pitch, int *err_flag,
|
||||
const bool eflag, const bool vflag,
|
||||
const int inum, const int nall) {
|
||||
|
||||
__shared__ numtyp sp_lj[4];
|
||||
|
||||
// ii indexes the two interacting particles in gi
|
||||
// ii indexes the two interacting particles in gi
|
||||
int ii=threadIdx.x;
|
||||
if (ii<4)
|
||||
sp_lj[ii]=special_lj[ii];
|
||||
|
@ -109,7 +112,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
|
|||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
|
||||
|
||||
acctyp energy=(numtyp)0;
|
||||
acctyp fx=(numtyp)0;
|
||||
acctyp fy=(numtyp)0;
|
||||
|
@ -128,16 +131,14 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
|
|||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
|
||||
const int *nbor_end=nbor+nbor_pitch*numj;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=_x_<numtyp>(i,7);
|
||||
vec4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
numtyp a1[9], b1[9], g1[9];
|
||||
{
|
||||
numtyp t[9];
|
||||
gpu_quat_to_mat_trans(i,a1);
|
||||
gpu_quat_to_mat_trans(q,i,a1);
|
||||
gpu_shape_times3(itype,a1,t);
|
||||
gpu_transpose_times3(a1,t,g1);
|
||||
gpu_well_times3(itype,a1,t);
|
||||
|
@ -146,7 +147,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
|
|||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
||||
|
||||
|
||||
int j=*nbor;
|
||||
if (j < nall)
|
||||
factor_lj = (numtyp)1.0;
|
||||
|
@ -154,20 +155,21 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
|
|||
factor_lj = sp_lj[j/nall];
|
||||
j %= nall;
|
||||
}
|
||||
int jtype=_x_<numtyp>(j,7);
|
||||
vec4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp r12[3];
|
||||
r12[0] = _x_<numtyp>(j,0)-ix;
|
||||
r12[1] = _x_<numtyp>(j,1)-iy;
|
||||
r12[2] = _x_<numtyp>(j,2)-iz;
|
||||
r12[0] = jx.x-ix.x;
|
||||
r12[1] = jx.y-ix.y;
|
||||
r12[2] = jx.z-ix.z;
|
||||
numtyp ir = gpu_dot3(r12,r12);
|
||||
|
||||
ir = rsqrt(ir);
|
||||
numtyp r = (numtyp)1.0/ir;
|
||||
|
||||
numtyp a2[9];
|
||||
gpu_quat_to_mat_trans(j,a2);
|
||||
gpu_quat_to_mat_trans(q,j,a2);
|
||||
|
||||
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
|
||||
{ // Compute U_r, dUr, eta, and teta
|
||||
|
@ -366,35 +368,37 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
|
|||
} // for nbor
|
||||
|
||||
// Store answers
|
||||
acctyp *ap1=ans+ii;
|
||||
acctyp *ap1=ans+ii*ans_pitch;
|
||||
if (eflag) {
|
||||
*ap1=energy;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
}
|
||||
if (vflag) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
}
|
||||
}
|
||||
*ap1=fx;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1=fy;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1=fz;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1=torx;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1=tory;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1=torz;
|
||||
|
||||
} // if ii
|
||||
|
||||
}
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
|
||||
const int *dev_nbor, const int nbor_pitch,
|
||||
__global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q,
|
||||
const numtyp *gum, const numtyp *special_lj,
|
||||
const int *dev_nbor, const size_t nbor_pitch,
|
||||
acctyp *ans, size_t ans_pitch, int *err_flag,
|
||||
const bool eflag, const bool vflag,
|
||||
const int start, const int inum,
|
||||
|
@ -425,12 +429,10 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
|
|||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
|
||||
const int *nbor_end=nbor+nbor_pitch*numj;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=_x_<numtyp>(i,7);
|
||||
vec4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp oner=_shape_<numtyp>(itype,0);
|
||||
numtyp one_well=_well_<numtyp>(itype,0);
|
||||
|
@ -445,13 +447,14 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
|
|||
factor_lj = sp_lj[j/nall];
|
||||
j %= nall;
|
||||
}
|
||||
int jtype=_x_<numtyp>(j,7);
|
||||
vec4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp r12[3];
|
||||
r12[0] = _x_<numtyp>(j,0)-ix;
|
||||
r12[1] = _x_<numtyp>(j,1)-iy;
|
||||
r12[2] = _x_<numtyp>(j,2)-iz;
|
||||
r12[0] = jx.x-ix.x;
|
||||
r12[1] = jx.y-ix.y;
|
||||
r12[2] = jx.z-ix.z;
|
||||
numtyp ir = gpu_dot3(r12,r12);
|
||||
|
||||
ir = rsqrt(ir);
|
||||
|
@ -463,7 +466,7 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
|
|||
r12hat[2]=r12[2]*ir;
|
||||
|
||||
numtyp a2[9];
|
||||
gpu_quat_to_mat_trans(j,a2);
|
||||
gpu_quat_to_mat_trans(q,j,a2);
|
||||
|
||||
numtyp u_r, dUr[3], eta;
|
||||
{ // Compute U_r, dUr, eta, and teta
|
||||
|
@ -611,28 +614,29 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
|
|||
} // for nbor
|
||||
|
||||
// Store answers
|
||||
acctyp *ap1=ans+ii;
|
||||
acctyp *ap1=ans+ii*ans_pitch;
|
||||
if (eflag) {
|
||||
*ap1=energy;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
}
|
||||
if (vflag) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
}
|
||||
}
|
||||
*ap1=fx;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1=fy;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1=fz;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
|
||||
const int *dev_ij, const int nbor_pitch, acctyp *ans,
|
||||
__global__ void kernel_lj(const vec4 *x_,
|
||||
const numtyp *special_lj, const int *dev_nbor,
|
||||
const size_t nbor_pitch, const int *dev_ij, acctyp *ans,
|
||||
size_t ans_pitch, int *err_flag, const bool eflag,
|
||||
const bool vflag, const int start, const int inum,
|
||||
const int nall) {
|
||||
|
@ -663,10 +667,8 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
|
|||
const int *list=dev_ij+*nbor;
|
||||
const int *list_end=list+numj;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=_x_<numtyp>(i,7);
|
||||
vec4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; list<list_end; list++) {
|
||||
|
@ -678,12 +680,13 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
|
|||
factor_lj = sp_lj[j/nall];
|
||||
j %= nall;
|
||||
}
|
||||
int jtype=_x_<numtyp>(j,7);
|
||||
vec4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix-_x_<numtyp>(j,0);
|
||||
numtyp dely = iy-_x_<numtyp>(j,1);
|
||||
numtyp delz = iz-_x_<numtyp>(j,2);
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
|
||||
|
@ -716,29 +719,30 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
|
|||
} // for nbor
|
||||
|
||||
// Store answers
|
||||
acctyp *ap1=ans+ii;
|
||||
acctyp *ap1=ans+ii*ans_pitch;
|
||||
if (eflag) {
|
||||
*ap1+=energy;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
}
|
||||
if (vflag) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
}
|
||||
}
|
||||
*ap1+=fx;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1+=fy;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1+=fz;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
|
||||
const int *dev_ij, const int nbor_pitch,
|
||||
__global__ void kernel_lj_fast(const vec4 *x_,
|
||||
const numtyp *special_lj, const int *dev_nbor,
|
||||
const size_t nbor_pitch, const int *dev_ij,
|
||||
acctyp *ans, size_t ans_pitch,int *err_flag,
|
||||
const bool eflag, const bool vflag,
|
||||
const int start, const int inum, const int nall){
|
||||
|
@ -788,10 +792,8 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
|
|||
const int *list=dev_ij+*nbor;
|
||||
const int *list_end=list+numj;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
|
||||
vec4 ix=x_[i];
|
||||
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; list<list_end; list++) {
|
||||
|
@ -803,12 +805,13 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
|
|||
factor_lj = sp_lj[j/nall];
|
||||
j %= nall;
|
||||
}
|
||||
int mtype=itype+_x_<numtyp>(j,7);
|
||||
vec4 jx=x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix-_x_<numtyp>(j,0);
|
||||
numtyp dely = iy-_x_<numtyp>(j,1);
|
||||
numtyp delz = iz-_x_<numtyp>(j,2);
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
|
||||
|
@ -837,21 +840,21 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
|
|||
} // for nbor
|
||||
|
||||
// Store answers
|
||||
acctyp *ap1=ans+ii;
|
||||
acctyp *ap1=ans+ii*ans_pitch;
|
||||
if (eflag) {
|
||||
*ap1+=energy;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
}
|
||||
if (vflag) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
}
|
||||
}
|
||||
*ap1+=fx;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1+=fy;
|
||||
ap1+=ans_pitch;
|
||||
ap1++;
|
||||
*ap1+=fz;
|
||||
|
||||
} // if ii
|
||||
|
|
|
@ -41,14 +41,17 @@ bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
|
|||
double *host_lshape, int **h_form, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, const int max_nbors,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const bool force_d, const int me) {
|
||||
_max_nbors=max_nbors;
|
||||
if (this->allocated)
|
||||
clear();
|
||||
|
||||
bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
|
||||
host_lj1, host_lj2, host_lj3, host_lj4,
|
||||
host_offset, host_special_lj, max_nbors, me);
|
||||
host_offset, host_special_lj, max_nbors, me,
|
||||
nlocal, nall);
|
||||
if (!p)
|
||||
return false;
|
||||
|
||||
|
@ -96,11 +99,28 @@ bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
|
|||
multiple_forms=true;
|
||||
|
||||
// Memory for ilist ordered by particle type
|
||||
host_olist.safe_alloc_rw(this->max_atoms);
|
||||
|
||||
return true;
|
||||
return host_olist.alloc_rw(this->max_local);
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void GB_GPU_MemoryT::resize_atom(const int nall, bool &success) {
|
||||
this->max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
this->atom.resize(this->max_atoms, success);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void GB_GPU_MemoryT::resize_local(const int nlocal, const int max_nbors,
|
||||
bool &success) {
|
||||
if (nlocal>this->max_local) {
|
||||
this->max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
|
||||
host_olist.clear();
|
||||
success=success && host_olist.alloc_rw(this->max_local);
|
||||
}
|
||||
if (max_nbors>_max_nbors)
|
||||
_max_nbors=static_cast<int>(static_cast<double>(max_nbors)*1.10);
|
||||
this->nbor.resize(this->max_local,_max_nbors,success);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void GB_GPU_MemoryT::clear() {
|
||||
if (!this->allocated)
|
||||
|
|
|
@ -37,8 +37,11 @@ class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
|
|||
double **host_epsilon, double *host_lshape, int **h_form,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset, double *host_special_lj,
|
||||
const int max_nbors, const bool force_d, const int me);
|
||||
const int max_nbors, const int nlocal, const int nall,
|
||||
const bool force_d, const int me);
|
||||
|
||||
void resize_atom(const int nall, bool &success);
|
||||
void resize_local(const int nlocal, const int max_nbors, bool &success);
|
||||
void clear();
|
||||
|
||||
double host_memory_usage();
|
||||
|
@ -61,6 +64,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
|
|||
bool multiple_forms;
|
||||
int **host_form;
|
||||
int last_ellipse;
|
||||
int _max_nbors;
|
||||
|
||||
private:
|
||||
};
|
||||
|
|
|
@ -74,7 +74,8 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
|
|||
ij_size=IJ_SIZE;
|
||||
|
||||
bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
|
||||
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
|
||||
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id,
|
||||
0,0);
|
||||
|
||||
ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
|
||||
ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
|
||||
|
@ -101,37 +102,6 @@ EXTERN void lj_gpu_clear() {
|
|||
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies and forces for all ij interactions
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=BLOCK_1D;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));
|
||||
|
||||
ljm.time_pair.start();
|
||||
|
||||
if (ljm.shared_types)
|
||||
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
|
||||
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
|
||||
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
|
||||
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
|
||||
vflag, ljm.atom.inum(), ljm.atom.nall());
|
||||
else
|
||||
kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
|
||||
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
|
||||
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
|
||||
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
|
||||
vflag, ljm.atom.inum(), ljm.atom.nall());
|
||||
ljm.time_pair.stop();
|
||||
}
|
||||
|
||||
EXTERN void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
|
||||
_lj_gpu<PRECISION,ACC_PRECISION>(LJMF, eflag,vflag,rebuild);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
|
||||
double **host_x, int *host_type, const int inum,
|
||||
|
|
|
@ -217,229 +217,4 @@ __global__ void kernel_lj_cell(float3 *force3,
|
|||
|
||||
}
|
||||
|
||||
|
||||
/* Neigbhor list version of LJ kernel */
|
||||
template<class numtyp, class acctyp>
|
||||
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
|
||||
const int *dev_ij, const int nbor_pitch, acctyp *ans,
|
||||
size_t ans_pitch, const bool eflag,
|
||||
const bool vflag, const int inum, const int nall) {
|
||||
__shared__ numtyp sp_lj[4];
|
||||
|
||||
// ii indexes the two interacting particles in gi
|
||||
int ii=threadIdx.x;
|
||||
if (ii<4)
|
||||
sp_lj[ii]=special_lj[ii];
|
||||
ii+=INT_MUL(blockIdx.x,blockDim.x);
|
||||
|
||||
if (ii<inum) {
|
||||
|
||||
acctyp energy=(numtyp)0;
|
||||
acctyp fx=(numtyp)0;
|
||||
acctyp fy=(numtyp)0;
|
||||
acctyp fz=(numtyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(numtyp)0;
|
||||
|
||||
const int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
const int *list=dev_ij+*nbor;
|
||||
const int *list_end=list+numj;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=_x_<numtyp>(i,3);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; list<list_end; list++) {
|
||||
|
||||
int j=*list;
|
||||
if (j < nall)
|
||||
factor_lj = 1.0;
|
||||
else {
|
||||
factor_lj = sp_lj[j/nall];
|
||||
j %= nall;
|
||||
}
|
||||
int jtype=_x_<numtyp>(j,3);
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix-_x_<numtyp>(j,0);
|
||||
numtyp dely = iy-_x_<numtyp>(j,1);
|
||||
numtyp delz = iz-_x_<numtyp>(j,2);
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (r2inv<_cutsq_<numtyp>(itype,jtype)) {
|
||||
r2inv=(numtyp)1.0/r2inv;
|
||||
numtyp r6inv =r2inv*r2inv*r2inv;
|
||||
numtyp force =factor_lj*r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
|
||||
_lj1_<numtyp>(itype,jtype).y);
|
||||
|
||||
fx+=delx*force;
|
||||
fy+=dely*force;
|
||||
fz+=delz*force;
|
||||
|
||||
if (eflag) {
|
||||
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
|
||||
_lj3_<numtyp>(itype,jtype).y);
|
||||
energy+=factor_lj*(e-_offset_<numtyp>(1,1));
|
||||
}
|
||||
if (vflag) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
|
||||
// Store answers
|
||||
acctyp *ap1=ans+ii;
|
||||
if (eflag) {
|
||||
*ap1=energy;
|
||||
ap1+=ans_pitch;
|
||||
}
|
||||
if (vflag) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=ans_pitch;
|
||||
}
|
||||
}
|
||||
*ap1=fx;
|
||||
ap1+=ans_pitch;
|
||||
*ap1=fy;
|
||||
ap1+=ans_pitch;
|
||||
*ap1=fz;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
|
||||
const int *dev_ij, const int nbor_pitch,
|
||||
acctyp *ans, size_t ans_pitch,const bool eflag,
|
||||
const bool vflag, const int inum,
|
||||
const int nall) {
|
||||
|
||||
// ii indexes the two interacting particles in gi
|
||||
int ii=threadIdx.x;
|
||||
__shared__ numtyp sp_lj[4];
|
||||
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
if (ii<4)
|
||||
sp_lj[ii]=special_lj[ii];
|
||||
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
int itype=ii/MAX_SHARED_TYPES;
|
||||
int jtype=ii%MAX_SHARED_TYPES;
|
||||
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
|
||||
lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
|
||||
lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
|
||||
if (eflag) {
|
||||
lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
|
||||
lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
|
||||
offset[ii]=_offset_<numtyp>(itype,jtype);
|
||||
}
|
||||
}
|
||||
ii+=INT_MUL(blockIdx.x,blockDim.x);
|
||||
|
||||
if (ii<inum) {
|
||||
|
||||
acctyp energy=(numtyp)0;
|
||||
acctyp fx=(numtyp)0;
|
||||
acctyp fy=(numtyp)0;
|
||||
acctyp fz=(numtyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(numtyp)0;
|
||||
|
||||
const int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
const int *list=dev_ij+*nbor;
|
||||
const int *list_end=list+numj;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));
|
||||
|
||||
numtyp factor_lj;
|
||||
|
||||
for ( ; list<list_end; list++) {
|
||||
|
||||
int j= *list;
|
||||
|
||||
if (j < nall)
|
||||
factor_lj = 1.0;
|
||||
else {
|
||||
factor_lj = sp_lj[j/nall];
|
||||
j %= nall;
|
||||
}
|
||||
int mtype=itype+_x_<numtyp>(j,3);
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix-_x_<numtyp>(j,0);
|
||||
numtyp dely = iy-_x_<numtyp>(j,1);
|
||||
numtyp delz = iz-_x_<numtyp>(j,2);
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (r2inv<cutsq[mtype]) {
|
||||
r2inv=(numtyp)1.0/r2inv;
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);
|
||||
|
||||
fx+=delx*force;
|
||||
fy+=dely*force;
|
||||
fz+=delz*force;
|
||||
|
||||
if (eflag) {
|
||||
numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
|
||||
energy+=factor_lj*(e-offset[mtype]);
|
||||
}
|
||||
if (vflag) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
|
||||
// Store answers
|
||||
acctyp *ap1=ans+ii;
|
||||
if (eflag) {
|
||||
*ap1=energy;
|
||||
ap1+=ans_pitch;
|
||||
}
|
||||
if (vflag) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=ans_pitch;
|
||||
}
|
||||
}
|
||||
*ap1=fx;
|
||||
ap1+=ans_pitch;
|
||||
*ap1=fy;
|
||||
ap1+=ans_pitch;
|
||||
*ap1=fz;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -25,15 +25,6 @@ int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
|
|||
return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
|
||||
int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
|
||||
bytes_per_atom(max_nbors));
|
||||
if (matoms>MAX_ATOMS)
|
||||
matoms=MAX_ATOMS;
|
||||
return matoms;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
|
||||
double **host_cutsq, double **host_sigma,
|
||||
|
@ -41,7 +32,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
|
|||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, const int max_nbors,
|
||||
const int me) {
|
||||
const int me, const int nlocal, const int nall) {
|
||||
if (allocated)
|
||||
clear();
|
||||
|
||||
|
@ -55,9 +46,18 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
|
|||
time_pair.init();
|
||||
|
||||
// Initialize atom and nbor data
|
||||
max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
|
||||
atom.init(max_atoms);
|
||||
nbor.init(ij_size,max_atoms,max_nbors);
|
||||
max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
|
||||
if (max_local==0)
|
||||
max_local=1000;
|
||||
if (nall<=nlocal)
|
||||
max_atoms=max_local*2;
|
||||
else
|
||||
max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
|
||||
if (!atom.init(max_atoms))
|
||||
return false;
|
||||
if (!nbor.init(ij_size,max_local,max_nbors))
|
||||
return false;
|
||||
|
||||
// Get a stream for computing pair potentials
|
||||
CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
|
||||
|
@ -112,7 +112,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
|
|||
allocated=true;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void LJ_GPU_MemoryT::clear() {
|
||||
if (!allocated)
|
||||
|
|
|
@ -28,7 +28,6 @@
|
|||
#define BLOCK_1D 64 // max value = 256
|
||||
#define CELL_SIZE BLOCK_1D
|
||||
#define MAX_SHARED_TYPES 8
|
||||
#define PERCENT_GPU_MEMORY 0.7
|
||||
#define BIG_NUMBER 100000000
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
@ -42,14 +41,13 @@ class LJ_GPU_Memory {
|
|||
double **host_sigma, double **host_epsilon,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset, double *host_special_lj,
|
||||
const int max_nbors, const int me);
|
||||
const int max_nbors, const int me, const int nlocal,
|
||||
const int nall);
|
||||
/// Free any memory on host and device
|
||||
void clear();
|
||||
|
||||
/// Returns memory usage on GPU per atom
|
||||
int bytes_per_atom(const int max_nbors) const;
|
||||
/// Maximum number of atoms that can be stored on GPU
|
||||
int get_max_atoms(const size_t gpu_bytes, const int max_nbors);
|
||||
/// Total host memory used by library
|
||||
double host_memory_usage() const;
|
||||
|
||||
|
@ -72,7 +70,7 @@ class LJ_GPU_Memory {
|
|||
NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3;
|
||||
NVC_VecT special_lj;
|
||||
|
||||
size_t max_atoms;
|
||||
size_t max_atoms, max_local;
|
||||
|
||||
// Timing for pair calculation
|
||||
NVCTimer time_pair;
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
|
@ -17,6 +16,12 @@
|
|||
Paul Crozier (SNL), pscrozi@sandia.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#if _GLIBCXX_ATOMIC_BUILTINS == 1
|
||||
#undef _GLIBCXX_ATOMIC_BUILTINS
|
||||
#endif // _GLIBCXX_ATOMIC_BUILTINS
|
||||
#endif // __APPLE__
|
||||
|
||||
#include "nvc_device.h"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
|
|
@ -20,6 +20,12 @@
|
|||
#ifndef NVC_MACROS_H
|
||||
#define NVC_MACROS_H
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#if _GLIBCXX_ATOMIC_BUILTINS == 1
|
||||
#undef _GLIBCXX_ATOMIC_BUILTINS
|
||||
#endif // _GLIBCXX_ATOMIC_BUILTINS
|
||||
#endif // __APPLE__
|
||||
|
||||
#include <stdio.h>
|
||||
#include "math_constants.h"
|
||||
#define INT_MUL(x,y) (__mul24(x,y))
|
||||
|
|
|
@ -96,6 +96,17 @@ class NVC_Host {
|
|||
_end=_array+cols;
|
||||
}
|
||||
|
||||
// Allocate page-locked memory with fast write/slow read on host
|
||||
inline bool alloc_w(const size_t cols) {
|
||||
_cols=cols;
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
if (cudaHostAlloc((void **)&_array,_row_bytes,cudaHostAllocWriteCombined)!=
|
||||
cudaSuccess)
|
||||
return false;
|
||||
_end=_array+cols;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Allocate page-locked memory with fast read/write on host
|
||||
inline void safe_alloc_rw(const size_t cols) {
|
||||
_cols=cols;
|
||||
|
@ -104,6 +115,16 @@ class NVC_Host {
|
|||
_end=_array+cols;
|
||||
}
|
||||
|
||||
// Allocate page-locked memory with fast read/write on host
|
||||
inline bool alloc_rw(const size_t cols) {
|
||||
_cols=cols;
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
if (cudaMallocHost((void **)&_array,_row_bytes)!=cudaSuccess)
|
||||
return false;
|
||||
_end=_array+cols;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Free any memory associated with device
|
||||
inline void clear()
|
||||
{ if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }
|
||||
|
@ -163,6 +184,13 @@ class NVC_Host {
|
|||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
/// Asynchronous copy to device (numel is not bytes)
|
||||
inline void copy_to_device(size_t offset, numtyp *device_p, size_t numel,
|
||||
cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array+offset,numel*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
/// Asynchronous copy to 2D matrix on device (numel is not bytes)
|
||||
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
|
||||
const size_t rows, const size_t cols,
|
||||
|
@ -194,6 +222,16 @@ class NVC_Vec {
|
|||
_end=_array+cols;
|
||||
}
|
||||
|
||||
// Row vector on device
|
||||
inline bool alloc(const size_t cols) {
|
||||
_cols=cols;
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
if (cudaMalloc((void **)&_array,_row_bytes)!=cudaSuccess)
|
||||
return false;
|
||||
_end=_array+cols;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Row vector on device (allocate and assign texture and bind)
|
||||
inline void safe_alloc(const size_t cols, textureReference *t)
|
||||
{ safe_alloc(cols); assign_texture(t); bind(); }
|
||||
|
@ -221,11 +259,22 @@ class NVC_Vec {
|
|||
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(),
|
||||
cudaMemcpyHostToDevice)); }
|
||||
|
||||
/// Copy from host (n elements)
|
||||
inline void copy_from_host(const numtyp *host_p, const size_t n)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,n*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice)); }
|
||||
|
||||
/// Asynchronous copy from host
|
||||
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
|
||||
{ CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
|
||||
cudaMemcpyHostToDevice, stream)); }
|
||||
|
||||
/// Asynchronous copy from host (n elements)
|
||||
inline void copy_from_host(const numtyp *host_p, const size_t n,
|
||||
cudaStream_t &stream)
|
||||
{ CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,n*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice, stream)); }
|
||||
|
||||
/// Copy to host
|
||||
inline void copy_to_host(numtyp *host_p)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),
|
||||
|
|
|
@ -28,7 +28,9 @@ int PairGPUAtomT::bytes_per_atom() const {
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::init(const int max_atoms) {
|
||||
bool PairGPUAtomT::init(const int max_atoms) {
|
||||
bool success=true;
|
||||
|
||||
if (allocated)
|
||||
clear();
|
||||
|
||||
|
@ -39,28 +41,48 @@ void PairGPUAtomT::init(const int max_atoms) {
|
|||
time_answer.init();
|
||||
|
||||
// Device matrices for atom and force data
|
||||
dev_x.safe_alloc(atom_fields(),max_atoms,x_get_texture<numtyp>());
|
||||
ans.safe_alloc(ans_fields(),max_atoms);
|
||||
success=success && dev_x.alloc(max_atoms*sizeof(vec4));
|
||||
success=success && dev_q.alloc(max_atoms*sizeof(vec4));
|
||||
success=success && ans.alloc(ans_fields()*max_atoms);
|
||||
// Get a host read/write buffer
|
||||
success=success && host_read.alloc_rw(max_atoms*ans_fields());
|
||||
|
||||
// Get a host write only buffer
|
||||
host_write.safe_alloc_w(max_atoms*atom_fields());
|
||||
// Get a host read/write buffer
|
||||
host_read.safe_alloc_rw(ans.row_size()*ans_fields());
|
||||
success=success && host_write.alloc_w(max_atoms*atom_fields());
|
||||
|
||||
allocated=true;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::resize(const int max_atoms, bool &success) {
|
||||
ans.clear();
|
||||
dev_x.clear();
|
||||
dev_q.clear();
|
||||
host_write.clear();
|
||||
host_read.clear();
|
||||
|
||||
_max_atoms=max_atoms;
|
||||
|
||||
success = success && dev_x.alloc(_max_atoms*sizeof(vec4));
|
||||
success = success && dev_q.alloc(_max_atoms*sizeof(vec4));
|
||||
success = success && ans.alloc(ans_fields()*_max_atoms);
|
||||
success = success && host_read.alloc_rw(_max_atoms*ans_fields());
|
||||
success = success && host_write.alloc_w(_max_atoms*atom_fields());
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::clear() {
|
||||
if (!allocated)
|
||||
return;
|
||||
allocated=false;
|
||||
|
||||
dev_x.unbind();
|
||||
ans.clear();
|
||||
ans.clear();
|
||||
dev_x.clear();
|
||||
dev_q.clear();
|
||||
host_write.clear();
|
||||
host_read.clear();
|
||||
dev_x.clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
@ -82,88 +104,88 @@ void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
|
|||
if (!vflag)
|
||||
csize-=6;
|
||||
|
||||
host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s);
|
||||
host_read.copy_from_device(ans.begin(),_inum*csize,s);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom,
|
||||
const bool vflag_atom, double *eatom,
|
||||
double **vatom, double *virial) {
|
||||
double **vatom, double *virial,
|
||||
double **f, double **tor, const int n) {
|
||||
double evdwl=0.0;
|
||||
int gap=ans.row_size()-_inum;
|
||||
|
||||
acctyp *ap=host_read.begin();
|
||||
if (_eflag) {
|
||||
if (eflag_atom) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
int ii=ilist[i];
|
||||
if (_eflag) {
|
||||
if (eflag_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[ilist[i]]+=*ap*0.5;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap++;
|
||||
}
|
||||
} else
|
||||
for (int i=0; i<_inum; i++) {
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap++;
|
||||
}
|
||||
ap+=gap;
|
||||
evdwl*=0.5;
|
||||
}
|
||||
_read_loc=ap;
|
||||
gap=ans.row_size();
|
||||
if (_vflag) {
|
||||
if (vflag_atom) {
|
||||
for (int ii=0; ii<_inum; ii++) {
|
||||
int i=ilist[ii];
|
||||
ap=_read_loc+ii;
|
||||
}
|
||||
if (_vflag) {
|
||||
if (vflag_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[i][j]+=*ap*0.5;
|
||||
vatom[ii][j]+=*ap*0.5;
|
||||
virial[j]+=*ap;
|
||||
ap+=gap;
|
||||
ap++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int ii=0; ii<_inum; ii++) {
|
||||
ap=_read_loc+ii;
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial[j]+=*ap;
|
||||
ap+=gap;
|
||||
ap++;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]*=0.5;
|
||||
_read_loc+=gap*6;
|
||||
f[ii][0]+=*ap;
|
||||
ap++;
|
||||
f[ii][1]+=*ap;
|
||||
ap++;
|
||||
f[ii][2]+=*ap;
|
||||
ap++;
|
||||
if (i<n) {
|
||||
tor[ii][0]+=*ap;
|
||||
ap++;
|
||||
tor[ii][1]+=*ap;
|
||||
ap++;
|
||||
tor[ii][2]+=*ap;
|
||||
ap++;
|
||||
} else {
|
||||
ap+=3;
|
||||
}
|
||||
}
|
||||
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]*=0.5;
|
||||
evdwl*=0.5;
|
||||
return evdwl;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::add_forces(const int *ilist, double **f) {
|
||||
int gap=ans.row_size();
|
||||
for (int ii=0; ii<_inum; ii++) {
|
||||
acctyp *ap=_read_loc+ii;
|
||||
int i=ilist[ii];
|
||||
f[i][0]+=*ap;
|
||||
ap+=gap;
|
||||
f[i][1]+=*ap;
|
||||
ap+=gap;
|
||||
f[i][2]+=*ap;
|
||||
}
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) {
|
||||
int gap=ans.row_size();
|
||||
_read_loc+=gap*3;
|
||||
for (int ii=0; ii<n; ii++) {
|
||||
acctyp *ap=_read_loc+ii;
|
||||
int i=ilist[ii];
|
||||
tor[i][0]+=*ap;
|
||||
ap+=gap;
|
||||
tor[i][1]+=*ap;
|
||||
ap+=gap;
|
||||
tor[i][2]+=*ap;
|
||||
void PairGPUAtomT::copy_asphere(const int *ilist, double **f, double **tor,
|
||||
const int n) {
|
||||
acctyp *ap=host_read.begin();
|
||||
for (int i=0; i<_inum; i++) {
|
||||
int ii=ilist[i];
|
||||
f[ii][0]+=*ap;
|
||||
ap++;
|
||||
f[ii][1]+=*ap;
|
||||
ap++;
|
||||
f[ii][2]+=*ap;
|
||||
ap++;
|
||||
if (i<n) {
|
||||
tor[ii][0]+=*ap;
|
||||
ap++;
|
||||
tor[ii][1]+=*ap;
|
||||
ap++;
|
||||
tor[ii][2]+=*ap;
|
||||
ap++;
|
||||
} else {
|
||||
ap+=3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -26,18 +26,21 @@
|
|||
#define PRECISION float
|
||||
#define ACC_PRECISION double
|
||||
#define MAX_ATOMS 65536
|
||||
#define vec4 float4
|
||||
#endif
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
#define PRECISION double
|
||||
#define ACC_PRECISION double
|
||||
#define MAX_ATOMS 32768
|
||||
struct vec4 { double x; double y; double z; double w; };
|
||||
#endif
|
||||
|
||||
#ifndef PRECISION
|
||||
#define PRECISION float
|
||||
#define ACC_PRECISION float
|
||||
#define MAX_ATOMS 65536
|
||||
#define vec4 float4
|
||||
#endif
|
||||
|
||||
#include "nvc_timer.h"
|
||||
|
@ -71,7 +74,8 @@ class PairGPUAtom {
|
|||
|
||||
/// Must be called once to allocate host and device memory
|
||||
/** \note atom_fields and ans_fields should be set first if not default **/
|
||||
void init(const int max_atoms);
|
||||
bool init(const int max_atoms);
|
||||
void resize(const int max_atoms, bool &success);
|
||||
|
||||
/// Free all memory on host and device
|
||||
void clear();
|
||||
|
@ -99,11 +103,35 @@ class PairGPUAtom {
|
|||
for (int i=0; i<t; i+=stride) { *_write_loc=hostptr[i]; _write_loc++; }
|
||||
}
|
||||
|
||||
/// Copy num_rows x nall() write buffer to x in GPU
|
||||
/** num_rows<=atom_fields() **/
|
||||
inline void copy_atom_data(const int num_rows, cudaStream_t &stream)
|
||||
{ dev_x.copy_2Dfrom_host(host_write.begin(),num_rows,nall(),stream); }
|
||||
/// Add positions to write buffer
|
||||
/** Copies nall() elements **/
|
||||
inline void add_x_data(double **host_ptr, const int *host_type) {
|
||||
for (int i=0; i<_nall; i++) {
|
||||
*_write_loc=host_ptr[i][0];
|
||||
_write_loc++;
|
||||
*_write_loc=host_ptr[i][1];
|
||||
_write_loc++;
|
||||
*_write_loc=host_ptr[i][2];
|
||||
_write_loc++;
|
||||
*_write_loc=host_type[i];
|
||||
_write_loc++;
|
||||
}
|
||||
}
|
||||
|
||||
/// Add quaternions to write buffer
|
||||
/** Copies nall() elements **/
|
||||
template<class cpytyp>
|
||||
inline void add_q_data(const cpytyp *host_ptr) {
|
||||
const int end=_nall*4;
|
||||
for (int i=0; i<end; i++) { *_write_loc=host_ptr[i]; _write_loc++; }
|
||||
}
|
||||
|
||||
/// Copy num_rows positions+type to x in GPU
|
||||
/** num_rows<=atom_fields() **/
|
||||
inline void copy_x_data(cudaStream_t &stream)
|
||||
{ dev_x.copy_from_host(host_write.begin(),_nall*4,stream); }
|
||||
inline void copy_q_data(cudaStream_t &stream)
|
||||
{ dev_q.copy_from_host(host_write.begin()+_nall*4,_nall*4,stream); }
|
||||
|
||||
// -------------------------COPY FROM GPU -------------------------------
|
||||
|
||||
|
@ -113,22 +141,19 @@ class PairGPUAtom {
|
|||
/// Copy energy and virial data into LAMMPS memory
|
||||
double energy_virial(const int *ilist, const bool eflag_atom,
|
||||
const bool vflag_atom, double *eatom, double **vatom,
|
||||
double *virial);
|
||||
|
||||
/// Add forces from the GPU into a LAMMPS pointer
|
||||
void add_forces(const int *ilist, double **f);
|
||||
|
||||
/// Add torques from the GPU into a LAMMPS pointer
|
||||
void add_torques(const int *ilist, double **tor, const int n);
|
||||
|
||||
double *virial, double **f, double **tor, const int);
|
||||
|
||||
/// Add forces and torques from the GPU into a LAMMPS pointer
|
||||
void copy_asphere(const int *ilist, double **f, double **tor, const int n);
|
||||
// ------------------------------ DATA ----------------------------------
|
||||
|
||||
// atom_fields() x n (example: rows 1-3 position, 4 is type)
|
||||
NVC_ConstMatT dev_x;
|
||||
// ans_fields() x n Storage for Forces, etc.
|
||||
// example: if (eflag and vflag) 1 is energy, 2-7 is virial, 8-10 is force
|
||||
// example: if (!eflag) 1-3 is force
|
||||
NVC_Mat<acctyp> ans;
|
||||
// atom coordinates
|
||||
NVC_Vec<numtyp> dev_x;
|
||||
// quaterions
|
||||
NVC_Vec<numtyp> dev_q;
|
||||
// ans_fields()
|
||||
// example: if (eflag and vflag) 1 is energy, 2-7 is virial
|
||||
NVC_Vec<acctyp> ans;
|
||||
|
||||
// Buffer for moving floating point data to GPU
|
||||
NVC_HostT host_write;
|
||||
|
|
|
@ -17,6 +17,12 @@
|
|||
Paul Crozier (SNL), pscrozi@sandia.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#if _GLIBCXX_ATOMIC_BUILTINS == 1
|
||||
#undef _GLIBCXX_ATOMIC_BUILTINS
|
||||
#endif // _GLIBCXX_ATOMIC_BUILTINS
|
||||
#endif // __APPLE__
|
||||
|
||||
#include <assert.h>
|
||||
#include "lj_gpu_memory.h"
|
||||
#include "pair_gpu_cell.h"
|
||||
|
|
|
@ -26,8 +26,9 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
|
|||
return (max_nbors+3)*sizeof(int);
|
||||
}
|
||||
|
||||
void PairGPUNbor::init(const int ij_size, const int max_atoms,
|
||||
bool PairGPUNbor::init(const int ij_size, const int max_atoms,
|
||||
const int max_nbors) {
|
||||
bool success=true;
|
||||
if (allocated)
|
||||
clear();
|
||||
|
||||
|
@ -35,16 +36,29 @@ void PairGPUNbor::init(const int ij_size, const int max_atoms,
|
|||
time_nbor.init();
|
||||
|
||||
if (_use_packing)
|
||||
dev_nbor.safe_alloc(max_nbors+4,max_atoms);
|
||||
success=success && dev_nbor.alloc((max_nbors+4)*max_atoms);
|
||||
else
|
||||
dev_nbor.safe_alloc(3,max_atoms);
|
||||
success=success && dev_nbor.alloc(3*max_atoms);
|
||||
|
||||
ij.safe_alloc(max_nbors*max_atoms);
|
||||
host_ij.safe_alloc_w(ij_size);
|
||||
success=success && ij.alloc(max_nbors*max_atoms);
|
||||
success=success && host_ij.alloc_w(ij_size);
|
||||
|
||||
allocated=true;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
void PairGPUNbor::resize(const int nlocal, const int max_nbor, bool &success) {
|
||||
dev_nbor.clear();
|
||||
ij.clear();
|
||||
if (_use_packing)
|
||||
success=success && dev_nbor.alloc((max_nbor+4)*nlocal);
|
||||
else
|
||||
success=success && dev_nbor.alloc(3*nlocal);
|
||||
success=success && ij.alloc(max_nbor*nlocal);
|
||||
allocated=true;
|
||||
}
|
||||
|
||||
void PairGPUNbor::clear() {
|
||||
if (!allocated)
|
||||
return;
|
||||
|
@ -54,7 +68,7 @@ void PairGPUNbor::clear() {
|
|||
host_ij.clear();
|
||||
dev_nbor.clear();
|
||||
}
|
||||
|
||||
|
||||
double PairGPUNbor::host_memory_usage() const {
|
||||
return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor);
|
||||
}
|
||||
|
@ -65,12 +79,35 @@ void PairGPUNbor::reset(const int inum, int *ilist, const int *numj,
|
|||
|
||||
dev_nbor.copy_from_host(ilist,inum);
|
||||
int acc=0;
|
||||
for (int i=0; i<inum; i++) {
|
||||
host_ij[i]=numj[ilist[i]];
|
||||
host_ij[i+inum]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
}
|
||||
|
||||
host_ij.copy_to_2Ddevice(dev_nbor.begin()+dev_nbor.row_size(),
|
||||
dev_nbor.row_size(),2,inum, s);
|
||||
|
||||
int ij_size=host_ij.numel();
|
||||
if (inum*2<ij_size) {
|
||||
for (int i=0; i<inum; i++) {
|
||||
host_ij[i]=numj[ilist[i]];
|
||||
host_ij[i+inum]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
}
|
||||
host_ij.copy_to_device(dev_nbor.begin()+inum,2*inum, s);
|
||||
} else {
|
||||
int offset=0;
|
||||
int half=ij_size/2;
|
||||
int hi=0;
|
||||
for (int i=0; i<inum; i++) {
|
||||
host_ij[hi]=numj[ilist[i]];
|
||||
host_ij[hi+half]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
hi++;
|
||||
if (hi==half) {
|
||||
host_ij.copy_to_device(dev_nbor.begin()+inum+offset,half,s);
|
||||
host_ij.copy_to_device(half,dev_nbor.begin()+2*inum+offset,half,s);
|
||||
offset+=half;
|
||||
hi=0;
|
||||
CUDA_SAFE_CALL(cudaStreamSynchronize(s));
|
||||
}
|
||||
}
|
||||
if (hi>0) {
|
||||
host_ij.copy_to_device(dev_nbor.begin()+inum+offset,hi,s);
|
||||
host_ij.copy_to_device(half,dev_nbor.begin()+2*inum+offset,hi,s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,8 +40,10 @@ class PairGPUNbor {
|
|||
void packing(const bool use_packing) { _use_packing=use_packing; }
|
||||
|
||||
/// Called once to allocate memory
|
||||
void init(const int ij_size, const int max_atoms, const int max_nbors);
|
||||
bool init(const int ij_size, const int max_atoms, const int max_nbors);
|
||||
|
||||
void resize(const int nlocal, const int max_nbor, bool &success);
|
||||
|
||||
/// Free all memory on host and device
|
||||
void clear();
|
||||
|
||||
|
@ -73,7 +75,7 @@ class PairGPUNbor {
|
|||
// - 1st row is i
|
||||
// - 2nd row is numj (number of neighbors)
|
||||
// - 3rd row is starting address in host_ij of neighbors
|
||||
NVC_MatI dev_nbor;
|
||||
NVC_VecI dev_nbor;
|
||||
|
||||
// --------------- Timing Stuff
|
||||
NVCTimer time_nbor;
|
||||
|
|
|
@ -31,32 +31,6 @@
|
|||
#define GB_GPU_DOUBLE
|
||||
#endif
|
||||
|
||||
// ------------------------------- x ------------------------------------
|
||||
|
||||
static texture<float, 2, cudaReadModeElementType> x_float_tex;
|
||||
static texture<int2, 2, cudaReadModeElementType> x_double_tex;
|
||||
template <class numtyp> inline textureReference * x_get_texture() {
|
||||
const textureReference *ptr;
|
||||
cudaGetTextureReference(&ptr,"x_float_tex");
|
||||
return const_cast<textureReference *>(ptr);
|
||||
}
|
||||
template <> inline textureReference * x_get_texture<double>() {
|
||||
const textureReference *ptr;
|
||||
cudaGetTextureReference(&ptr,"x_double_tex");
|
||||
return const_cast<textureReference *>(ptr);
|
||||
}
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ numtyp _x_(const int i, const int j) {
|
||||
return tex2D(x_float_tex,i,j);
|
||||
}
|
||||
#ifdef GB_GPU_DOUBLE
|
||||
template <>
|
||||
static __inline__ __device__ double _x_<double>(const int i,const int j) {
|
||||
int2 t=tex2D(x_double_tex,i,j);
|
||||
return __hiloint2double(t.y, t.x);
|
||||
}
|
||||
#endif
|
||||
|
||||
// ------------------------------- form ------------------------------------
|
||||
|
||||
static texture<int, 2, cudaReadModeElementType> form_tex;
|
||||
|
|
|
@ -17,6 +17,12 @@
|
|||
Paul Crozier (SNL), pscrozi@sandia.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#if _GLIBCXX_ATOMIC_BUILTINS == 1
|
||||
#undef _GLIBCXX_ATOMIC_BUILTINS
|
||||
#endif // _GLIBCXX_ATOMIC_BUILTINS
|
||||
#endif // __APPLE__
|
||||
|
||||
#include "pair_gpu_atom.cu"
|
||||
#include "lj_gpu.cu"
|
||||
#include "gb_gpu.cu"
|
||||
|
|
Loading…
Reference in New Issue