forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12655 f3b2605a-c512-4ea7-a41b-209d697bcdaa
parent c21dee6f53
commit 7510ec7960
@@ -1 +1 @@
-Geryon Version 13.209
+Geryon Version 13.234
@@ -159,6 +159,12 @@ class UCL_Device {
   /// Returns true if double precision is support for the device
   inline bool double_precision(const int i) {return arch(i)>=1.3;}
 
+  /// Get the number of compute units on the current device
+  inline unsigned cus() { return cus(_device); }
+  /// Get the number of compute units
+  inline unsigned cus(const int i)
+    { return _properties[i].multiProcessorCount; }
+
   /// Get the number of cores in the current device
   inline unsigned cores() { return cores(_device); }
   /// Get the number of cores
@@ -185,13 +185,12 @@ class UCL_Device {
   inline bool double_precision(const int i)
     {return _properties[i].double_precision;}
 
-  /// Get the number of cores in the current device
-  inline unsigned cores() { return cores(_device); }
-  /// Get the number of cores
-  inline unsigned cores(const int i)
-    { if (device_type(i)==UCL_CPU) return _properties[i].compute_units;
-      else return _properties[i].compute_units*8; }
+  /// Get the number of compute units on the current device
+  inline unsigned cus() { return cus(_device); }
+  /// Get the number of compute units
+  inline unsigned cus(const int i)
+    { return _properties[i].compute_units; }
 
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }
   /// Get the gigabytes of global memory
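Note: with this change both device wrappers report compute units directly (multiProcessorCount under CUDA, compute_units under OpenCL) instead of the earlier per-core estimate. A minimal host-side sketch of the accessor, assuming a UCL_Device named dev; num_devices() and set() are taken from the surrounding Geryon API and the setup shown is illustrative, not part of the patch:

    // Sketch only: query the current accelerator through Geryon.
    UCL_Device dev;                  // enumerates available devices
    if (dev.num_devices() > 0) {
      dev.set(0);                    // make device 0 current
      unsigned units = dev.cus();    // compute units on the current device
      double gb = dev.gigabytes();   // global memory, in GB
    }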
@@ -127,7 +127,8 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
   orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
 
   mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
-                              mat.host_ptr(), &error_flag);
+                              *mat.host_ptr(), &error_flag);
+
   CL_CHECK_ERR(error_flag);
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
@@ -174,7 +175,7 @@ template <class mat_type>
 inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
   cl_int error_flag;
   mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
-                              n,mat.host_ptr(),&error_flag);
+                              n,*mat.host_ptr(),&error_flag);
   CL_CHECK_ERR(error_flag);
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
@@ -221,7 +221,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
     return energy_virial(eatom,vatom,virial);
 
   double evdwl=0.0;
-  int vstart=0, iend=_inum;
+  int ii, vstart=0, iend=_inum;
   if (_eflag) {
     iend=_inum*2;
     for (int i=0; i<_inum; i++)
@@ -235,10 +235,10 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
       for (int i=_inum; i<iend; i++)
         eatom[i]+=engv[i];
     } else {
-      for (int i=0; i<_inum; i++)
-        eatom[_ilist[i]]+=engv[i];
-      for (int i=_inum; i<iend; i++)
-        eatom[_ilist[i]]+=engv[i];
+      for (int i=0, ii=0; i<_inum; i++)
+        eatom[_ilist[ii++]]+=engv[i];
+      for (int i=_inum, ii=0; i<iend; i++)
+        eatom[_ilist[ii++]]+=engv[i];
     }
     vstart=iend;
     iend+=_inum;
@@ -249,12 +249,10 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
         virial[j]+=engv[i];
       if (_vf_atom)
         if (_ilist==NULL) {
-          int ii=0;
           for (int i=vstart; i<iend; i++)
-            vatom[ii++][j]+=engv[i];
+            vatom[i++][j]+=engv[i];
         } else {
-          int ii=0;
-          for (int i=vstart; i<iend; i++)
+          for (int i=vstart, ii=0; i<iend; i++)
             vatom[_ilist[ii++]][j]+=engv[i];
         }
       vstart+=_inum;
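Note: the rewritten loops above keep a separate running index into _ilist, so the second block of per-atom energies restarts its atom mapping at zero instead of reusing the energy index. The access pattern, condensed from the hunks:

    // engv holds _inum vdW energies followed by _inum Coulomb energies;
    // ii restarts for the second block so both map through _ilist.
    for (int i=0, ii=0; i<_inum; i++)
      eatom[_ilist[ii++]] += engv[i];
    for (int i=_inum, ii=0; i<iend; i++)
      eatom[_ilist[ii++]] += engv[i];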
@@ -23,22 +23,21 @@
 ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
 
 #define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
-                  i, numj, stride, list_end, nbor) \
-  nbor=nbor_mem+ii; \
-  i=*nbor; \
-  nbor+=nbor_stride; \
-  numj=*nbor; \
+                  i, numj, stride, nbor_end, nbor_begin) \
+  i=nbor_mem[ii]; \
+  nbor_begin=ii+nbor_stride; \
+  numj=nbor_mem[nbor_begin]; \
   if (nbor_mem==packed_mem) { \
-    nbor+=nbor_stride+fast_mul(ii,t_per_atom-1); \
+    nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
     stride=fast_mul(t_per_atom,nbor_stride); \
-    list_end=nbor+fast_mul(numj/t_per_atom,stride)+ (numj & (t_per_atom-1)); \
-    nbor+=offset; \
+    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
+    nbor_begin+=offset; \
  } else { \
-    nbor+=nbor_stride; \
-    nbor=packed_mem+*nbor; \
-    list_end=nbor+numj; \
+    nbor_begin+=nbor_stride; \
+    nbor_begin=nbor_mem[nbor_begin]; \
+    nbor_end=nbor_begin+numj; \
     stride=t_per_atom; \
-    nbor+=offset; \
+    nbor_begin+=offset; \
  }
 
 #if (ARCH < 300)
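Note: this is the central change of the commit — nbor_info now yields integer offsets into the neighbor arrays instead of __global pointers, so every kernel below indexes dev_packed rather than dereferencing a pointer. The new call pattern, as it appears in the kernels:

    int nbor, nbor_end;            // offsets into the neighbor list
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);
    for ( ; nbor<nbor_end; nbor+=n_stride) {
      int j=dev_packed[nbor];      // load by index instead of *nbor
      /* unpack special-bond bits from j and compute the interaction */
    }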
@@ -75,15 +74,15 @@
     } \
   } \
   if (offset==0) { \
-    engv+=ii; \
+    int ei=ii; \
     if (eflag>0) { \
-      *engv=energy*(acctyp)0.5; \
-      engv+=inum; \
+      engv[ei]=energy*(acctyp)0.5; \
+      ei+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=virial[i]*(acctyp)0.5; \
+        ei+=inum; \
       } \
     } \
     ans[ii]=f; \
@@ -123,17 +122,17 @@
     } \
   } \
   if (offset==0) { \
-    engv+=ii; \
+    int ei=ii; \
     if (eflag>0) { \
-      *engv=energy*(acctyp)0.5; \
-      engv+=inum; \
-      *engv=e_coul*(acctyp)0.5; \
-      engv+=inum; \
+      engv[ei]=energy*(acctyp)0.5; \
+      ei+=inum; \
+      engv[ei]=e_coul*(acctyp)0.5; \
+      ei+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=virial[i]*(acctyp)0.5; \
+        ei+=inum; \
       } \
     } \
     ans[ii]=f; \
@@ -158,15 +157,15 @@
     } \
   } \
   if (offset==0) { \
-    engv+=ii; \
+    int ei=ii; \
     if (eflag>0) { \
-      *engv=energy*(acctyp)0.5; \
-      engv+=inum; \
+      engv[ei]=energy*(acctyp)0.5; \
+      ei+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=virial[i]*(acctyp)0.5; \
+        ei+=inum; \
       } \
     } \
     ans[ii]=f; \
@@ -190,17 +189,17 @@
     } \
   } \
   if (offset==0) { \
-    engv+=ii; \
+    int ei=ii; \
     if (eflag>0) { \
-      *engv=energy*(acctyp)0.5; \
-      engv+=inum; \
-      *engv=e_coul*(acctyp)0.5; \
-      engv+=inum; \
+      engv[ei]=energy*(acctyp)0.5; \
+      ei+=inum; \
+      engv[ei]=e_coul*(acctyp)0.5; \
+      ei+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=virial[i]*(acctyp)0.5; \
+        ei+=inum; \
       } \
     } \
     ans[ii]=f; \
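Note: all four store_answers variants above make the same substitution — a running offset ei into engv replaces pointer arithmetic on engv itself. Stripped of the macro continuations, the store layout is:

    // Consecutive per-atom quantities are spaced inum apart in engv.
    int ei=ii;
    if (eflag>0) {
      engv[ei]=energy*(acctyp)0.5; ei+=inum;   // then e_coul, if present
    }
    if (vflag>0)
      for (int i=0; i<6; i++) {
        engv[ei]=virial[i]*(acctyp)0.5; ei+=inum;
      }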
@@ -202,6 +202,7 @@ void BaseEllipsoidT::output_times() {
   MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
              device->replica());
   double max_mb=mpi_max_bytes/(1024*1024);
+  double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
 
   if (device->replica_me()==0)
     if (screen && times[5]>0.0) {
@@ -209,11 +210,11 @@ void BaseEllipsoidT::output_times() {
 
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
-      fprintf(screen," GPU Time Info (average): ");
+      fprintf(screen," Device Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
-      if (device->procs_per_gpu()==1) {
+      if (device->procs_per_gpu()==1 && t_time>0) {
         fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
         fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
         fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
@@ -226,7 +227,8 @@ void BaseEllipsoidT::output_times() {
       }
       if (nbor->gpu_nbor()==2)
         fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size);
-      fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
+      if (times[6]>0)
+        fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
       fprintf(screen,"Average split: %.4f.\n",avg_split);
       fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
       fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
@@ -52,19 +52,19 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -154,20 +154,20 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -48,7 +48,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -53,19 +53,19 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -148,20 +148,20 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -66,18 +66,18 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -188,19 +188,19 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -52,7 +52,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -71,9 +71,9 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -69,11 +69,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -85,8 +85,8 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -105,7 +105,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
       int mtype=itype*lj_types+jtype;
       if (rsq<cutsq_sigma[mtype].x) { // cutsq
         numtyp r2inv = ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor;
         numtyp v_sh = (numtyp)0.0;
         numtyp rexp = (numtyp)0.0;
 
@@ -124,10 +124,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
           fetch(prefactor,j,q_tex);
           prefactor *= qqrd2e * qtmp/r;
 
-          erfcc = erfc(arij);
+          const numtyp erfcc = erfc(arij);
           v_sh = (erfcc - e_shift*r)*prefactor;
           numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
-          forcecoul = prefactor * dvdrr*rsq*factor_coul;
+          forcecoul = prefactor * dvdrr*rsq;
+          if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
         } else forcecoul = (numtyp)0.0;
 
         force = (forceborn + forcecoul) * r2inv;
@@ -137,8 +138,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          if (rsq < cut_coulsq)
-            e_coul += v_sh*factor_coul;
+          if (rsq < cut_coulsq) {
+            numtyp e=v_sh;
+            if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
           if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
               + coeff2[mtype].z*r2inv*r6inv;
@@ -201,11 +205,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -218,8 +222,8 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -237,7 +241,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
 
       if (rsq<cutsq_sigma[mtype].x) {
         numtyp r2inv=ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor;
         numtyp v_sh = (numtyp)0.0;
         numtyp rexp = (numtyp)0.0;
 
@@ -256,10 +260,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
           fetch(prefactor,j,q_tex);
          prefactor *= qqrd2e * qtmp/r;
 
-          erfcc = erfc(arij);
+          const numtyp erfcc = erfc(arij);
           v_sh = (erfcc - e_shift*r)*prefactor;
           numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
-          forcecoul = prefactor * dvdrr*rsq*factor_coul;
+          forcecoul = prefactor * dvdrr*rsq;
+          if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
         } else forcecoul = (numtyp)0.0;
 
         force = (forceborn + forcecoul) * r2inv;
@@ -269,8 +274,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          if (rsq < cut_coulsq)
-            e_coul += v_sh*factor_coul;
+          if (rsq < cut_coulsq) {
+            numtyp e=v_sh;
+            if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
           if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
               + coeff2[mtype].z*r2inv*r6inv;
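Note: besides the index-based neighbor loop, both Wolf kernels change how factor_coul is applied — instead of scaling the whole damped term, the excluded fraction of the bare prefactor is subtracted, for force and energy alike. Condensed from the hunks above:

    forcecoul = prefactor * dvdrr*rsq;
    if (factor_coul < (numtyp)1.0)
      forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
    // and, when eflag>0 and rsq < cut_coulsq:
    numtyp e=v_sh;
    if (factor_coul < (numtyp)1.0)
      e -= ((numtyp)1.0-factor_coul)*prefactor;
    e_coul += e;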
@@ -52,7 +52,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -72,9 +72,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -50,7 +50,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -68,9 +68,9 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -52,19 +52,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -145,20 +145,20 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -65,18 +65,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -182,19 +182,19 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -51,7 +51,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -66,18 +66,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -190,19 +190,19 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -52,7 +52,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -49,7 +49,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
      fflush(screen);
    }
@@ -52,19 +52,19 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -150,20 +150,20 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -49,7 +49,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -65,18 +65,18 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -191,19 +191,19 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -51,7 +51,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -69,9 +69,9 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -66,13 +66,14 @@ int CHARMMLongT::init(const int ntypes,
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (this->_block_bio_size>=64 && mix_arithmetic)
+  int max_bio_shared_types=this->device->max_bio_shared_types();
+  if (this->_block_bio_size>=64 && mix_arithmetic &&
+      lj_types<=max_bio_shared_types)
     shared_types=true;
   _lj_types=lj_types;
 
   // Allocate a host write buffer for data initialization
   int h_size=lj_types*lj_types;
-  int max_bio_shared_types=this->device->max_bio_shared_types();
   if (h_size<max_bio_shared_types)
     h_size=max_bio_shared_types;
   UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
@@ -84,8 +85,10 @@ int CHARMMLongT::init(const int ntypes,
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                          host_lj3,host_lj4);
 
-  ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
-  this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
+  if (shared_types) {
+    ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
+    this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
+  }
 
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
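Note: the init changes above make the shared-memory ("fast") path conditional on the type count actually fitting, and the per-type epsilon/sigma table is only built when that path is taken. Condensed from the two hunks:

    int max_bio_shared_types=this->device->max_bio_shared_types();
    if (this->_block_bio_size>=64 && mix_arithmetic &&
        lj_types<=max_bio_shared_types)
      shared_types=true;
    // ... later, allocate ljd only when the fast kernel will run:
    if (shared_types) {
      ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
      this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
    }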
@@ -32,7 +32,7 @@ texture<int2> q_tex;
 __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict lj1,
                             const int lj_types,
-                            const __global numtyp *restrict sp_lj_in,
+                            const __global numtyp *restrict sp_lj,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
                             __global acctyp4 *restrict ans,
@@ -47,16 +47,6 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
-  __local numtyp sp_lj[8];
-  sp_lj[0]=sp_lj_in[0];
-  sp_lj[1]=sp_lj_in[1];
-  sp_lj[2]=sp_lj_in[2];
-  sp_lj[3]=sp_lj_in[3];
-  sp_lj[4]=sp_lj_in[4];
-  sp_lj[5]=sp_lj_in[5];
-  sp_lj[6]=sp_lj_in[6];
-  sp_lj[7]=sp_lj_in[7];
-
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -66,18 +56,18 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -196,18 +186,18 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -53,7 +53,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -72,9 +72,9 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -55,19 +55,19 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -219,20 +219,20 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -51,7 +51,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -60,20 +60,19 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-      numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      numtyp factor_coul;
+      int j=dev_packed[nbor];
       factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -152,21 +151,21 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-
-      numtyp factor_coul = sp_cl[sbmask(j)];
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      numtyp factor_coul;
+      int j=dev_packed[nbor];
+      factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -30,19 +30,19 @@ texture<int2> q_tex;
 #endif
 
 __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
-                          const __global numtyp *restrict scale,
-                          const int lj_types,
-                          const __global numtyp *restrict sp_cl_in,
-                          const __global int *dev_nbor,
-                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
-                          __global acctyp *restrict engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nbor_pitch,
-                          const __global numtyp *restrict q_ ,
-                          const __global numtyp *restrict cutsq,
-                          const numtyp qqrd2e, const numtyp kappa,
-                          const int t_per_atom) {
+                           const __global numtyp *restrict scale,
+                           const int lj_types,
+                           const __global numtyp *restrict sp_cl_in,
+                           const __global int *dev_nbor,
+                           const __global int *dev_packed,
+                           __global acctyp4 *restrict ans,
+                           __global acctyp *restrict engv,
+                           const int eflag, const int vflag, const int inum,
+                           const int nbor_pitch,
+                           const __global numtyp *restrict q_ ,
+                           const __global numtyp *restrict cutsq,
+                           const numtyp qqrd2e, const numtyp kappa,
+                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -61,20 +61,20 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-
-      numtyp factor_coul = sp_cl[sbmask(j)];
+    numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -158,21 +158,21 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-
-      numtyp factor_coul = sp_cl[sbmask(j)];
+    numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -62,11 +62,11 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -77,8 +77,8 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul, r, prefactor, erfcc;
       factor_coul = sp_lj[sbmask(j)];
@@ -163,11 +163,11 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -178,8 +178,8 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul, r, prefactor, erfcc;
       factor_coul = sp_lj[sbmask(j)];
@@ -49,7 +49,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -124,7 +124,8 @@ texture<int2> q_tex;
 #endif
 
 __kernel void k_coul_long(const __global numtyp4 *restrict x_,
-                          const __global numtyp *restrict scale,
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3,
                           const int lj_types,
                           const __global numtyp *restrict sp_cl_in,
                           const __global int *dev_nbor,
@@ -153,44 +154,41 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    int itype=ix.w;
     numtyp qtmp; fetch(qtmp,i,q_tex);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul;
       factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int jtype=jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      int mtype=itype*lj_types+jtype;
-
       if (rsq < cut_coulsq) {
         numtyp r2inv=ucl_recip(rsq);
         numtyp force, prefactor, _erfc;
 
         numtyp r = ucl_rsqrt(r2inv);
         numtyp grij = g_ewald * r;
         numtyp expm2 = ucl_exp(-grij*grij);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
         _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
         fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
+        prefactor *= qqrd2e * qtmp/r;
         force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
@@ -198,7 +196,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-           e_coul += prefactor*(_erfc-factor_coul);
+          e_coul += prefactor*(_erfc-factor_coul);
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
@@ -217,7 +215,8 @@
 }
 
 __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
-                               const __global numtyp *restrict scale_in,
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in,
                                const __global numtyp *restrict sp_cl_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
@@ -231,14 +230,10 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
-  __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_cl[4];
   if (tid<4)
     sp_cl[tid]=sp_cl_in[tid];
-  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    scale[tid]=scale_in[tid];
-  }
 
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -249,27 +244,24 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
-    int iw = ix.w;
-    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul;
       factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int mtype=itype+jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
@@ -280,13 +272,13 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
         numtyp r2inv=ucl_recip(rsq);
         numtyp force, prefactor, _erfc;
 
-        numtyp r = ucl_sqrt(rsq);
+        numtyp r = ucl_rsqrt(r2inv);
         numtyp grij = g_ewald * r;
         numtyp expm2 = ucl_exp(-grij*grij);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
         _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
         fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
+        prefactor *= qqrd2e * qtmp/r;
         force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
@@ -294,7 +286,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-           e_coul += prefactor*(_erfc-factor_coul);
+          e_coul += prefactor*(_erfc-factor_coul);
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
@@ -48,7 +48,7 @@ int cl_gpu_init(const int ntypes, double **host_scale,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int cl_gpu_init(const int ntypes, double **host_scale,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -48,7 +48,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                          const int last_gpu, const int gpu_mode,
                          const double p_split, const int nthreads,
                          const int t_per_atom, const double cell_size,
-                         char *ocl_vendor) {
+                         char *ocl_vendor, const int block_pair) {
   _nthreads=nthreads;
 #ifdef _OPENMP
   omp_set_num_threads(nthreads);
@@ -66,6 +66,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
   _gpu_mode=gpu_mode;
   _particle_split=p_split;
   _cell_size=cell_size;
+  _block_pair=block_pair;
 
   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
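Note: init_device() gains a trailing block_pair argument that is stored in _block_pair before the kernels are compiled. A hypothetical call through the C wrapper — argument values are illustrative; -1 keeps the block size compiled into the device library (see compile_kernels() below):

    lmp_init_device(world, replica, first_gpu, last_gpu, gpu_mode,
                    particle_split, nthreads, t_per_atom, cell_size,
                    opencl_vendor, /*block_pair=*/-1);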
@@ -175,6 +176,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
   } else if (s_vendor=="cypress") {
     _ocl_vendor_name="AMD Cypress";
     _ocl_vendor_string="-DCYPRESS_OCL";
+  } else if (s_vendor=="phi") {
+    _ocl_vendor_name="Intel Phi";
+    _ocl_vendor_string="-DPHI_OCL";
+  } else if (s_vendor=="intel") {
+    _ocl_vendor_name="Intel CPU";
+    _ocl_vendor_string="-DINTEL_OCL";
   } else if (s_vendor=="generic") {
     _ocl_vendor_name="GENERIC";
     _ocl_vendor_string="-DGENERIC_OCL";
@@ -352,10 +359,10 @@ void DeviceT::init_message(FILE *screen, const char *name,
   for (int i=first_gpu; i<last; i++) {
     std::string sname;
     if (i==first_gpu)
-      sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
+      sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+
            toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ (";
     else
-      sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
+      sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+
            toa(gpu->clock_rate(i))+" GHZ (";
     if (sizeof(PRECISION)==4) {
       if (sizeof(ACC_PRECISION)==4)
@ -520,7 +527,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
|
|||
if (screen && times[5]>0.0) {
|
||||
fprintf(screen,"\n\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen,"\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
|
||||
|
@ -582,7 +589,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
|
|||
if (screen && times[6]>0.0) {
|
||||
fprintf(screen,"\n\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen,"\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
|
||||
|
@ -672,7 +679,7 @@ int DeviceT::compile_kernels() {
|
|||
_threads_per_charge=gpu_lib_data[13];
|
||||
_pppm_max_spline=gpu_lib_data[4];
|
||||
_pppm_block=gpu_lib_data[5];
|
||||
_block_pair=gpu_lib_data[6];
|
||||
if (_block_pair == -1) _block_pair=gpu_lib_data[6];
|
||||
_max_shared_types=gpu_lib_data[7];
|
||||
_block_cell_2d=gpu_lib_data[8];
|
||||
_block_cell_id=gpu_lib_data[9];
|
||||
|
@ -714,10 +721,10 @@ int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
|
|||
const int last_gpu, const int gpu_mode,
|
||||
const double particle_split, const int nthreads,
|
||||
const int t_per_atom, const double cell_size,
|
||||
char *opencl_vendor) {
|
||||
char *opencl_vendor, const int block_pair) {
|
||||
return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
|
||||
particle_split,nthreads,t_per_atom,
|
||||
cell_size,opencl_vendor);
|
||||
cell_size,opencl_vendor,block_pair);
|
||||
}
|
||||
|
||||
void lmp_clear_device() {
|
||||
|
|
|
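The block_pair plumbing above follows a simple sentinel convention: the value handed to lmp_init_device is stored unconditionally in init_device, and compile_kernels only overwrites it with the GPU library default when the caller passed -1. A minimal sketch of that resolution rule (names taken from the diff; the helper function itself is hypothetical):

// Hypothetical helper illustrating the -1 sentinel fallback above: an
// explicit block size from the caller wins, otherwise the value baked
// into the GPU library (gpu_lib_data[6] in the diff) is used.
static int resolve_block_pair(const int requested, const int lib_default) {
  if (requested == -1)
    return lib_default;   // caller left the option unset
  return requested;       // honor the explicit override
}
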
@@ -51,7 +51,7 @@ class Device {
                   const int last_gpu, const int gpu_mode,
                   const double particle_split, const int nthreads,
                   const int t_per_atom, const double cell_size,
-                   char *vendor_string);
+                   char *vendor_string, const int block_pair);

  /// Initialize the device for Atom and Neighbor storage
  /** \param rot True if quaternions need to be stored

@@ -96,12 +96,12 @@ class Device {
                    const int first_gpu, const int last_gpu);

  /// Perform charge assignment asynchronously for PPPM
-  void set_single_precompute(PPPM<numtyp,acctyp,
-                             float,_lgpu_float4> *pppm);
+  void set_single_precompute(PPPM<numtyp,acctyp,
+                             float,_lgpu_float4> *pppm);

  /// Perform charge assignment asynchronously for PPPM
-  void set_double_precompute(PPPM<numtyp,acctyp,
-                             double,_lgpu_double4> *pppm);
+  void set_double_precompute(PPPM<numtyp,acctyp,
+                             double,_lgpu_double4> *pppm);

  /// Esimate the overhead from GPU calls from multiple procs
  /** \param kernel_calls Number of kernel calls/timestep for timing estimated

@@ -73,17 +73,17 @@ texture<int4,1> mu_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv=energy*(acctyp)0.5; \
-        engv+=inum; \
-        *engv=e_coul*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=energy*(acctyp)0.5; \
+        ei+=inum; \
+        engv[ei]=e_coul*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -113,17 +113,17 @@ texture<int4,1> mu_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv=energy*(acctyp)0.5; \
-        engv+=inum; \
-        *engv=e_coul*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=energy*(acctyp)0.5; \
+        ei+=inum; \
+        engv[ei]=e_coul*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -173,19 +173,19 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -385,11 +385,11 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -397,8 +397,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

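The store macros above change addressing, not layout: engv is still a flat array of inum-long blocks, one block per accumulated quantity, but the new code keeps the base pointer fixed and advances an integer offset ei instead of bumping engv itself. A small host-side sketch of that layout, assuming the dipole ordering the macro writes (energy, e_coul, then six virial terms):

/* Sketch: read back one atom's accumulator from the strided engv
   layout; q = 0 energy, 1 e_coul, 2..7 virial components. */
static double engv_slot(const double *engv, const int inum,
                        const int ii, const int q) {
  return engv[q * inum + ii];
}
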
@@ -50,7 +50,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -68,9 +68,9 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -74,17 +74,17 @@ texture<int4,1> mu_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv=energy*(acctyp)0.5; \
-        engv+=inum; \
-        *engv=e_coul*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=energy*(acctyp)0.5; \
+        ei+=inum; \
+        engv[ei]=e_coul*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -114,17 +114,17 @@ texture<int4,1> mu_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv=energy*(acctyp)0.5; \
-        engv+=inum; \
-        *engv=e_coul*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=energy*(acctyp)0.5; \
+        ei+=inum; \
+        engv[ei]=e_coul*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -174,19 +174,19 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -418,11 +418,11 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -430,8 +430,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -50,7 +50,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -68,9 +68,9 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -187,11 +187,10 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

@@ -199,9 +198,9 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
    int itag=iv.w;

    numtyp factor_dpd;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_dpd = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -308,11 +307,10 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;

@@ -321,9 +319,9 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
    int itag=iv.w;

    numtyp factor_dpd;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_dpd = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -115,14 +115,15 @@ texture<int4> z2r_sp2_tex;
      } \
    } \
    if (offset==0) { \
+      int ei=ii; \
      if (eflag>0) { \
-        engv[ii]+=energy*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]+=energy*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          engv[ii]=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -171,15 +172,15 @@ texture<int4> z2r_sp2_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv+=energy*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]+=energy*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -209,17 +210,17 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
  acctyp energy = (acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

@@ -286,17 +287,17 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

@@ -354,18 +355,18 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

@@ -468,19 +469,19 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

@@ -58,7 +58,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -77,9 +77,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -31,14 +31,13 @@ texture<int4,1> pos_tex, quat_tex;
#endif

#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
-                    i, numj, stride, list_end, nbor) \
-  nbor=nbor_mem+ii; \
-  i=*nbor; \
-  nbor+=nbor_stride; \
-  numj=*nbor; \
-  nbor+=nbor_stride; \
-  list_end=nbor+fast_mul(nbor_stride,numj); \
-  nbor+=fast_mul(offset,nbor_stride); \
+                    i, numj, stride, nbor_end, nbor_begin) \
+  i=nbor_mem[ii]; \
+  nbor_begin=ii+nbor_stride; \
+  numj=nbor_mem[nbor_begin]; \
+  nbor_begin+=nbor_stride; \
+  nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \
+  nbor_begin+=fast_mul(offset,nbor_stride); \
  stride=fast_mul(t_per_atom,nbor_stride);

#if (ARCH < 300)

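The rewritten nbor_info_e hands kernels plain integer offsets into nbor_mem instead of __global pointers, so loop bounds compare ints and every access is an explicit index. The two iteration styles, side by side in plain C (array and variable names are stand-ins):

/* Old style: march a pointer to a precomputed end pointer. */
static int sum_ptr(const int *list, const int *list_end, const int stride) {
  int s = 0;
  for (const int *n = list; n < list_end; n += stride)
    s += *n;                 /* neighbor read via dereference */
  return s;
}

/* New style: march an integer offset and index the base array. */
static int sum_idx(const int *nbor_mem, const int begin, const int end,
                   const int stride) {
  int s = 0;
  for (int n = begin; n < end; n += stride)
    s += nbor_mem[n];        /* same reads, explicit indexing */
  return s;
}

Both loops touch the same elements; the index form simply avoids arithmetic on __global pointers inside the macro, which some OpenCL targets handle poorly (a presumed motivation, not stated in the commit).
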
@@ -41,20 +41,19 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
  int ii=GLOBAL_ID_X+start;

  if (ii<inum) {
-    const __global int *nbor=dev_ij+ii;
-    int i=*nbor;
-    nbor+=nbor_pitch;
-    int numj=*nbor;
-    nbor+=nbor_pitch;
-    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
-    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+    int i=dev_ij[ii];
+    int nbor=ii+nbor_pitch;
+    int numj=dev_ij[nbor];
+    nbor+=nbor_pitch;
+    int nbor_end=nbor+fast_mul(numj,nbor_pitch);
+    int packed=ii+nbor_pitch+nbor_pitch;

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul(iw,ntypes);
    int newj=0;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      int j=dev_ij[nbor];
      j &= NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

@@ -70,7 +69,7 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
        rsq+=t*t;

      if (rsq<cf.x) {
-        *packed=j;
+        dev_nbor[packed]=j;
        packed+=nbor_pitch;
        newj++;
      }

@@ -105,21 +104,20 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor=dev_ij+ii;
-    int i=*nbor;
-    nbor+=nbor_pitch;
-    int numj=*nbor;
-    nbor+=nbor_pitch;
-    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
-    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+    int i=dev_ij[ii];
+    int nbor=ii+nbor_pitch;
+    int numj=dev_ij[nbor];
+    nbor+=nbor_pitch;
+    int nbor_end=nbor+fast_mul(numj,nbor_pitch);
+    int packed=ii+nbor_pitch+nbor_pitch;

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    int newj=0;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      int j=dev_ij[nbor];
      j &= NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

@@ -135,7 +133,7 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
        rsq+=t*t;

      if (rsq<cutsq[mtype]) {
-        *packed=j;
+        dev_nbor[packed]=j;
        packed+=nbor_pitch;
        newj++;
      }

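kernel_nbor's storage convention is easier to see outside the diff: for slot ii, the list holds the atom index at ii, the neighbor count one pitch in, and packed neighbors starting two pitches in, spaced one pitch apart. A host-side sketch of reading one atom's packed list under that assumption:

/* Sketch: pitched (column-style) neighbor storage, as written above.
   Copies at most max_out packed neighbors of slot ii into out[]. */
static int read_packed(const int *dev_nbor, const int nbor_pitch,
                       const int ii, int *out, const int max_out) {
  int numj = dev_nbor[ii + nbor_pitch];
  int nbor = ii + 2 * nbor_pitch;        /* first packed neighbor */
  int n = (numj < max_out) ? numj : max_out;
  for (int k = 0; k < n; k++, nbor += nbor_pitch)
    out[k] = dev_nbor[nbor];
  return n;
}
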
@@ -51,19 +51,19 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -138,20 +138,20 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -48,7 +48,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -65,9 +65,9 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -119,7 +119,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,

@@ -140,7 +140,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int j=*nbor;
+      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -52,7 +52,7 @@ int gb_gpu_init(const int ntypes, const double gamma,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -71,9 +71,9 @@ int gb_gpu_init(const int ntypes, const double gamma,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,

@@ -68,7 +68,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -276,19 +276,19 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex);
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -371,20 +371,20 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -28,7 +28,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
                   const __global numtyp4 *restrict lj1,
                   const __global numtyp4 *restrict lj3,
                   const int lj_types,
-                   const __global numtyp *restrict sp_lj_in,
+                   const __global numtyp *restrict sp_lj,
                   const __global int * dev_nbor,
                   const __global int * dev_packed,
                   __global acctyp4 *restrict ans,

@@ -38,12 +38,6 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

-  __local numtyp sp_lj[4];
-  sp_lj[0]=sp_lj_in[0];
-  sp_lj[1]=sp_lj_in[1];
-  sp_lj[2]=sp_lj_in[2];
-  sp_lj[3]=sp_lj_in[3];
-
  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;

@@ -52,19 +46,18 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -142,20 +135,19 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

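Throughout these kernels the packed neighbor value j carries more than an index: sbmask(j) selects one of the four special-bond factors in sp_lj, and j &= NEIGHMASK strips those bits to recover the real atom index. A sketch of the decode step, assuming the usual LAMMPS encoding (top two bits, SBBITS = 30):

/* Sketch: decode a packed neighbor entry into (factor, index).
   The bit layout is the standard LAMMPS one and is assumed here. */
#define SBBITS    30
#define NEIGHMASK 0x3FFFFFFF
static int sbmask(const int j) { return j >> SBBITS & 3; }

static int decode_neighbor(const int j, const double *sp_lj,
                           double *factor) {
  *factor = sp_lj[sbmask(j)];   /* 1.0 or the 1-2/1-3/1-4 scale factor */
  return j & NEIGHMASK;         /* plain neighbor index */
}
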
@@ -52,19 +52,19 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -143,20 +143,20 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -48,7 +48,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -65,9 +65,9 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -65,18 +65,18 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -187,19 +187,19 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -51,7 +51,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -69,9 +69,9 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -65,18 +65,18 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -178,19 +178,19 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -66,18 +66,18 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -186,19 +186,19 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -51,7 +51,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -69,9 +69,9 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -50,7 +50,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -68,9 +68,9 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -80,7 +80,7 @@ int LJCoulLongT::init(const int ntypes,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq, host_cut_ljsq);
+                        host_cutsq, host_cut_ljsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,

@@ -102,6 +102,23 @@ int LJCoulLongT::init(const int ntypes,
  return 0;
}

+template <class numtyp, class acctyp>
+void LJCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
+                         double **host_lj2, double **host_lj3, double **host_lj4,
+                         double **host_offset, double **host_cut_ljsq) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+
+  this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq, host_cut_ljsq);
+  this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset);
+}
+
template <class numtyp, class acctyp>
void LJCoulLongT::clear() {
  if (!_allocated)

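reinit repacks the coefficient tables into the existing device allocations (note that it reuses lj1 and lj3 without re-allocating them), which is what lets a host-side fix adapt rescale pair coefficients mid-run without tearing the style down. A hypothetical call sequence (the driver function is illustrative, not part of this diff):

// Hypothetical driver: after the host rescales its coefficient tables,
// push them to the device in one repack; no device realloc occurs.
template <class numtyp, class acctyp>
void push_adapted_coeffs(LJCoulLong<numtyp,acctyp> &pair, const int ntypes,
                         double **cutsq, double **lj1, double **lj2,
                         double **lj3, double **lj4, double **offset,
                         double **cut_ljsq) {
  pair.reinit(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, cut_ljsq);
}
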
@@ -65,18 +65,18 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -183,19 +183,19 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -46,6 +46,11 @@ class LJCoulLong : public BaseCharge<numtyp, acctyp> {
           const double host_cut_coulsq, double *host_special_coul,
           const double qqrd2e, const double g_ewald);

+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_lj1, double **host_lj2, double **host_lj3,
+              double **host_lj4, double **host_offset, double **host_cut_ljsq);
+
  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

@@ -51,7 +51,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -69,9 +69,9 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -93,6 +93,29 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  return init_ok;
}

+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
+                     double **host_lj2, double **host_lj3, double **host_lj4,
+                     double **offset, double **host_cut_ljsq) {
+  int world_me=LJCLMF.device->world_me();
+  int gpu_rank=LJCLMF.device->gpu_rank();
+  int procs_per_gpu=LJCLMF.device->procs_per_gpu();
+
+  if (world_me==0)
+    LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                  offset, host_cut_ljsq);
+  LJCLMF.device->world_barrier();
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                    offset, host_cut_ljsq);
+    LJCLMF.device->gpu_barrier();
+  }
+}
+
void ljcl_gpu_clear() {
  LJCLMF.clear();
}

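ljcl_gpu_reinit reuses the staggered-update protocol of the *_gpu_init wrappers: world rank 0 updates first while everyone waits at the world barrier, then the remaining ranks sharing a device take one gpu_barrier round each. A compressed sketch of the protocol in plain C (function pointers stand in for the Device methods):

/* Sketch: rank-staggered device update, as used by ljcl_gpu_reinit. */
static void staggered_update(const int world_me, const int gpu_rank,
                             const int procs_per_gpu,
                             void (*do_update)(void),
                             void (*world_barrier)(void),
                             void (*gpu_barrier)(void)) {
  if (world_me == 0)
    do_update();              /* rank 0 goes first */
  world_barrier();
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      do_update();            /* one rank per device per round */
    gpu_barrier();
  }
}
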
@@ -118,11 +118,11 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -130,8 +130,8 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,

    numtyp cut_coul = ucl_sqrt(cut_coulsq);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -239,11 +239,11 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -252,8 +252,8 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,

    numtyp cut_coul = ucl_sqrt(cut_coulsq);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -51,7 +51,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -70,9 +70,9 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -68,11 +68,11 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -84,8 +84,8 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
      e_coul += (acctyp)2.0*e_self;
    }

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
      factor_lj = sp_lj[sbmask(j)];

@@ -195,11 +195,11 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -212,8 +212,8 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
      e_coul += (acctyp)2.0*e_self;
    }

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
      factor_lj = sp_lj[sbmask(j)];

@@ -52,7 +52,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -71,9 +71,9 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -54,19 +54,19 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -147,20 +147,20 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -49,7 +49,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -66,9 +66,9 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -48,7 +48,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -65,9 +65,9 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -55,19 +55,18 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -161,20 +160,19 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -52,19 +52,19 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -143,20 +143,20 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
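The `_fast` variants differ from the plain kernels in the line `int itype=fast_mul((int)MAX_SHARED_TYPES,iw);`: per-pair coefficients are staged in shared memory and addressed by a flattened (itype, jtype) index, with the row offset hoisted out of the neighbor loop. A hedged sketch of that flattened lookup (the 2D-to-1D mapping follows the kernels above; the MAX_SHARED_TYPES value and table contents are illustrative):

    #include <cstdio>

    // Flattened 2D coefficient lookup as used by the *_fast kernels.
    constexpr int MAX_SHARED_TYPES = 8;   // per-architecture; 8 is illustrative

    int main() {
      double lj1[MAX_SHARED_TYPES * MAX_SHARED_TYPES] = {0};
      lj1[2 * MAX_SHARED_TYPES + 5] = 1.25;   // coefficient for type pair (2,5)

      int iw = 2;                          // atom i's type
      int itype = MAX_SHARED_TYPES * iw;   // row offset, computed once per atom
      int jw = 5;                          // neighbor j's type
      int mtype = itype + jw;              // full flattened index
      std::printf("%g\n", lj1[mtype]);     // prints 1.25
      return 0;
    }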
@@ -50,7 +50,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -68,9 +68,9 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -54,19 +54,19 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -145,20 +145,20 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -49,7 +49,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -66,9 +66,9 @@ int mor_gpu_init(const int ntypes, double **cutsq,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -65,6 +65,11 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
   else
     _alloc_packed=false;

+  if (pre_cut)
+    _packed_permissions=UCL_READ_WRITE;
+  else
+    _packed_permissions=UCL_READ_ONLY;
+
   bool success=true;

   // Initialize timers for the selected GPU

@@ -121,7 +126,7 @@ void Neighbor::alloc(bool &success) {
   if (_alloc_packed) {
     dev_packed.clear();
     success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
-                                         UCL_READ_ONLY)==UCL_SUCCESS);
+                                         _packed_permissions)==UCL_SUCCESS);
     _c_bytes+=dev_packed.row_bytes();
   }
   if (_max_host>0) {
@@ -227,6 +227,7 @@ class Neighbor {
   int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
   bool _gpu_host, _alloc_packed;
   double _cutoff, _cell_size, _bin_time;
+  enum UCL_MEMOPT _packed_permissions;

   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);
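Taken together, the three neighbor hunks route one decision through a stored member: init() records whether the packed list must be writable (presumably because a pre-cutoff build rewrites the packed entries in place), and alloc() forwards that record instead of hard-coding UCL_READ_ONLY. A reduced C++ sketch of the plumbing; the UCL names follow the diff, but the enum values and class body here are stand-ins, not the Geryon source:

    // Reduced illustration of the permission plumbing; not the full class.
    enum UCL_MEMOPT { UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY };

    class NeighborSketch {
     public:
      void init(bool pre_cut) {
        // When a pre-cutoff pass must trim the packed list on the device,
        // the buffer has to be writable; otherwise read-only suffices.
        if (pre_cut)
          _packed_permissions = UCL_READ_WRITE;
        else
          _packed_permissions = UCL_READ_ONLY;
      }
      void alloc() {
        // Previously: dev_packed.alloc(..., UCL_READ_ONLY);
        // Now the stored choice is forwarded:
        // dev_packed.alloc((_max_nbors+2)*_max_atoms, *dev, _packed_permissions);
      }
     private:
      enum UCL_MEMOPT _packed_permissions;
    };

    int main() {
      NeighborSketch n;
      n.init(true);    // pre-cut build: packed list must be writable
      n.init(false);   // plain build: read-only is enough
      return 0;
    }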
@@ -25,17 +25,17 @@ __kernel void kernel_unpack(__global int *dev_nbor,
   int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;

   if (ii<inum) {
-    __global int *nbor=dev_nbor+ii+inum;
-    int numj=*nbor;
+    int nbor=ii+inum;
+    int numj=dev_nbor[nbor];
     nbor+=inum;
-    const __global int *list=dev_ij+*nbor;
-    const __global int *list_end=list+numj;
+    int list=dev_nbor[nbor];
+    int list_end=list+numj;
     list+=offset;
     nbor+=fast_mul(ii,t_per_atom-1)+offset;
     int stride=fast_mul(t_per_atom,inum);

     for ( ; list<list_end; list++) {
-      *nbor=*list;
+      dev_nbor[nbor]=dev_ij[list];
       nbor+=stride;
     }
   } // if ii
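kernel_unpack applies the same index conversion to a writer: t_per_atom work-items share one atom, and each value a work-item stores lands a full `fast_mul(t_per_atom,inum)` apart, so the lanes of one atom occupy consecutive slots in every stride. A small sketch of the destination arithmetic, which mirrors the kernel (the per-lane copy counts and tag values here are invented for display):

    #include <cstdio>
    #include <vector>

    // Mirrors kernel_unpack's destination index: the m-th value written for
    // atom ii at lane 'offset' lands at
    //   2*inum + ii*t_per_atom + offset + m*(t_per_atom*inum)
    int main() {
      const int inum = 3, t_per_atom = 2, per_lane = 4;
      const int header = 2 * inum;          // per-atom counts and list offsets
      std::vector<int> dev_nbor(header + per_lane * t_per_atom * inum, -1);

      for (int ii = 0; ii < inum; ii++)
        for (int offset = 0; offset < t_per_atom; offset++) {
          int nbor = header + ii * t_per_atom + offset;
          const int stride = t_per_atom * inum;
          for (int m = 0; m < per_lane; m++, nbor += stride)
            dev_nbor[nbor] = 100 * ii + 10 * offset + m;  // tag for display
        }

      for (int n = header; n < (int)dev_nbor.size(); n++)
        std::printf("%d ", dev_nbor[n]);    // shows the interleaved layout
      std::printf("\n");
      return 0;
    }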
@@ -160,7 +160,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,

   // Allocate error flags for checking out of bounds atoms
   success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
+                                       UCL_READ_WRITE)==UCL_SUCCESS);
   if (!success) {
     flag=-3;
     return 0;
@@ -51,7 +51,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -69,9 +69,9 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -333,6 +333,54 @@ inline double shfl_xor(double var, int laneMask, int width) {

 #endif

+// -------------------------------------------------------------------------
+// INTEL CPU OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+
+#ifdef INTEL_OCL
+
+#define USE_OPENCL
+#define MEM_THREADS 16
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
+#define BLOCK_PAIR 1
+#define MAX_SHARED_TYPES 0
+#define BLOCK_NBOR_BUILD 4
+#define BLOCK_BIO_PAIR 2
+#define BLOCK_ELLIPSE 2
+
+#define WARP_SIZE 1
+#define PPPM_BLOCK_1D 32
+#define BLOCK_CELL_2D 1
+#define BLOCK_CELL_ID 2
+#define MAX_BIO_SHARED_TYPES 0
+
+#endif
+
+// -------------------------------------------------------------------------
+// INTEL PHI OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+
+#ifdef PHI_OCL
+
+#define USE_OPENCL
+#define MEM_THREADS 16
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
+#define BLOCK_PAIR 16
+#define MAX_SHARED_TYPES 0
+#define BLOCK_NBOR_BUILD 16
+#define BLOCK_BIO_PAIR 16
+#define BLOCK_ELLIPSE 16
+
+#define WARP_SIZE 1
+#define PPPM_BLOCK_1D 32
+#define BLOCK_CELL_2D 4
+#define BLOCK_CELL_ID 16
+#define MAX_BIO_SHARED_TYPES 0
+
+#endif
+
 // -------------------------------------------------------------------------
 // GENERIC OPENCL DEFINITIONS
 // -------------------------------------------------------------------------

@@ -433,7 +481,9 @@ inline double shfl_xor(double var, int laneMask, int width) {
 // ARCHITECTURE INDEPENDENT DEFINITIONS
 // -------------------------------------------------------------------------

+#ifndef PPPM_MAX_SPLINE
 #define PPPM_MAX_SPLINE 8
+#endif

 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
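The two new blocks add per-target tuning constants for Intel CPU and Intel Phi OpenCL builds, selected by a compile-time symbol, and the new guard makes PPPM_MAX_SPLINE overridable the same way. A hedged sketch of how such a selection is typically driven (the -D flags and the generic fallback value are illustrative; the actual build targets are not shown in this diff):

    // tune.cpp -- compile with e.g.  g++ -DINTEL_OCL tune.cpp
    //             or                 g++ -DPHI_OCL -DPPPM_MAX_SPLINE=16 tune.cpp
    #include <cstdio>

    #ifdef INTEL_OCL
    #define BLOCK_PAIR 1          // mirrors the INTEL_OCL block above
    #elif defined(PHI_OCL)
    #define BLOCK_PAIR 16         // wider blocks for the Phi target
    #else
    #define BLOCK_PAIR 64         // stand-in generic default (assumption)
    #endif

    #ifndef PPPM_MAX_SPLINE       // overridable default, as in the hunk above
    #define PPPM_MAX_SPLINE 8
    #endif

    int main() {
      std::printf("BLOCK_PAIR=%d PPPM_MAX_SPLINE=%d\n",
                  BLOCK_PAIR, PPPM_MAX_SPLINE);
      return 0;
    }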
@@ -74,7 +74,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,

@@ -125,7 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,

     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int j=*nbor;
+      int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -50,7 +50,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -68,9 +68,9 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -172,7 +172,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,

@@ -199,7 +199,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,

     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int j=*nbor;
+      int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -424,7 +424,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int j, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,

@@ -435,7 +435,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,

     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int i=*nbor;
+      int i=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(i)];
       i &= NEIGHMASK;

@@ -616,19 +616,19 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -710,20 +710,20 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -53,19 +53,19 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -141,20 +141,20 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -48,7 +48,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -65,9 +65,9 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -156,18 +156,18 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];

-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -195,17 +195,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
       numtyp sw_cut=sw3_ijparam.x;
       numtyp sw_cutsq=sw3_ijparam.y;
       numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
-                       ucl_powr(sw_sigma,sw_powerp);
+                       pow(sw_sigma,sw_powerp);
       numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
-                       ucl_powr(sw_sigma,sw_powerq);
+                       pow(sw_sigma,sw_powerq);
       numtyp pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
-                       ucl_powr(sw_sigma,sw_powerp+(numtyp)1.0);
+                       pow(sw_sigma,sw_powerp+(numtyp)1.0);
       numtyp pre_sw_c4=sw_biga*sw_epsilon*
-                       ucl_powr(sw_sigma,sw_powerq+(numtyp)1.0);
+                       pow(sw_sigma,sw_powerq+(numtyp)1.0);
       numtyp pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
-                       ucl_powr(sw_sigma,sw_powerp);
+                       pow(sw_sigma,sw_powerp);
       numtyp pre_sw_c6=sw_biga*sw_epsilon*
-                       ucl_powr(sw_sigma,sw_powerq);
+                       pow(sw_sigma,sw_powerq);

       numtyp r=ucl_sqrt(rsq);
       numtyp rp=ucl_powr(r,-sw_powerp);
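Every precomputed Stillinger-Weber coefficient switches from ucl_powr to pow here, presumably for portability across the newly supported OpenCL CPU targets: OpenCL's powr(x,y) is only defined for x >= 0 and may carry weaker guarantees than pow. A hedged host-side stand-in demonstrating the domain difference (the real ucl_powr mapping lives in the Geryon/preprocessor headers, not in this diff):

    #include <cmath>
    #include <cstdio>

    // Illustrative only: powr-like semantics via exp(y*log(x)), which is
    // undefined (NaN) for negative bases; pow has the full C semantics.
    static double powr_like(double x, double y) {
      return std::exp(y * std::log(x));
    }

    int main() {
      double sigma = 2.0951, p = 4.0;             // sample SW-style inputs
      std::printf("pow: %.12g  powr-like: %.12g\n",
                  std::pow(sigma, p), powr_like(sigma, p));
      std::printf("pow(-2,3): %g  powr-like(-2,3): %g (undefined domain)\n",
                  std::pow(-2.0, 3.0), powr_like(-2.0, 3.0));
      return 0;
    }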
@@ -343,6 +343,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                 const int t_per_atom, const int evatom) {
   __local int tpa_sq, n_stride;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
   numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
   numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@@ -359,21 +360,20 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor_j, *list_end;
-    int i, numj;
+    int i, numj, nbor_j, nbor_end;

     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
-              n_stride,list_end,nbor_j);
+              n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];

-    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {

-      int j=*nbor_j;
+      int j=dev_packed[nbor_j];
       j &= NEIGHMASK;

       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -392,15 +392,17 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
       if (rsq1 > sw3_ijparam.y) continue;

       numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
       sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
       sw_cut_ij=sw3_ijparam.x;

-      const __global int *nbor_k=nbor_j-offset_j+offset_k;
+      int nbor_k=nbor_j-offset_j+offset_k;
       if (nbor_k<=nbor_j)
         nbor_k+=n_stride;

-      for ( ; nbor_k<list_end; nbor_k+=n_stride) {
-        int k=*nbor_k;
+      for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
+        int k=dev_packed[nbor_k];
         k &= NEIGHMASK;

         numtyp4 kx; fetch4(kx,k,pos_tex);

@@ -415,11 +417,15 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
         if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
           numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
           sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
           sw_cut_ik=sw3_ikparam.x;

           int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
           numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
           sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
           sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
           numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@@ -466,6 +472,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                              const int t_per_atom) {
   __local int tpa_sq, n_stride;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
   numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
   numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@@ -482,20 +489,19 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor_j, *list_end, *k_end;
-    int i, numj;
+    int i, numj, nbor_j, nbor_end, k_end;

     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
-              n_stride,list_end,nbor_j);
+              n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];

-    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
-      int j=*nbor_j;
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
+      int j=dev_packed[nbor_j];
       j &= NEIGHMASK;

       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -513,27 +519,27 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,

       if (rsq1 > sw3_ijparam.y) continue;

-      int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype];
-      numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex);
-      numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex);
-      sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma;
-      sw_cut_ij=sw3_jiparam.x;
+      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
+      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
+      sw_cut_ij=sw3_ijparam.x;

-      const __global int *nbor_k=dev_nbor+j+nbor_pitch;
-      int numk=*nbor_k;
+      int nbor_k=j+nbor_pitch;
+      int numk=dev_nbor[nbor_k];
       if (dev_nbor==dev_packed) {
         nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
         k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
         nbor_k+=offset_k;
       } else {
         nbor_k+=nbor_pitch;
-        nbor_k=dev_packed+*nbor_k;
+        nbor_k=dev_nbor[nbor_k];
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }

       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=*nbor_k;
+        int k=dev_packed[nbor_k];
         k &= NEIGHMASK;

         if (k == i) continue;

@@ -541,25 +547,29 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
         ktype=map[ktype];
-        int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype];
+        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];

         numtyp delr2x = kx.x - jx.x;
         numtyp delr2y = kx.y - jx.y;
         numtyp delr2z = kx.z - jx.z;
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
-        numtyp4 sw3_jkparam; fetch4(sw3_jkparam,jkparam,sw3_tex);
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);

-        if (rsq2 < sw3_jkparam.y) {
-          numtyp4 sw1_jkparam; fetch4(sw1_jkparam,jkparam,sw1_tex);
-          sw_sigma_gamma_ik=sw1_jkparam.y*sw1_jkparam.w; //sw_sigma*sw_gamma;
-          sw_cut_ik=sw3_jkparam.x;
+        if (rsq2 < sw3_ikparam.y) {
+          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
+          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
+          sw_cut_ik=sw3_ikparam.x;

-          int jikparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype];
-          numtyp4 sw1_jikparam; fetch4(sw1_jikparam,jikparam,sw1_tex);
-          sw_lambda_epsilon_ijk=sw1_jikparam.x*sw1_jikparam.z; //sw_lambda*sw_epsilon;
+          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
+          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
+          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
           sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
-          numtyp4 sw3_jikparam; fetch4(sw3_jikparam,jikparam,sw3_tex);
-          sw_costheta_ijk=sw3_jikparam.z;
+          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
+          sw_costheta_ijk=sw3_ijkparam.z;

           numtyp fjx, fjy, fjz;
           //if (evatom==0) {
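Besides the index refactor, this hunk changes which element triple is looked up: the k-leg cutoff test now uses ikparam (the i-k pair) rather than jkparam, and the three-body term uses ijkparam ordered (i,j,k) instead of the previous (j,i,k). The elem2param table itself is a flattened nelements^3 cube; a small sketch of the indexing (the element count and parameter ids are invented):

    #include <cstdio>

    // elem2param is indexed as a flattened 3D array, exactly as in the
    // kernels: elem2param[a*nelements*nelements + b*nelements + c].
    int main() {
      const int nelements = 2;                    // illustrative
      int elem2param[nelements * nelements * nelements];
      for (int n = 0; n < nelements * nelements * nelements; n++)
        elem2param[n] = n;                        // stand-in parameter ids

      int itype = 0, jtype = 1, ktype = 1;
      // Old lookup for the k leg, triple (j,k,k); new lookup, triple (i,k,k):
      int jkparam = elem2param[jtype*nelements*nelements + ktype*nelements + ktype];
      int ikparam = elem2param[itype*nelements*nelements + ktype*nelements + ktype];
      // Three-body parameters, now ordered (i,j,k):
      int ijkparam = elem2param[itype*nelements*nelements + jtype*nelements + ktype];
      std::printf("jk=%d ik=%d ijk=%d\n", jkparam, ikparam, ijkparam);
      return 0;
    }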
@@ -602,6 +612,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                                    const int t_per_atom) {
   __local int tpa_sq, n_stride;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
   numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
   numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@@ -618,20 +629,19 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor_j, *list_end, *k_end;
-    int i, numj;
+    int i, numj, nbor_j, nbor_end, k_end;

     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
-              n_stride,list_end,nbor_j);
+              n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];

-    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
-      int j=*nbor_j;
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
+      int j=dev_packed[nbor_j];
       j &= NEIGHMASK;

       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -649,27 +659,27 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,

       if (rsq1 > sw3_ijparam.y) continue;

-      int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype];
-      numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex);
-      numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex);
-      sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma;
-      sw_cut_ij=sw3_jiparam.x;
-
-      const __global int *nbor_k=dev_nbor+j+nbor_pitch;
-      int numk=*nbor_k;
+      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
+      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
+      sw_cut_ij=sw3_ijparam.x;
+
+      int nbor_k=j+nbor_pitch;
+      int numk=dev_nbor[nbor_k];
       if (dev_nbor==dev_packed) {
         nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
         k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
         nbor_k+=offset_k;
       } else {
         nbor_k+=nbor_pitch;
-        nbor_k=dev_packed+*nbor_k;
+        nbor_k=dev_nbor[nbor_k];
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }

       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=*nbor_k;
+        int k=dev_packed[nbor_k];
         k &= NEIGHMASK;

         if (k == i) continue;

@@ -677,25 +687,29 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
         ktype=map[ktype];
-        int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype];
-        numtyp4 sw3_jkparam; fetch4(sw3_jkparam,jkparam,sw3_tex);
+        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);

         numtyp delr2x = kx.x - jx.x;
         numtyp delr2y = kx.y - jx.y;
         numtyp delr2z = kx.z - jx.z;
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;

-        if (rsq2 < sw3_jkparam.y) {
-          numtyp4 sw1_jkparam; fetch4(sw1_jkparam,jkparam,sw1_tex);
-          sw_sigma_gamma_ik=sw1_jkparam.y*sw1_jkparam.w; //sw_sigma*sw_gamma;
-          sw_cut_ik=sw3_jkparam.x;
+        if (rsq2 < sw3_ikparam.y) {
+          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
+          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
+          sw_cut_ik=sw3_ikparam.x;

-          int jikparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype];
-          numtyp4 sw1_jikparam; fetch4(sw1_jikparam,jikparam,sw1_tex);
-          sw_lambda_epsilon_ijk=sw1_jikparam.x*sw1_jikparam.z; //sw_lambda*sw_epsilon;
+          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
+          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
+          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
           sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
-          numtyp4 sw3_jikparam; fetch4(sw3_jikparam,jikparam,sw3_tex);
-          sw_costheta_ijk=sw3_jikparam.z;
+          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
+          sw_costheta_ijk=sw3_ijkparam.z;

           numtyp fjx, fjy, fjz, fkx, fky, fkz;
           threebody(delr1x,delr1y,delr1z,eflag,energy);
@@ -73,19 +73,19 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -171,20 +171,20 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -269,19 +269,19 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -371,20 +371,20 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -473,19 +473,19 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -582,20 +582,20 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -693,19 +693,19 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -800,20 +800,20 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -48,7 +48,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -65,9 +65,9 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -51,19 +51,19 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -140,20 +140,20 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -58,20 +58,20 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp radi; fetch(radi,i,rad_tex);
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -150,11 +150,11 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp radi; fetch(radi,i,rad_tex);

@@ -162,9 +162,9 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -48,7 +48,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -65,9 +65,9 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -48,7 +48,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -65,9 +65,9 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }