git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12655 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2014-10-29 15:47:24 +00:00
parent c21dee6f53
commit 7510ec7960
95 changed files with 850 additions and 745 deletions

View File

@ -1 +1 @@
Geryon Version 13.209
Geryon Version 13.234

View File

@ -159,6 +159,12 @@ class UCL_Device {
/// Returns true if double precision is support for the device
inline bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); }
/// Get the number of compute units
inline unsigned cus(const int i)
{ return _properties[i].multiProcessorCount; }
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores

View File

@ -185,13 +185,12 @@ class UCL_Device {
inline bool double_precision(const int i)
{return _properties[i].double_precision;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (device_type(i)==UCL_CPU) return _properties[i].compute_units;
else return _properties[i].compute_units*8; }
/// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); }
/// Get the number of compute units
inline unsigned cus(const int i)
{ return _properties[i].compute_units; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory

View File

@ -127,7 +127,8 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
mat.host_ptr(), &error_flag);
*mat.host_ptr(), &error_flag);
CL_CHECK_ERR(error_flag);
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;
@ -174,7 +175,7 @@ template <class mat_type>
inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
cl_int error_flag;
mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
n,mat.host_ptr(),&error_flag);
n,*mat.host_ptr(),&error_flag);
CL_CHECK_ERR(error_flag);
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;

View File

@ -221,7 +221,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
return energy_virial(eatom,vatom,virial);
double evdwl=0.0;
int vstart=0, iend=_inum;
int ii, vstart=0, iend=_inum;
if (_eflag) {
iend=_inum*2;
for (int i=0; i<_inum; i++)
@ -235,10 +235,10 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=_inum; i<iend; i++)
eatom[i]+=engv[i];
} else {
for (int i=0; i<_inum; i++)
eatom[_ilist[i]]+=engv[i];
for (int i=_inum; i<iend; i++)
eatom[_ilist[i]]+=engv[i];
for (int i=0, ii=0; i<_inum; i++)
eatom[_ilist[ii++]]+=engv[i];
for (int i=_inum, ii=0; i<iend; i++)
eatom[_ilist[ii++]]+=engv[i];
}
vstart=iend;
iend+=_inum;
@ -249,12 +249,10 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
virial[j]+=engv[i];
if (_vf_atom)
if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++)
vatom[ii++][j]+=engv[i];
vatom[i++][j]+=engv[i];
} else {
int ii=0;
for (int i=vstart; i<iend; i++)
for (int i=vstart, ii=0; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
vstart+=_inum;

View File

@ -23,22 +23,21 @@
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, list_end, nbor) \
nbor=nbor_mem+ii; \
i=*nbor; \
nbor+=nbor_stride; \
numj=*nbor; \
i, numj, stride, nbor_end, nbor_begin) \
i=nbor_mem[ii]; \
nbor_begin=ii+nbor_stride; \
numj=nbor_mem[nbor_begin]; \
if (nbor_mem==packed_mem) { \
nbor+=nbor_stride+fast_mul(ii,t_per_atom-1); \
nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
stride=fast_mul(t_per_atom,nbor_stride); \
list_end=nbor+fast_mul(numj/t_per_atom,stride)+ (numj & (t_per_atom-1)); \
nbor+=offset; \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
nbor_begin+=offset; \
} else { \
nbor+=nbor_stride; \
nbor=packed_mem+*nbor; \
list_end=nbor+numj; \
nbor_begin+=nbor_stride; \
nbor_begin=nbor_mem[nbor_begin]; \
nbor_end=nbor_begin+numj; \
stride=t_per_atom; \
nbor+=offset; \
nbor_begin+=offset; \
}
#if (ARCH < 300)
@ -75,15 +74,15 @@
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \
@ -123,17 +122,17 @@
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \
@ -158,15 +157,15 @@
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \
@ -190,17 +189,17 @@
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \

View File

@ -202,6 +202,7 @@ void BaseEllipsoidT::output_times() {
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
double max_mb=mpi_max_bytes/(1024*1024);
double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
if (device->replica_me()==0)
if (screen && times[5]>0.0) {
@ -209,11 +210,11 @@ void BaseEllipsoidT::output_times() {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen," Device Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1) {
if (device->procs_per_gpu()==1 && t_time>0) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
@ -226,7 +227,8 @@ void BaseEllipsoidT::output_times() {
}
if (nbor->gpu_nbor()==2)
fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size);
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
if (times[6]>0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);

View File

@ -52,19 +52,19 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -154,20 +154,20 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -48,7 +48,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -65,9 +65,9 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -53,19 +53,19 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -148,20 +148,20 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -66,18 +66,18 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -188,19 +188,19 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -52,7 +52,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -71,9 +71,9 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -69,11 +69,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -85,8 +85,8 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -105,7 +105,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
int mtype=itype*lj_types+jtype;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp r2inv = ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
numtyp forcecoul, forceborn, force, r6inv, prefactor;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
@ -124,10 +124,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
erfcc = erfc(arij);
const numtyp erfcc = erfc(arij);
v_sh = (erfcc - e_shift*r)*prefactor;
numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
forcecoul = prefactor * dvdrr*rsq*factor_coul;
forcecoul = prefactor * dvdrr*rsq;
if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
} else forcecoul = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
@ -137,8 +138,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += v_sh*factor_coul;
if (rsq < cut_coulsq) {
numtyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
e_coul += e;
}
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
@ -201,11 +205,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -218,8 +222,8 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -237,7 +241,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
if (rsq<cutsq_sigma[mtype].x) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
numtyp forcecoul, forceborn, force, r6inv, prefactor;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
@ -256,10 +260,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
erfcc = erfc(arij);
const numtyp erfcc = erfc(arij);
v_sh = (erfcc - e_shift*r)*prefactor;
numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
forcecoul = prefactor * dvdrr*rsq*factor_coul;
forcecoul = prefactor * dvdrr*rsq;
if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
} else forcecoul = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
@ -269,8 +274,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += v_sh*factor_coul;
if (rsq < cut_coulsq) {
numtyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
e_coul += e;
}
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;

View File

@ -52,7 +52,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -72,9 +72,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -50,7 +50,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -68,9 +68,9 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -52,19 +52,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -145,20 +145,20 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -65,18 +65,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -182,19 +182,19 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -51,7 +51,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -70,9 +70,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -66,18 +66,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -190,19 +190,19 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -52,7 +52,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -70,9 +70,9 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -49,7 +49,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -66,9 +66,9 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -52,19 +52,19 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -150,20 +150,20 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -49,7 +49,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -66,9 +66,9 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -65,18 +65,18 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -191,19 +191,19 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -51,7 +51,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -69,9 +69,9 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -66,13 +66,14 @@ int CHARMMLongT::init(const int ntypes,
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (this->_block_bio_size>=64 && mix_arithmetic)
int max_bio_shared_types=this->device->max_bio_shared_types();
if (this->_block_bio_size>=64 && mix_arithmetic &&
lj_types<=max_bio_shared_types)
shared_types=true;
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
int h_size=lj_types*lj_types;
int max_bio_shared_types=this->device->max_bio_shared_types();
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
@ -84,8 +85,10 @@ int CHARMMLongT::init(const int ntypes,
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_lj3,host_lj4);
ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
if (shared_types) {
ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
}
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {

View File

@ -32,7 +32,7 @@ texture<int2> q_tex;
__kernel void k_charmm_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global numtyp *restrict sp_lj,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
@ -47,16 +47,6 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
@ -66,18 +56,18 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -196,18 +186,18 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -53,7 +53,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -72,9 +72,9 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -55,19 +55,19 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -219,20 +219,20 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -51,7 +51,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -70,9 +70,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -60,20 +60,19 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) {
numtyp factor_coul;
int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)];
j &= NEIGHMASK;
@ -152,21 +151,21 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) {
numtyp factor_coul = sp_cl[sbmask(j)];
int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

View File

@ -30,19 +30,19 @@ texture<int2> q_tex;
#endif
__kernel void k_coul_debye(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale,
const int lj_types,
const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const numtyp kappa,
const int t_per_atom) {
const __global numtyp *restrict scale,
const int lj_types,
const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const numtyp kappa,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -61,20 +61,20 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul = sp_cl[sbmask(j)];
numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -158,21 +158,21 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) {
numtyp factor_coul = sp_cl[sbmask(j)];
int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

View File

@ -62,11 +62,11 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -77,8 +77,8 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_coul, r, prefactor, erfcc;
factor_coul = sp_lj[sbmask(j)];
@ -163,11 +163,11 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -178,8 +178,8 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_coul, r, prefactor, erfcc;
factor_coul = sp_lj[sbmask(j)];

View File

@ -49,7 +49,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -66,9 +66,9 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -124,7 +124,8 @@ texture<int2> q_tex;
#endif
__kernel void k_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor,
@ -153,44 +154,41 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq < cut_coulsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp force, prefactor, _erfc;
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * scale[mtype] * qtmp/r;
prefactor *= qqrd2e * qtmp/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
@ -198,7 +196,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;
@ -217,7 +215,8 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
}
__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale_in,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor,
const __global int *dev_packed,
@ -231,14 +230,10 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_cl[4];
if (tid<4)
sp_cl[tid]=sp_cl_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
scale[tid]=scale_in[tid];
}
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -249,27 +244,24 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw = ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
@ -280,13 +272,13 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
numtyp r2inv=ucl_recip(rsq);
numtyp force, prefactor, _erfc;
numtyp r = ucl_sqrt(rsq);
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * scale[mtype] * qtmp/r;
prefactor *= qqrd2e * qtmp/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
@ -294,7 +286,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;

View File

@ -48,7 +48,7 @@ int cl_gpu_init(const int ntypes, double **host_scale,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -65,9 +65,9 @@ int cl_gpu_init(const int ntypes, double **host_scale,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -48,7 +48,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double p_split, const int nthreads,
const int t_per_atom, const double cell_size,
char *ocl_vendor) {
char *ocl_vendor, const int block_pair) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
@ -66,6 +66,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
_gpu_mode=gpu_mode;
_particle_split=p_split;
_cell_size=cell_size;
_block_pair=block_pair;
// Get the rank/size within the world
MPI_Comm_rank(_comm_world,&_world_me);
@ -175,6 +176,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
} else if (s_vendor=="cypress") {
_ocl_vendor_name="AMD Cypress";
_ocl_vendor_string="-DCYPRESS_OCL";
} else if (s_vendor=="phi") {
_ocl_vendor_name="Intel Phi";
_ocl_vendor_string="-DPHI_OCL";
} else if (s_vendor=="intel") {
_ocl_vendor_name="Intel CPU";
_ocl_vendor_string="-DINTEL_OCL";
} else if (s_vendor=="generic") {
_ocl_vendor_name="GENERIC";
_ocl_vendor_string="-DGENERIC_OCL";
@ -352,10 +359,10 @@ void DeviceT::init_message(FILE *screen, const char *name,
for (int i=first_gpu; i<last; i++) {
std::string sname;
if (i==first_gpu)
sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+
toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ (";
else
sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+
toa(gpu->clock_rate(i))+" GHZ (";
if (sizeof(PRECISION)==4) {
if (sizeof(ACC_PRECISION)==4)
@ -520,7 +527,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
if (screen && times[5]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," Device Time Info (average): ");
fprintf(screen," Device Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
@ -582,7 +589,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," Device Time Info (average): ");
fprintf(screen," Device Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
@ -672,7 +679,7 @@ int DeviceT::compile_kernels() {
_threads_per_charge=gpu_lib_data[13];
_pppm_max_spline=gpu_lib_data[4];
_pppm_block=gpu_lib_data[5];
_block_pair=gpu_lib_data[6];
if (_block_pair == -1) _block_pair=gpu_lib_data[6];
_max_shared_types=gpu_lib_data[7];
_block_cell_2d=gpu_lib_data[8];
_block_cell_id=gpu_lib_data[9];
@ -714,10 +721,10 @@ int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size,
char *opencl_vendor) {
char *opencl_vendor, const int block_pair) {
return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads,t_per_atom,
cell_size,opencl_vendor);
cell_size,opencl_vendor,block_pair);
}
void lmp_clear_device() {

View File

@ -51,7 +51,7 @@ class Device {
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size,
char *vendor_string);
char *vendor_string, const int block_pair);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
@ -96,12 +96,12 @@ class Device {
const int first_gpu, const int last_gpu);
/// Perform charge assignment asynchronously for PPPM
void set_single_precompute(PPPM<numtyp,acctyp,
float,_lgpu_float4> *pppm);
void set_single_precompute(PPPM<numtyp,acctyp,
float,_lgpu_float4> *pppm);
/// Perform charge assignment asynchronously for PPPM
void set_double_precompute(PPPM<numtyp,acctyp,
double,_lgpu_double4> *pppm);
void set_double_precompute(PPPM<numtyp,acctyp,
double,_lgpu_double4> *pppm);
/// Esimate the overhead from GPU calls from multiple procs
/** \param kernel_calls Number of kernel calls/timestep for timing estimated

View File

@ -73,17 +73,17 @@ texture<int4,1> mu_tex;
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \
@ -113,17 +113,17 @@ texture<int4,1> mu_tex;
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \
@ -173,19 +173,19 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -385,11 +385,11 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -397,8 +397,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -50,7 +50,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -68,9 +68,9 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -74,17 +74,17 @@ texture<int4,1> mu_tex;
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \
@ -114,17 +114,17 @@ texture<int4,1> mu_tex;
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \
@ -174,19 +174,19 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -418,11 +418,11 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -430,8 +430,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -50,7 +50,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -68,9 +68,9 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -187,11 +187,10 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
@ -199,9 +198,9 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
int itag=iv.w;
numtyp factor_dpd;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_dpd = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -308,11 +307,10 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
@ -321,9 +319,9 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
int itag=iv.w;
numtyp factor_dpd;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_dpd = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -115,14 +115,15 @@ texture<int4> z2r_sp2_tex;
} \
} \
if (offset==0) { \
int ei=ii; \
if (eflag>0) { \
engv[ii]+=energy*(acctyp)0.5; \
engv+=inum; \
engv[ei]+=energy*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
engv[ii]=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \
@ -171,15 +172,15 @@ texture<int4> z2r_sp2_tex;
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
engv[ei]+=energy*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
ans[ii]=f; \
@ -209,17 +210,17 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
acctyp energy = (acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -286,17 +287,17 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -354,18 +355,18 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -468,19 +469,19 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

View File

@ -58,7 +58,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -77,9 +77,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -31,14 +31,13 @@ texture<int4,1> pos_tex, quat_tex;
#endif
#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, list_end, nbor) \
nbor=nbor_mem+ii; \
i=*nbor; \
nbor+=nbor_stride; \
numj=*nbor; \
nbor+=nbor_stride; \
list_end=nbor+fast_mul(nbor_stride,numj); \
nbor+=fast_mul(offset,nbor_stride); \
i, numj, stride, nbor_end, nbor_begin) \
i=nbor_mem[ii]; \
nbor_begin=ii+nbor_stride; \
numj=nbor_mem[nbor_begin]; \
nbor_begin+=nbor_stride; \
nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \
nbor_begin+=fast_mul(offset,nbor_stride); \
stride=fast_mul(t_per_atom,nbor_stride);
#if (ARCH < 300)

View File

@ -41,20 +41,19 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
const __global int *nbor=dev_ij+ii;
int i=*nbor;
int i=dev_ij[ii];
int nbor=ii+nbor_pitch;
int numj=dev_ij[nbor];
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
int nbor_end=nbor+fast_mul(numj,nbor_pitch);
int packed=ii+nbor_pitch+nbor_pitch;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=dev_ij[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
@ -70,7 +69,7 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
rsq+=t*t;
if (rsq<cf.x) {
*packed=j;
dev_nbor[packed]=j;
packed+=nbor_pitch;
newj++;
}
@ -105,21 +104,20 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor=dev_ij+ii;
int i=*nbor;
int i=dev_ij[ii];
int nbor=ii+nbor_pitch;
int numj=dev_ij[nbor];
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
int nbor_end=nbor+fast_mul(numj,nbor_pitch);
int packed=ii+nbor_pitch+nbor_pitch;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=dev_ij[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
@ -135,7 +133,7 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*packed=j;
dev_nbor[packed]=j;
packed+=nbor_pitch;
newj++;
}

View File

@ -51,19 +51,19 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -138,20 +138,20 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -48,7 +48,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -65,9 +65,9 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -119,7 +119,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *nbor_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
@ -140,7 +140,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -52,7 +52,7 @@ int gb_gpu_init(const int ntypes, const double gamma,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -71,9 +71,9 @@ int gb_gpu_init(const int ntypes, const double gamma,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *nbor_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
@ -68,7 +68,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -276,19 +276,19 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_ij[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -371,20 +371,20 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_ij[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -28,7 +28,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global numtyp *restrict sp_lj,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
@ -38,12 +38,6 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -52,19 +46,18 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -142,20 +135,19 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -52,19 +52,19 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -143,20 +143,20 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -48,7 +48,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -65,9 +65,9 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -65,18 +65,18 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -187,19 +187,19 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -51,7 +51,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -69,9 +69,9 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -65,18 +65,18 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -178,19 +178,19 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -66,18 +66,18 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -186,19 +186,19 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -51,7 +51,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -69,9 +69,9 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -50,7 +50,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -68,9 +68,9 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -80,7 +80,7 @@ int LJCoulLongT::init(const int ntypes,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
@ -102,6 +102,23 @@ int LJCoulLongT::init(const int ntypes,
return 0;
}
template <class numtyp, class acctyp>
void LJCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double **host_cut_ljsq) {
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0;
this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
}
template <class numtyp, class acctyp>
void LJCoulLongT::clear() {
if (!_allocated)

View File

@ -65,18 +65,18 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -183,19 +183,19 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -46,6 +46,11 @@ class LJCoulLong : public BaseCharge<numtyp, acctyp> {
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double **host_cut_ljsq);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

View File

@ -51,7 +51,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -69,9 +69,9 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
@ -93,6 +93,29 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
return init_ok;
}
// ---------------------------------------------------------------------------
// Copy updated coeffs from host to device
// ---------------------------------------------------------------------------
void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double **host_cut_ljsq) {
int world_me=LJCLMF.device->world_me();
int gpu_rank=LJCLMF.device->gpu_rank();
int procs_per_gpu=LJCLMF.device->procs_per_gpu();
if (world_me==0)
LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, host_cut_ljsq);
LJCLMF.device->world_barrier();
for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0)
LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, host_cut_ljsq);
LJCLMF.device->gpu_barrier();
}
}
void ljcl_gpu_clear() {
LJCLMF.clear();
}

View File

@ -118,11 +118,11 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -130,8 +130,8 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
numtyp cut_coul = ucl_sqrt(cut_coulsq);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
@ -239,11 +239,11 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -252,8 +252,8 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
numtyp cut_coul = ucl_sqrt(cut_coulsq);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];

View File

@ -51,7 +51,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -70,9 +70,9 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -68,11 +68,11 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -84,8 +84,8 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul, r, prefactor, erfcc;
factor_lj = sp_lj[sbmask(j)];
@ -195,11 +195,11 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
@ -212,8 +212,8 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul, r, prefactor, erfcc;
factor_lj = sp_lj[sbmask(j)];

View File

@ -52,7 +52,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -71,9 +71,9 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -54,19 +54,19 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -147,20 +147,20 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -49,7 +49,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -66,9 +66,9 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -48,7 +48,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -65,9 +65,9 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -55,19 +55,18 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -161,20 +160,19 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -52,19 +52,19 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -143,20 +143,20 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -50,7 +50,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -68,9 +68,9 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -54,19 +54,19 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -145,20 +145,20 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -49,7 +49,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -66,9 +66,9 @@ int mor_gpu_init(const int ntypes, double **cutsq,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -65,6 +65,11 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
else
_alloc_packed=false;
if (pre_cut)
_packed_permissions=UCL_READ_WRITE;
else
_packed_permissions=UCL_READ_ONLY;
bool success=true;
// Initialize timers for the selected GPU
@ -121,7 +126,7 @@ void Neighbor::alloc(bool &success) {
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_packed_permissions)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes();
}
if (_max_host>0) {

View File

@ -227,6 +227,7 @@ class Neighbor {
int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
bool _gpu_host, _alloc_packed;
double _cutoff, _cell_size, _bin_time;
enum UCL_MEMOPT _packed_permissions;
double _gpu_bytes, _c_bytes, _cell_bytes;
void alloc(bool &success);

View File

@ -25,17 +25,17 @@ __kernel void kernel_unpack(__global int *dev_nbor,
int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
if (ii<inum) {
__global int *nbor=dev_nbor+ii+inum;
int numj=*nbor;
int nbor=ii+inum;
int numj=dev_nbor[nbor];
nbor+=inum;
const __global int *list=dev_ij+*nbor;
const __global int *list_end=list+numj;
int list=dev_nbor[nbor];
int list_end=list+numj;
list+=offset;
nbor+=fast_mul(ii,t_per_atom-1)+offset;
int stride=fast_mul(t_per_atom,inum);
for ( ; list<list_end; list++) {
*nbor=*list;
dev_nbor[nbor]=dev_ij[list];
nbor+=stride;
}
} // if ii

View File

@ -160,7 +160,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
// Allocate error flags for checking out of bounds atoms
success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
UCL_WRITE_ONLY)==UCL_SUCCESS);
UCL_READ_WRITE)==UCL_SUCCESS);
if (!success) {
flag=-3;
return 0;

View File

@ -51,7 +51,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -69,9 +69,9 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -333,6 +333,54 @@ inline double shfl_xor(double var, int laneMask, int width) {
#endif
// -------------------------------------------------------------------------
// INTEL CPU OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef INTEL_OCL
#define USE_OPENCL
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 1
#define MAX_SHARED_TYPES 0
#define BLOCK_NBOR_BUILD 4
#define BLOCK_BIO_PAIR 2
#define BLOCK_ELLIPSE 2
#define WARP_SIZE 1
#define PPPM_BLOCK_1D 32
#define BLOCK_CELL_2D 1
#define BLOCK_CELL_ID 2
#define MAX_BIO_SHARED_TYPES 0
#endif
// -------------------------------------------------------------------------
// INTEL PHI OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef PHI_OCL
#define USE_OPENCL
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 16
#define MAX_SHARED_TYPES 0
#define BLOCK_NBOR_BUILD 16
#define BLOCK_BIO_PAIR 16
#define BLOCK_ELLIPSE 16
#define WARP_SIZE 1
#define PPPM_BLOCK_1D 32
#define BLOCK_CELL_2D 4
#define BLOCK_CELL_ID 16
#define MAX_BIO_SHARED_TYPES 0
#endif
// -------------------------------------------------------------------------
// GENERIC OPENCL DEFINITIONS
// -------------------------------------------------------------------------
@ -433,7 +481,9 @@ inline double shfl_xor(double var, int laneMask, int width) {
// ARCHITECTURE INDEPENDENT DEFINITIONS
// -------------------------------------------------------------------------
#ifndef PPPM_MAX_SPLINE
#define PPPM_MAX_SPLINE 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double

View File

@ -74,7 +74,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *nbor_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
@ -125,7 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -50,7 +50,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -68,9 +68,9 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -172,7 +172,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *nbor_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
@ -199,7 +199,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -424,7 +424,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *nbor_end;
int nbor, nbor_end;
int j, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
@ -435,7 +435,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int i=*nbor;
int i=dev_nbor[nbor];
factor_lj = sp_lj[sbmask(i)];
i &= NEIGHMASK;
@ -616,19 +616,19 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_ij[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -710,20 +710,20 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_ij[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -53,19 +53,19 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -141,20 +141,20 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -48,7 +48,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -65,9 +65,9 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -156,18 +156,18 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -195,17 +195,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
numtyp sw_cut=sw3_ijparam.x;
numtyp sw_cutsq=sw3_ijparam.y;
numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
ucl_powr(sw_sigma,sw_powerp);
pow(sw_sigma,sw_powerp);
numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
ucl_powr(sw_sigma,sw_powerq);
pow(sw_sigma,sw_powerq);
numtyp pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
ucl_powr(sw_sigma,sw_powerp+(numtyp)1.0);
pow(sw_sigma,sw_powerp+(numtyp)1.0);
numtyp pre_sw_c4=sw_biga*sw_epsilon*
ucl_powr(sw_sigma,sw_powerq+(numtyp)1.0);
pow(sw_sigma,sw_powerq+(numtyp)1.0);
numtyp pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
ucl_powr(sw_sigma,sw_powerp);
pow(sw_sigma,sw_powerp);
numtyp pre_sw_c6=sw_biga*sw_epsilon*
ucl_powr(sw_sigma,sw_powerq);
pow(sw_sigma,sw_powerq);
numtyp r=ucl_sqrt(rsq);
numtyp rp=ucl_powr(r,-sw_powerp);
@ -343,6 +343,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const int t_per_atom, const int evatom) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
@ -359,21 +360,20 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor_j, *list_end;
int i, numj;
int i, numj, nbor_j, nbor_end;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,list_end,nbor_j);
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=*nbor_j;
int j=dev_packed[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -392,15 +392,17 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
const __global int *nbor_k=nbor_j-offset_j+offset_k;
int nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
for ( ; nbor_k<list_end; nbor_k+=n_stride) {
int k=*nbor_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
k &= NEIGHMASK;
numtyp4 kx; fetch4(kx,k,pos_tex);
@ -415,11 +417,15 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@ -466,6 +472,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const int t_per_atom) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
@ -482,20 +489,19 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor_j, *list_end, *k_end;
int i, numj;
int i, numj, nbor_j, nbor_end, k_end;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,list_end,nbor_j);
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
int j=*nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -513,27 +519,27 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
if (rsq1 > sw3_ijparam.y) continue;
int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype];
numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex);
numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex);
sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_jiparam.x;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
int numk=*nbor_k;
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
if (dev_nbor==dev_packed) {
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k+=nbor_pitch;
nbor_k=dev_packed+*nbor_k;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=*nbor_k;
int k=dev_packed[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -541,25 +547,29 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype];
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
numtyp delr2x = kx.x - jx.x;
numtyp delr2y = kx.y - jx.y;
numtyp delr2z = kx.z - jx.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
numtyp4 sw3_jkparam; fetch4(sw3_jkparam,jkparam,sw3_tex);
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
if (rsq2 < sw3_jkparam.y) {
numtyp4 sw1_jkparam; fetch4(sw1_jkparam,jkparam,sw1_tex);
sw_sigma_gamma_ik=sw1_jkparam.y*sw1_jkparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_jkparam.x;
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int jikparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype];
numtyp4 sw1_jikparam; fetch4(sw1_jikparam,jikparam,sw1_tex);
sw_lambda_epsilon_ijk=sw1_jikparam.x*sw1_jikparam.z; //sw_lambda*sw_epsilon;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_jikparam; fetch4(sw3_jikparam,jikparam,sw3_tex);
sw_costheta_ijk=sw3_jikparam.z;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz;
//if (evatom==0) {
@ -602,6 +612,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const int t_per_atom) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
@ -618,20 +629,19 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor_j, *list_end, *k_end;
int i, numj;
int i, numj, nbor_j, nbor_end, k_end;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,list_end,nbor_j);
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
int j=*nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -649,27 +659,27 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
if (rsq1 > sw3_ijparam.y) continue;
int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype];
numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex);
numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex);
sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_jiparam.x;
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
int numk=*nbor_k;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
if (dev_nbor==dev_packed) {
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k+=nbor_pitch;
nbor_k=dev_packed+*nbor_k;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=*nbor_k;
int k=dev_packed[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -677,25 +687,29 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype];
numtyp4 sw3_jkparam; fetch4(sw3_jkparam,jkparam,sw3_tex);
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
numtyp delr2x = kx.x - jx.x;
numtyp delr2y = kx.y - jx.y;
numtyp delr2z = kx.z - jx.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw3_jkparam.y) {
numtyp4 sw1_jkparam; fetch4(sw1_jkparam,jkparam,sw1_tex);
sw_sigma_gamma_ik=sw1_jkparam.y*sw1_jkparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_jkparam.x;
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int jikparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype];
numtyp4 sw1_jikparam; fetch4(sw1_jikparam,jikparam,sw1_tex);
sw_lambda_epsilon_ijk=sw1_jikparam.x*sw1_jikparam.z; //sw_lambda*sw_epsilon;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_jikparam; fetch4(sw3_jikparam,jikparam,sw3_tex);
sw_costheta_ijk=sw3_jikparam.z;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz, fkx, fky, fkz;
threebody(delr1x,delr1y,delr1z,eflag,energy);

View File

@ -73,19 +73,19 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
int tlm1 = tablength - 1;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -171,20 +171,20 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
int tlm1 = tablength - 1;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -269,19 +269,19 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
int tlm1 = tablength - 1;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -371,20 +371,20 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
int tlm1 = tablength - 1;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -473,19 +473,19 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
int tlm1 = tablength - 1;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -582,20 +582,20 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
int tlm1 = tablength - 1;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -693,19 +693,19 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
int tlm1 = tablength - 1;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -800,20 +800,20 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
int tlm1 = tablength - 1;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -48,7 +48,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -65,9 +65,9 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -51,19 +51,19 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -140,20 +140,20 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -58,20 +58,20 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp radi; fetch(radi,i,rad_tex);
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -150,11 +150,11 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp radi; fetch(radi,i,rad_tex);
@ -162,9 +162,9 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

View File

@ -48,7 +48,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -65,9 +65,9 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}

View File

@ -48,7 +48,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
@ -65,9 +65,9 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}