git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8810 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2012-09-21 15:57:23 +00:00
parent 8bcf139b73
commit 209999068c
53 changed files with 1733 additions and 1081 deletions

View File

@ -87,7 +87,7 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \
$(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \
$(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \
$(OBJ_DIR)/buck_coul_wolf.cubin $(OBJ_DIR)/buck_coul_wolf_cubin.h \
$(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul_cubin.h \
$(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \
$(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \
$(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \

View File

@ -64,16 +64,11 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \
$(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \
$(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \
$(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
$(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
$(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
$(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
$(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
$(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
$(OBJ_DIR)/born_cl.h $(OBJ_DIR)/born_coul_wolf_cl.h \
$(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \
$(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \
$(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -131,11 +126,11 @@ $(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp
$(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h
$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;
$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;
$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;
$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;
$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cl.h $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(OCL) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR)
@ -143,11 +138,11 @@ $(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/g
$(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp
$(OCL) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;
$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;
$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;
$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;
$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cl.h $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(OCL) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR)

View File

@ -17,8 +17,10 @@
#include "lal_preprocessor.h"
#endif
__kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x,
__global int *type, const int nall) {
__kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
const __global double *restrict x,
const __global int *restrict type,
const int nall) {
int ii=GLOBAL_ID_X;
if (ii<nall) {

View File

@ -41,6 +41,8 @@
nbor+=offset; \
}
#if (ARCH < 300)
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
@ -137,3 +139,72 @@
ans[ii]=f; \
}
#else
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
energy += shfl_xor(energy, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
}
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
energy += shfl_xor(energy, s, t_per_atom); \
e_coul += shfl_xor(e_coul, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
*engv=e_coul; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
}
#endif

View File

@ -132,7 +132,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist,
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return false;
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());

View File

@ -135,7 +135,7 @@ int * BaseChargeT::reset_nbors(const int nall, const int inum, int *ilist,
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return false;
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());

View File

@ -137,7 +137,7 @@ int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist,
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return false;
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());

View File

@ -80,7 +80,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);
// Initialize host-device load balancer
@ -118,9 +118,8 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
ans->force.zero();
// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
return 0;
else return -3;
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS)
return -3;
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

View File

@ -24,14 +24,18 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2,
__global numtyp2 *cutsq_sigma,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_born(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const __global numtyp2 *restrict cutsq_sigma,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -49,7 +53,7 @@ __kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -108,14 +112,17 @@ __kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp2 *cutsq_sigma,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_born_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp2 *restrict cutsq_sigma,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -140,7 +147,7 @@ __kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -29,14 +29,19 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
__kernel void k_born_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
@ -61,7 +66,7 @@ __kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -143,14 +148,18 @@ __kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
@ -178,7 +187,7 @@ __kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -31,14 +31,19 @@ texture<int2> q_tex;
#define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
__kernel void k_born_wolf(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) {
@ -64,7 +69,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -74,7 +79,8 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
int itype=ix.w;
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
@ -83,7 +89,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -98,7 +104,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
int mtype=itype*lj_types+jtype;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp r2inv = ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
@ -113,14 +119,14 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv);
numtyp arij = alf * r;
numtyp expm2 = ucl_exp(-arij*arij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
numtyp erfcd = ucl_exp(-arij*arij);
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
v_sh = (_erfc - e_shift*r)*prefactor;
numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
forcecoul = prefactor * (dvdrr*rsq-factor_coul);
erfcc = erfc(arij);
v_sh = (erfcc - e_shift*r)*prefactor;
numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
forcecoul = prefactor * dvdrr*rsq*factor_coul;
} else forcecoul = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
@ -131,8 +137,8 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(v_sh-factor_coul);
if (rsq < coeff1[mtype].w) {
e_coul += v_sh*factor_coul;
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
@ -154,14 +160,18 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) {
@ -190,7 +200,7 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -201,7 +211,8 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
@ -210,7 +221,7 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -224,11 +235,11 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
if (rsq<cutsq_sigma[mtype].x) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
if (rsq < coeff1[mtype].w) {
if (rsq < cutsq_sigma[mtype].y) {
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
@ -237,16 +248,16 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
} else forceborn = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv);
numtyp r=ucl_sqrt(rsq);
numtyp arij = alf * r;
numtyp expm2 = ucl_exp(-arij*arij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
numtyp erfcd = ucl_exp(-arij*arij);
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
v_sh = (_erfc - e_shift*r)*prefactor;
numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
forcecoul = prefactor * (dvdrr*rsq-factor_coul);
erfcc = erfc(arij);
v_sh = (erfcc - e_shift*r)*prefactor;
numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
forcecoul = prefactor * dvdrr*rsq*factor_coul;
} else forcecoul = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
@ -257,8 +268,8 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(v_sh-factor_coul);
if (rsq < coeff1[mtype].w) {
e_coul += v_sh*factor_coul;
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);

View File

@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_buck(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -48,7 +52,7 @@ __kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -106,13 +110,16 @@ __kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_buck_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -137,7 +144,7 @@ __kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -29,15 +29,20 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp4 *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
__kernel void k_buck_coul(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
const __global numtyp4 *restrict cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -60,7 +65,7 @@ __kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -136,15 +141,19 @@ __kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict _cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -172,7 +181,7 @@ __kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -29,16 +29,21 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *cutsq,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp *restrict cutsq,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -61,7 +66,7 @@ __kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -144,16 +149,21 @@ __kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *cutsq,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp *restrict cutsq,
const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -179,7 +189,7 @@ __kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coef
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -48,7 +52,7 @@ __kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -111,13 +115,16 @@ __kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -142,7 +149,7 @@ __kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -29,15 +29,20 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -60,7 +65,7 @@ __kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -148,15 +153,19 @@ __kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -181,7 +190,7 @@ __kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -29,16 +29,21 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq, const int t_per_atom) {
__kernel void k_charmm_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -61,7 +66,7 @@ __kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -152,17 +157,21 @@ __kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const int t_per_atom) {
__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict ljd_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -170,7 +179,8 @@ __kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
ljd[tid]=ljd_in[tid];
if (tid<MAX_BIO_SHARED_TYPES)
ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
@ -185,7 +195,7 @@ __kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -24,16 +24,19 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in,
__global numtyp4* colloid1,
__global numtyp4* colloid2,
__global int *form,
__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
__kernel void k_colloid(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global numtyp4 *restrict colloid1,
const __global numtyp4 *restrict colloid2,
const __global int *form,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -52,7 +55,7 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -98,9 +101,10 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
K[3] *= K[3]*K[3];
K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = 4.0/15.0*fR *
(2.0*(K[1]+K[2]) * (K[1]*(5.0*K[1]+22.0*K[2])+5.0*K[4]) *
colloid2[mtype].w/K[6]-5.0) / K[0];
force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) *
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE
r = ucl_sqrt(rsq);
@ -131,8 +135,9 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r;
}
@ -149,7 +154,8 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +
(numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
}
energy+=factor_lj*(e-lj3[mtype].z);
}
@ -169,15 +175,17 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_colloid_fast(__global numtyp4 *x_,
__global numtyp4 *lj1_in,
__global numtyp4 *lj3_in,
__global numtyp *sp_lj_in,
__global numtyp4 *colloid1_in,
__global numtyp4 *colloid2_in,
__global int *form_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global numtyp4 *restrict colloid1_in,
const __global numtyp4 *restrict colloid2_in,
const __global int *form_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
@ -210,7 +218,7 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -257,7 +265,8 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
((numtyp)2.0*(K[1]+K[2]) *
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE
@ -289,7 +298,8 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r;
} else force = (numtyp)0.0;
@ -304,10 +314,12 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
} else if (form[mtype]==1) {
e=(numtyp)2.0/(numtyp)9.0*fR *
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
colloid2[mtype].w/K[6]);
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+
(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
}
energy+=factor_lj*(e-lj3[mtype].z);
}

View File

@ -31,12 +31,16 @@ texture<int2> q_tex;
#define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
@ -58,7 +62,7 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -66,6 +70,12 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
@ -91,8 +101,8 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
force = forcecoul * r2inv;
@ -122,11 +132,15 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
} // if ii
}
__kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
@ -148,13 +162,19 @@ __kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
@ -181,8 +201,8 @@ __kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
force = forcecoul * r2inv;

View File

@ -29,13 +29,112 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_cl_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
#if (ARCH < 300)
#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[6][BLOCK_PAIR]; \
\
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=e_coul; \
\
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<4; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
\
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
e_coul=red_acc[3][tid]; \
\
if (vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
\
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
\
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
} \
} \
\
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1=(acctyp)0; \
ap1+=inum; \
*ap1=e_coul; \
ap1+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
ap1+=inum; \
} \
} \
ans[ii]=f; \
}
#else
#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
e_coul += shfl_xor(e_coul, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1=(acctyp)0; \
ap1+=inum; \
*ap1=e_coul; \
ap1+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
ap1+=inum; \
} \
} \
ans[ii]=f; \
}
#endif
__kernel void k_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
@ -55,7 +154,7 @@ __kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -109,71 +208,22 @@ __kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
e_coul=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=(acctyp)0;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
}
store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_cl_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_cl_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
@ -193,7 +243,7 @@ __kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -247,61 +297,8 @@ __kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
e_coul=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=(acctyp)0;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
}
store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

View File

@ -580,14 +580,14 @@ int DeviceT::compile_kernels() {
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;
UCL_Vector<int,int> gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
UCL_Vector<int,int> gpu_lib_data(15,*gpu,UCL_NOT_PINNED);
k_info.set_size(1,1);
k_info.run(&gpu_lib_data);
gpu_lib_data.update_host(false);
_ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
if (_ptx_arch>gpu->arch())
if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch()))
return -4;
#endif
@ -606,6 +606,7 @@ int DeviceT::compile_kernels() {
_block_nbor_build=gpu_lib_data[10];
_block_bio_pair=gpu_lib_data[11];
_max_bio_shared_types=gpu_lib_data[12];
_block_ellipse=gpu_lib_data[14];
if (static_cast<size_t>(_block_pair)>gpu->group_size())
_block_pair=gpu->group_size();

View File

@ -17,7 +17,8 @@
#include "lal_preprocessor.h"
#endif
__kernel void kernel_zero(__global int *mem, int numel) {
__kernel void kernel_zero(__global int *restrict mem,
int numel) {
int ii=GLOBAL_ID_X;
if (ii<numel)
@ -39,4 +40,5 @@ __kernel void kernel_info(__global int *info) {
info[11]=BLOCK_BIO_PAIR;
info[12]=MAX_BIO_SHARED_TYPES;
info[13]=THREADS_PER_CHARGE;
info[14]=BLOCK_ELLIPSE;
}

View File

@ -228,6 +228,8 @@ class Device {
inline int block_nbor_build() const { return _block_nbor_build; }
/// Return the block size for "bio" pair styles
inline int block_bio_pair() const { return _block_bio_pair; }
/// Return the block size for "ellipse" pair styles
inline int block_ellipse() const { return _block_ellipse; }
/// Return the maximum number of atom types for shared mem with "bio" styles
inline int max_bio_shared_types() const { return _max_bio_shared_types; }
/// Architecture gpu code compiled for (returns 0 for OpenCL)
@ -292,7 +294,7 @@ class Device {
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;
int _block_pair, _max_shared_types;
int _block_pair, _block_ellipse, _max_shared_types;
int _block_cell_2d, _block_cell_id, _block_nbor_build;
int _block_bio_pair, _max_bio_shared_types;

View File

@ -15,6 +15,23 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#define mu_tex mu_
#endif
#if (ARCH < 300)
#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
@ -73,32 +90,63 @@
ans[ii+inum]=tor; \
}
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
tor.x += shfl_xor(tor.x, s, t_per_atom); \
tor.y += shfl_xor(tor.y, s, t_per_atom); \
tor.z += shfl_xor(tor.z, s, t_per_atom); \
energy += shfl_xor(energy, s, t_per_atom); \
e_coul += shfl_xor(e_coul, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
*engv=e_coul; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
ans[ii+inum]=tor; \
}
#endif
#else
#define pos_tex x_
#define q_tex q_
#define mu_tex mu_
#endif
__kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp4 *mu_,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
__kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict mu_,
const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -125,7 +173,7 @@ __kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -291,16 +339,20 @@ __kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict mu_,
const __global numtyp *restrict _cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -332,7 +384,7 @@ __kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -16,6 +16,24 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#define mu_tex mu_
#endif
#if (ARCH < 300)
#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
@ -73,32 +91,63 @@
ans[ii+inum]=tor; \
}
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
tor.x += shfl_xor(tor.x, s, t_per_atom); \
tor.y += shfl_xor(tor.y, s, t_per_atom); \
tor.z += shfl_xor(tor.z, s, t_per_atom); \
energy += shfl_xor(energy, s, t_per_atom); \
e_coul += shfl_xor(e_coul, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
*engv=e_coul; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
ans[ii+inum]=tor; \
}
#endif
#else
#define pos_tex x_
#define q_tex q_
#define mu_tex mu_
#endif
__kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp4 *mu_,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
const __global numtyp4 *restrict mu_,
const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -125,7 +174,7 @@ __kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -322,15 +371,20 @@ __kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *_cutsq, const numtyp qqrd2e,
__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict mu_,
const __global numtyp *restrict _cutsq,
const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -363,7 +417,7 @@ __kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -46,10 +46,11 @@ template <class numtyp, class acctyp>
int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
int **host_type2z2r, int *host_type2frho,
double ***host_rhor_spline, double ***host_z2r_spline,
double ***host_frho_spline, double rdr, double rdrho, int nrhor,
int nrho, int nz2r, int nfrho, int nr, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *_screen)
double ***host_frho_spline, double rdr, double rdrho,
double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen)
{
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
@ -97,6 +98,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
_cutforcesq=host_cutforcesq;
_rdr=rdr;
_rdrho = rdrho;
_rhomax=rhomax;
_nrhor=nrhor;
_nrho=nrho;
_nz2r=nz2r;
@ -468,15 +470,15 @@ void EAMT::loop(const bool _eflag, const bool _vflag) {
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_fp, &this->ans->engv, &eflag, &ainum,
&nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho,
&_nrho, &_nr, &this->_threads_per_atom);
&_rhomax, &_nrho, &_nr, &this->_threads_per_atom);
} else {
this->k_energy.set_size(GX,BX);
this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho,
&rhor_spline2, &frho_spline1, &frho_spline2,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp,
&this->ans->engv,&eflag, &ainum, &nbor_pitch,
&_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr,
&this->_threads_per_atom);
&_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho,
&_nr, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -52,8 +52,10 @@ texture<int4> z2r_sp2_tex;
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define MAX(A,B) ((A) > (B) ? (A) : (B))
#if (ARCH < 300)
#define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \
eflag,vflag,engv,rdrho,nrho,i) \
eflag,vflag,engv,rdrho,nrho,i,rhomax) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=rho; \
@ -76,10 +78,11 @@ texture<int4> z2r_sp2_tex;
if (eflag>0) { \
fetch4(coeff,index,frho_sp2_tex); \
energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
if (rho > rhomax) energy += fp*(rho-rhomax); \
engv[ii]=(acctyp)2.0*energy; \
} \
}
#define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \
offset, elag, vflag, ans, engv) \
if (t_per_atom>1) { \
@ -125,18 +128,80 @@ texture<int4> z2r_sp2_tex;
ans[ii]=f; \
}
__kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
__global int *type2frho,
__global numtyp4 *rhor_spline2,
__global numtyp4 *frho_spline1,
__global numtyp4 *frho_spline2,
__global int *dev_nbor, __global int *dev_packed,
__global numtyp *fp_, __global acctyp *engv,
const int eflag, const int inum,
const int nbor_pitch, const int ntypes,
const numtyp cutforcesq, const numtyp rdr,
const numtyp rdrho, const int nrho, const int nr,
const int t_per_atom) {
#else
#define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \
eflag,vflag,engv,rdrho,nrho,i,rhomax) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) \
rho += shfl_xor(rho, s, t_per_atom); \
} \
if (offset==0) { \
numtyp p = rho*rdrho + (numtyp)1.0; \
int m=p; \
m = MAX(1,MIN(m,nrho-1)); \
p -= m; \
p = MIN(p,(numtyp)1.0); \
int index = type2frho[itype]*(nrho+1)+m; \
numtyp4 coeff; fetch4(coeff,index,frho_sp1_tex); \
numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z; \
fp_[i]=fp; \
if (eflag>0) { \
fetch4(coeff,index,frho_sp2_tex); \
energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
if (rho > rhomax) energy += fp*(rho-rhomax); \
engv[ii]=(acctyp)2.0*energy; \
} \
}
#define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \
offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
energy += shfl_xor(energy, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
}
#endif
__kernel void k_energy(const __global numtyp4 *restrict x_,
const __global int2 *restrict type2rhor_z2r,
const __global int *restrict type2frho,
const __global numtyp4 *restrict rhor_spline2,
const __global numtyp4 *restrict frho_spline1,
const __global numtyp4 *restrict frho_spline2,
const __global int *dev_nbor,
const __global int *dev_packed,
__global numtyp *restrict fp_,
__global acctyp *restrict engv,
const int eflag, const int inum, const int nbor_pitch,
const int ntypes, const numtyp cutforcesq,
const numtyp rdr, const numtyp rdrho,
const numtyp rhomax, const int nrho,
const int nr, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -144,7 +209,7 @@ __kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
acctyp energy = (acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -180,24 +245,26 @@ __kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
} // for nbor
store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
eflag,vflag,engv,rdrho,nrho,i);
eflag,vflag,engv,rdrho,nrho,i,rhomax);
} // if ii
}
__kernel void k_energy_fast(__global numtyp4 *x_,
__global int2 *type2rhor_z2r_in,
__global int *type2frho_in,
__global numtyp4 *rhor_spline2,
__global numtyp4 *frho_spline1,
__global numtyp4 *frho_spline2,
__global int *dev_nbor,
__global int *dev_packed, __global numtyp *fp_,
__global acctyp *engv, const int eflag,
const int inum, const int nbor_pitch,
const int ntypes, const numtyp cutforcesq,
const numtyp rdr, const numtyp rdrho,
const int nrho, const int nr,
const int t_per_atom) {
__kernel void k_energy_fast(const __global numtyp4 *restrict x_,
const __global int2 *restrict type2rhor_z2r_in,
const __global int *restrict type2frho_in,
const __global numtyp4 *restrict rhor_spline2,
const __global numtyp4 *restrict frho_spline1,
const __global numtyp4 *restrict frho_spline2,
const __global int *dev_nbor,
const __global int *dev_packed,
__global numtyp *restrict fp_,
__global acctyp *restrict engv,
const int eflag, const int inum,
const int nbor_pitch, const int ntypes,
const numtyp cutforcesq, const numtyp rdr,
const numtyp rdrho, const numtyp rhomax,
const int nrho, const int nr,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -218,7 +285,7 @@ __kernel void k_energy_fast(__global numtyp4 *x_,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -254,22 +321,24 @@ __kernel void k_energy_fast(__global numtyp4 *x_,
} // for nbor
store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
eflag,vflag,engv,rdrho,nrho,i);
eflag,vflag,engv,rdrho,nrho,i,rhomax);
} // if ii
}
__kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
__global int2 *type2rhor_z2r,
__global numtyp4 *rhor_spline1,
__global numtyp4 *z2r_spline1,
__global numtyp4 *z2r_spline2,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int ntypes, const numtyp cutforcesq,
const numtyp rdr, const int nr,
const int t_per_atom) {
__kernel void k_eam(const __global numtyp4 *restrict x_,
const __global numtyp *fp_,
const __global int2 *type2rhor_z2r,
const __global numtyp4 *rhor_spline1,
const __global numtyp4 *z2r_spline1,
const __global numtyp4 *z2r_spline2,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *ans,
__global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int ntypes,
const numtyp cutforcesq, const numtyp rdr, const int nr,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -283,7 +352,7 @@ __kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -364,18 +433,19 @@ __kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
}
__kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
__global int2 *type2rhor_z2r_in,
__global numtyp4 *rhor_spline1,
__global numtyp4 *z2r_spline1,
__global numtyp4 *z2r_spline2,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const numtyp cutforcesq,
const numtyp rdr, const int nr,
const int t_per_atom) {
__kernel void k_eam_fast(const __global numtyp4 *x_,
const __global numtyp *fp_,
const __global int2 *type2rhor_z2r_in,
const __global numtyp4 *rhor_spline1,
const __global numtyp4 *z2r_spline1,
const __global numtyp4 *z2r_spline2,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *ans,
__global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const numtyp cutforcesq,
const numtyp rdr, const int nr, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -395,7 +465,7 @@ __kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -41,10 +41,10 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
int init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline,
double ***host_z2r_spline, double ***host_frho_spline, double rdr,
double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const double gpu_split,
FILE *_screen);
double rdrho, double rhomax, int nrhor, int nrho, int nz2r,
int nfrho, int nr, const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen);
// Copy charges to device asynchronously
inline void add_fp_data() {
@ -112,7 +112,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
UCL_D_Vec<numtyp4> frho_spline1, frho_spline2;
UCL_D_Vec<numtyp4> rhor_spline1, rhor_spline2;
numtyp _cutforcesq,_rdr,_rdrho;
numtyp _cutforcesq,_rdr,_rdrho, _rhomax;
int _nfrho,_nrhor,_nrho,_nz2r,_nr;

View File

@ -31,7 +31,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
double ***host_rhor_spline, double ***host_z2r_spline,
double ***host_frho_spline,
double rdr, double rdrho, int nrhor,
double rdr, double rdrho, double rhomax, int nrhor,
int nrho, int nz2r, int nfrho, int nr,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
@ -66,9 +66,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
if (world_me==0)
init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r,
host_type2frho, host_rhor_spline, host_z2r_spline,
host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho,
nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split,
screen);
host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r,
nfrho, nr, nlocal, nall, 300, maxspecial, cell_size,
gpu_split, screen);
EAMMF.device->world_barrier();
if (message)
@ -86,9 +86,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
if (gpu_rank==i && world_me!=0)
init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r,
host_type2frho, host_rhor_spline, host_z2r_spline,
host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho,
nr, nlocal, nall, 300, maxspecial, cell_size,
gpu_split, screen);
host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho,
nz2r, nfrho, nr, nlocal, nall, 300, maxspecial,
cell_size, gpu_split, screen);
EAMMF.device->gpu_barrier();
if (message)

View File

@ -19,7 +19,7 @@
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex, quat_tex;
#else
@ -30,11 +30,6 @@ texture<int4,1> pos_tex, quat_tex;
#define quat_tex qif
#endif
#define atom_info(t_per_atom, ii, tid, offset) \
tid=THREAD_ID_X; \
offset=tid & (t_per_atom-1); \
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, list_end, nbor) \
nbor=nbor_mem+ii; \
@ -42,55 +37,11 @@ texture<int4,1> pos_tex, quat_tex;
nbor+=nbor_stride; \
numj=*nbor; \
nbor+=nbor_stride; \
list_end=nbor+fast_mul(nbor_stride,numj); \
nbor+=fast_mul(offset,nbor_stride); \
list_end=nbor+fast_mul(nbor_stride,numj); \
nbor+=fast_mul(offset,nbor_stride); \
stride=fast_mul(t_per_atom,nbor_stride);
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[6][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=energy; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<4; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
energy=red_acc[3][tid]; \
if (vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
}
#if (ARCH < 300)
#define store_answers_t(f, tor, energy, virial, ii, astride, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
@ -195,6 +146,80 @@ texture<int4,1> pos_tex, quat_tex;
ans[ii]=old; \
}
#else
#define store_answers_t(f, tor, energy, virial, ii, astride, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
tor.x += shfl_xor(tor.x, s, t_per_atom); \
tor.y += shfl_xor(tor.y, s, t_per_atom); \
tor.z += shfl_xor(tor.z, s, t_per_atom); \
energy += shfl_xor(energy, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1=energy; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
ap1+=astride; \
} \
} \
ans[ii]=f; \
ans[ii+astride]=tor; \
}
#define acc_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
energy += shfl_xor(energy, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]; \
engv+=inum; \
} \
} \
acctyp4 old=ans[ii]; \
old.x+=f.x; \
old.y+=f.y; \
old.z+=f.z; \
ans[ii]=old; \
}
#endif
/* ----------------------------------------------------------------------
dot product of 2 vectors
------------------------------------------------------------------------- */

View File

@ -29,22 +29,24 @@ texture<int4,1> pos_tex;
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
const int ntypes, __global int *dev_nbor,
__kernel void kernel_nbor(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict cut_form,
const int ntypes,
__global int *dev_nbor,
const int nbor_pitch, const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
const __global int *dev_ij,
const int form_low, const int form_high) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
const __global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
@ -84,11 +86,13 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
__global int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
__kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict cut_form,
__global int *dev_nbor,
const int nbor_pitch, const int start,
const int inum,
const __global int *dev_ij,
const int form_low, const int form_high) {
int ii=THREAD_ID_X;
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@ -101,12 +105,12 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
const __global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];

View File

@ -24,12 +24,15 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
__kernel void k_gauss(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict gauss1,
const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -48,7 +51,7 @@ __kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -104,10 +107,13 @@ __kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
} // if ii
}
__kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict gauss1_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
@ -131,7 +137,7 @@ __kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -80,13 +80,20 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
__kernel void k_gayberne(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict q,
const __global numtyp4 *restrict shape,
const __global numtyp4 *restrict well,
const __global numtyp *restrict gum,
const __global numtyp2 *restrict sig_eps,
const int ntypes,
const __global numtyp *restrict lshape,
const __global int *dev_nbor,
const int stride,
__global acctyp4 *restrict ans,
const int astride,
__global acctyp *restrict engv,
__global int *restrict err_flag,
const int eflag, const int vflag, const int inum,
const int t_per_atom) {
int tid, ii, offset;
@ -112,7 +119,7 @@ __kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

View File

@ -17,15 +17,22 @@
#include "lal_ellipsoid_extra.h"
#endif
__kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
__global numtyp4 *q, __global numtyp4* shape,
__global numtyp4* well, __global numtyp *gum,
__global numtyp2* sig_eps, const int ntypes,
__global numtyp *lshape, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag,const int start,
const int inum, const int t_per_atom) {
__kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict q,
const __global numtyp4 *restrict shape,
const __global numtyp4 *restrict well,
const __global numtyp *restrict gum,
const __global numtyp2 *restrict sig_eps,
const int ntypes,
const __global numtyp *restrict lshape,
const __global int *dev_nbor,
const int stride,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global int *restrict err_flag,
const int eflag, const int vflag,
const int start, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -46,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -236,11 +243,16 @@ __kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
} // if ii
}
__kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum, const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
__kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict gum,
const int stride,
const __global int *dev_ij,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global int *restrict err_flag,
const int eflag, const int vflag, const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
@ -263,7 +275,7 @@ __kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -318,13 +330,18 @@ __kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start,
const int inum, const int t_per_atom) {
__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict gum,
const int stride,
const __global int *dev_ij,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global int *restrict err_flag,
const int eflag, const int vflag,
const int start, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -352,7 +369,7 @@ __kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_lj(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -48,7 +52,7 @@ __kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -103,13 +107,16 @@ __kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_lj_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -134,7 +141,7 @@ __kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_lj96(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -48,7 +52,7 @@ __kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -104,13 +108,16 @@ __kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -135,7 +142,7 @@ __kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -29,15 +29,20 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
__kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -60,7 +65,7 @@ __kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -141,15 +146,21 @@ __kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq,
const numtyp qqrd2e,
const numtyp g_ewald,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -175,7 +186,7 @@ __kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -29,15 +29,20 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
__kernel void k_lj_coul(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -60,7 +65,7 @@ __kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -132,15 +137,19 @@ __kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
__kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp *restrict _cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -168,7 +177,7 @@ __kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -29,16 +29,21 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp *cutsq, const numtyp qqrd2e,
const numtyp kappa,
const int t_per_atom) {
__kernel void k_lj_debye(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const numtyp kappa,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -61,7 +66,7 @@ __kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -139,16 +144,20 @@ __kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const numtyp kappa,
const int t_per_atom) {
__kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp *restrict _cutsq,
const numtyp qqrd2e, const numtyp kappa,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -176,7 +185,7 @@ __kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -29,15 +29,20 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -60,7 +65,7 @@ __kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -139,15 +144,19 @@ __kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -173,7 +182,7 @@ __kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -31,13 +31,18 @@ texture<int2> q_tex;
#define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
@ -63,7 +68,7 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -72,6 +77,12 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
@ -107,8 +118,8 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
} else
forcecoul = (numtyp)0.0;
@ -144,12 +155,17 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
@ -178,7 +194,7 @@ __kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -188,6 +204,12 @@ __kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
@ -222,8 +244,8 @@ __kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
} else
forcecoul = (numtyp)0.0;

View File

@ -26,12 +26,16 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
__kernel void k_lj_expand(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -50,7 +54,7 @@ __kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -108,11 +112,14 @@ __kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
@ -139,7 +146,7 @@ __kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -26,13 +26,17 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
__global numtyp2* mor2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_morse(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict mor1,
const __global numtyp2 *restrict mor2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -50,7 +54,7 @@ __kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -106,13 +110,16 @@ __kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
} // if ii
}
__kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
__global numtyp2* mor2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_morse_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict mor1_in,
const __global numtyp2 *restrict mor2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -137,7 +144,7 @@ __kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -17,7 +17,8 @@
#include "lal_preprocessor.h"
#endif
__kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
__kernel void kernel_unpack(__global int *dev_nbor,
const __global int *dev_ij,
const int inum, const int t_per_atom) {
int tid=THREAD_ID_X;
int offset=tid & (t_per_atom-1);
@ -27,8 +28,8 @@ __kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
__global int *nbor=dev_nbor+ii+inum;
int numj=*nbor;
nbor+=inum;
__global int *list=dev_ij+*nbor;
__global int *list_end=list+numj;
const __global int *list=dev_ij+*nbor;
const __global int *list_end=list+numj;
list+=offset;
nbor+=fast_mul(ii,t_per_atom-1)+offset;
int stride=fast_mul(t_per_atom,inum);

View File

@ -22,7 +22,9 @@ texture<float4> pos_tex;
texture<int4,1> pos_tex;
#endif
__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
__kernel void calc_cell_id(const numtyp4 *restrict pos,
unsigned *restrict cell_id,
int *restrict particle_id,
numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
numtyp i_cell_size, int ncellx, int ncelly,
int ncellz, int inum, int nall,
@ -62,8 +64,9 @@ __kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
}
}
__kernel void kernel_calc_cell_counts(unsigned *cell_id,
int *cell_counts, int nall, int ncell) {
__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
int *restrict cell_counts,
int nall, int ncell) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nall) {
int id = cell_id[idx];
@ -94,8 +97,9 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
__kernel void transpose(__global int *out, __global int *in, int columns_in,
int rows_in)
__kernel void transpose(__global int *restrict out,
const __global int *restrict in,
int columns_in, int rows_in)
{
__local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
@ -117,9 +121,9 @@ __kernel void transpose(__global int *out, __global int *in, int columns_in,
out[j*rows_in+i] = block[ti][tj];
}
__kernel void calc_neigh_list_cell(__global numtyp4 *x_,
__global int *cell_particle_id,
__global int *cell_counts,
__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
const __global int *restrict cell_particle_id,
const __global int *restrict cell_counts,
__global int *nbor_list,
__global int *host_nbor_list,
__global int *host_numj,
@ -234,8 +238,10 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
__kernel void kernel_special(__global int *dev_nbor,
__global int *host_nbor_list,
__global int *host_numj, __global int *tag,
__global int *nspecial, __global int *special,
const __global int *host_numj,
const __global int *restrict tag,
const __global int *restrict nspecial,
const __global int *restrict special,
int inum, int nt, int max_nbors, int t_per_atom) {
int tid=THREAD_ID_X;
int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);

View File

@ -41,15 +41,18 @@ texture<int2> q_tex;
// Number of pencils per block for charge spread
#define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE)
__kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
__kernel void particle_map(const __global numtyp4 *restrict x_,
const __global numtyp *restrict q_,
const grdtyp delvolinv, const int nlocal,
__global int *counts, __global grdtyp4 *ans,
__global int *restrict counts,
__global grdtyp4 *restrict ans,
const grdtyp b_lo_x, const grdtyp b_lo_y,
const grdtyp b_lo_z, const grdtyp delxinv,
const grdtyp delyinv, const grdtyp delzinv,
const int nlocal_x, const int nlocal_y,
const int nlocal_z, const int atom_stride,
const int max_atoms, __global int *error) {
const int max_atoms,
__global int *restrict error) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
@ -97,8 +100,10 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
/* --------------------------- */
__kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
__global grdtyp *brick, __global grdtyp *_rho_coeff,
__kernel void make_rho(const __global int *restrict counts,
const __global grdtyp4 *restrict atoms,
__global grdtyp *restrict brick,
const __global grdtyp *restrict _rho_coeff,
const int atom_stride, const int npts_x,
const int npts_y, const int npts_z, const int nlocal_x,
const int nlocal_y, const int nlocal_z,
@ -192,15 +197,17 @@ __kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
}
}
__kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
const int nlocal, __global grdtyp4 *brick,
__global grdtyp *_rho_coeff, const int npts_x,
const int npts_yx, const grdtyp b_lo_x,
__kernel void interp(const __global numtyp4 *restrict x_,
const __global numtyp *restrict q_,
const int nlocal,
const __global grdtyp4 *restrict brick,
const __global grdtyp *restrict _rho_coeff,
const int npts_x, const int npts_yx, const grdtyp b_lo_x,
const grdtyp b_lo_y, const grdtyp b_lo_z,
const grdtyp delxinv, const grdtyp delyinv,
const grdtyp delzinv, const int order,
const int order2, const grdtyp qqrd2e_scale,
__global acctyp4 *ans) {
__global acctyp4 *restrict ans) {
__local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
__local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
__local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];

View File

@ -56,8 +56,7 @@
// Definition: Default thread block size for "bio" pair styles
// MAX_BIO_SHARED_TYPES
// Definition: Max # of atom type params can be stored in shared memory
// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 &&
// MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR
// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2
//
//*************************************************************************/
@ -80,6 +79,7 @@
#define __kernel extern "C" __global__
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
@ -116,8 +116,22 @@
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 512
#define BLOCK_BIO_PAIR 512
#define BLOCK_ELLIPSE 256
#define MAX_SHARED_TYPES 11
#ifdef _SINGLE_SINGLE
#define shfl_xor __shfl_xor
#else
ucl_inline double shfl_xor(double var, int laneMask, int width) {
int2 tmp;
tmp.x = __double2hiint(var);
tmp.y = __double2loint(var);
tmp.x = __shfl_xor(tmp.x,laneMask,width);
tmp.y = __shfl_xor(tmp.y,laneMask,width);
return __hiloint2double(tmp.x,tmp.y);
}
#endif
#endif
#endif
@ -380,3 +394,7 @@ typedef struct _double4 double4;
#define NEIGHMASK 0x3FFFFFFF
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
#ifndef BLOCK_ELLIPSE
#define BLOCK_ELLIPSE BLOCK_PAIR
#endif

View File

@ -32,14 +32,20 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
return ans;
}
__kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
const int astride, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int inum,
__kernel void k_resquared(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict q,
const __global numtyp4 *restrict shape,
const __global numtyp4 *restrict well,
const __global numtyp *restrict splj,
const __global numtyp2 *restrict sig_eps,
const int ntypes,
const __global int *dev_nbor,
const int stride,
__global acctyp4 *restrict ans,
const int astride,
__global acctyp *restrict engv,
__global int *restrict err_flag,
const int eflag, const int vflag, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -68,7 +74,7 @@ __kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

View File

@ -17,15 +17,132 @@
#include "lal_ellipsoid_extra.h"
#endif
__kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
__global numtyp4 *q, __global numtyp4* shape,
__global numtyp4* well, __global numtyp *splj,
__global numtyp2* sig_eps, const int ntypes,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int t_per_atom) {
#if (ARCH < 300)
#define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[7][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=tor.x; \
red_acc[4][tid]=tor.y; \
red_acc[5][tid]=tor.z; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
tor.x=red_acc[3][tid]; \
tor.y=red_acc[4][tid]; \
tor.z=red_acc[5][tid]; \
if (eflag>0 || vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
red_acc[6][tid]=energy; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<7; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
energy=red_acc[6][tid]; \
} \
} \
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1+=energy; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1+=virial[i]; \
ap1+=astride; \
} \
} \
acctyp4 old=ans[ii]; \
old.x+=f.x; \
old.y+=f.y; \
old.z+=f.z; \
ans[ii]=old; \
old=ans[ii+astride]; \
old.x+=tor.x; \
old.y+=tor.y; \
old.z+=tor.z; \
ans[ii+astride]=old; \
}
#else
#define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
f.x += shfl_xor(f.x, s, t_per_atom); \
f.y += shfl_xor(f.y, s, t_per_atom); \
f.z += shfl_xor(f.z, s, t_per_atom); \
tor.x += shfl_xor(tor.x, s, t_per_atom); \
tor.y += shfl_xor(tor.y, s, t_per_atom); \
tor.z += shfl_xor(tor.z, s, t_per_atom); \
energy += shfl_xor(energy, s, t_per_atom); \
} \
if (vflag>0) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
for (int r=0; r<6; r++) \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
} \
} \
} \
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1+=energy; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1+=virial[i]; \
ap1+=astride; \
} \
} \
acctyp4 old=ans[ii]; \
old.x+=f.x; \
old.y+=f.y; \
old.z+=f.z; \
ans[ii]=old; \
old=ans[ii+astride]; \
old.x+=tor.x; \
old.y+=tor.y; \
old.z+=tor.z; \
ans[ii+astride]=old; \
}
#endif
__kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict q,
const __global numtyp4 *restrict shape,
const __global numtyp4 *restrict well,
const __global numtyp *restrict splj,
const __global numtyp2 *restrict sig_eps,
const int ntypes,
const __global int *dev_nbor,
const int stride,
__global acctyp4 *restrict ans,
const int astride,
__global acctyp *restrict engv,
__global int *restrict err_flag,
const int eflag, const int vflag,
const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -55,7 +172,7 @@ __kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -260,86 +377,26 @@ __kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
}
} // for nbor
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=astride;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
old=ans[ii+astride];
old.x+=tor.x;
old.y+=tor.y;
old.z+=tor.z;
ans[ii+astride]=old;
}
store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
__global numtyp4 *q, __global numtyp4* shape,
__global numtyp4* well, __global numtyp *splj,
__global numtyp2* sig_eps, const int ntypes,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag, const int vflag,
const int start, const int inum, const int t_per_atom) {
__kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict q,
const __global numtyp4 *restrict shape,
const __global numtyp4 *restrict well,
const __global numtyp *restrict splj,
const __global numtyp2 *restrict sig_eps,
const int ntypes,
const __global int *dev_nbor,
const int stride,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global int *restrict err_flag,
const int eflag, const int vflag,
const int start, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -366,7 +423,7 @@ __kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
const __global int *nbor, *nbor_end;
int j, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
n_stride,nbor_end,nbor);
@ -525,11 +582,16 @@ __kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
} // if ii
}
__kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum, const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
__kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict gum,
const int stride,
const __global int *dev_ij,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global int *restrict err_flag,
const int eflag, const int vflag, const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
@ -552,7 +614,7 @@ __kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -606,12 +668,18 @@ __kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void k_resquared_lj_fast(__global numtyp4 *x_,
__global numtyp4 *lj1_in, __global numtyp4* lj3_in,
__global numtyp *gum, const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag, const int vflag,
const int start, const int inum, const int t_per_atom) {
__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict gum,
const int stride,
const __global int *dev_ij,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global int *restrict err_flag,
const int eflag, const int vflag,
const int start, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -639,7 +707,7 @@ __kernel void k_resquared_lj_fast(__global numtyp4 *x_,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -39,18 +39,21 @@ typedef union {
/// ---------------- LOOKUP -------------------------------------------------
__kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
const int lj_types,
__global numtyp *cutsq,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
__kernel void k_table(const __global numtyp4 *restrict x_,
const __global int *restrict tabindex,
const __global numtyp4 *restrict coeff2,
const __global numtyp4 *restrict coeff3,
const __global numtyp4 *restrict coeff4,
const int lj_types,
const __global numtyp *restrict cutsq,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -70,7 +73,7 @@ __kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
int tlm1 = tablength - 1;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -130,17 +133,20 @@ __kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
__global numtyp *cutsq_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
__kernel void k_table_fast(const __global numtyp4 *restrict x_,
const __global int *restrict tabindex,
const __global numtyp4 *restrict coeff2,
const __global numtyp4 *restrict coeff3,
const __global numtyp4 *restrict coeff4,
const __global numtyp *restrict cutsq_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -164,7 +170,7 @@ __kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
int tlm1 = tablength - 1;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -227,18 +233,21 @@ __kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
/// ---------------- LINEAR -------------------------------------------------
__kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
const int lj_types,
__global numtyp *cutsq,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
__kernel void k_table_linear(const __global numtyp4 *restrict x_,
const __global int *restrict tabindex,
const __global numtyp4 *restrict coeff2,
const __global numtyp4 *restrict coeff3,
const __global numtyp4 *restrict coeff4,
const int lj_types,
const __global numtyp *restrict cutsq,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -258,7 +267,7 @@ __kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
int tlm1 = tablength - 1;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -322,17 +331,20 @@ __kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
__global numtyp *cutsq_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
const __global int *restrict tabindex,
const __global numtyp4 *restrict coeff2,
const __global numtyp4 *restrict coeff3,
const __global numtyp4 *restrict coeff4,
const __global numtyp *restrict cutsq_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, int tablength) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -356,7 +368,7 @@ __kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
int tlm1 = tablength - 1;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -423,18 +435,21 @@ __kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
/// ---------------- SPLINE -------------------------------------------------
__kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
const int lj_types,
__global numtyp *cutsq,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
__kernel void k_table_spline(const __global numtyp4 *restrict x_,
const __global int *restrict tabindex,
const __global numtyp4 *restrict coeff2,
const __global numtyp4 *restrict coeff3,
const __global numtyp4 *restrict coeff4,
const int lj_types,
const __global numtyp *restrict cutsq,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -454,7 +469,7 @@ __kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
int tlm1 = tablength - 1;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -525,17 +540,20 @@ __kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
__global numtyp *cutsq_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
__kernel void k_table_spline_fast(const __global numtyp4 *x_,
const __global int *tabindex,
const __global numtyp4* coeff2,
const __global numtyp4 *coeff3,
const __global numtyp4 *coeff4,
const __global numtyp *cutsq_in,
const __global numtyp* sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *ans,
__global acctyp *engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, int tablength) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -559,7 +577,7 @@ __kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
int tlm1 = tablength - 1;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -633,19 +651,23 @@ __kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
/// ---------------- BITMAP -------------------------------------------------
__kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
__global int *nshiftbits, __global int *nmask,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
const int lj_types,
__global numtyp *cutsq,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
__kernel void k_table_bitmap(const __global numtyp4 *x_,
const __global int *tabindex,
const __global int *nshiftbits,
const __global int *nmask,
const __global numtyp4* coeff2,
const __global numtyp4 *coeff3,
const __global numtyp4 *coeff4,
const int lj_types,
const __global numtyp *cutsq,
const __global numtyp* sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *ans,
__global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -665,7 +687,7 @@ __kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
int tlm1 = tablength - 1;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -732,18 +754,22 @@ __kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
__global int *nshiftbits, __global int *nmask,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
__global numtyp *cutsq_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
int tablength) {
__kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
const __global int *tabindex,
const __global int *nshiftbits,
const __global int *nmask,
const __global numtyp4* coeff2,
const __global numtyp4 *coeff3,
const __global numtyp4 *coeff4,
const __global numtyp *cutsq_in,
const __global numtyp* sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *ans,
__global acctyp *engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, int tablength) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -767,7 +793,7 @@ __kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
int tlm1 = tablength - 1;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -24,13 +24,16 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
const numtyp kappa, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_yukawa(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff,
const numtyp kappa, const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -48,7 +51,7 @@ __kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -105,12 +108,16 @@ __kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
} // if ii
}
__kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
const numtyp kappa, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff_in,
const numtyp kappa,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -132,7 +139,7 @@ __kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -30,7 +30,7 @@ extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp,acctyp>(),
_allocated(false), _max_rad_size(0) {
_max_rad_size(0), _allocated(false) {
}
template <class numtyp, class acctyp>

View File

@ -29,12 +29,16 @@ texture<int2> rad_tex;
#define rad_tex rad_
#endif
__kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
__global numtyp4 *coeff, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
const __global numtyp *restrict rad_,
const __global numtyp4 *restrict coeff,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
const numtyp kappa) {
int tid, ii, offset;
@ -54,7 +58,7 @@ __kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -113,13 +117,17 @@ __kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
} // if ii
}
__kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_,
__global numtyp4 *coeff_in, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
const numtyp kappa) {
__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict rad_,
const __global numtyp4 *restrict coeff_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, const numtyp kappa) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -141,7 +149,7 @@ __kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_,
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
const __global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);