git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11230 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2014-01-13 15:51:54 +00:00
parent 427e90c151
commit bbffbe06a2
10 changed files with 438 additions and 193 deletions

View File

@ -64,7 +64,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@ -109,7 +110,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
$(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h
$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
$(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h
all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@ -644,6 +646,18 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(O
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_gromacs.cubin: lal_lj_gromacs.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_gromacs.cu
$(OBJ_DIR)/lj_gromacs_cubin.h: $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs.cubin
$(BIN2C) -c -n lj_gromacs $(OBJ_DIR)/lj_gromacs.cubin > $(OBJ_DIR)/lj_gromacs_cubin.h
$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

View File

@ -53,7 +53,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@ -75,7 +76,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h
$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
$(OBJ_DIR)/lj_gromacs_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -460,6 +462,15 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_gromacs_cl.h: lal_lj_gromacs.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_gromacs $(PRE1_H) lal_lj_gromacs.cu $(OBJ_DIR)/lj_gromacs_cl.h;
$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

View File

@ -150,7 +150,7 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
csize-=_e_fields;
if (!vflag)
csize-=6;
if (csize>0)
engv.update_host(_inum*csize,true);
if (_rot)
@ -194,12 +194,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_ilist==NULL)
for (int i=vstart; i<iend; i++)
vatom[i][j]+=engv[i];
else
for (int i=vstart; i<iend; i++)
vatom[_ilist[i]][j]+=engv[i];
if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++)
vatom[ii++][j]+=engv[i];
} else {
int ii=0;
for (int i=vstart; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
vstart+=_inum;
iend+=_inum;
}
@ -218,8 +221,9 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
return energy_virial(eatom,vatom,virial);
double evdwl=0.0;
int vstart=0, iend=_inum*2;
int vstart=0, iend=_inum;
if (_eflag) {
iend=_inum*2;
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
for (int i=_inum; i<iend; i++)
@ -238,18 +242,21 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
}
vstart=iend;
iend+=_inum;
}
if (_vflag) {
}
if (_vflag) {
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_ilist==NULL)
if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++)
vatom[i][j]+=engv[i];
else
vatom[ii++][j]+=engv[i];
} else {
int ii=0;
for (int i=vstart; i<iend; i++)
vatom[_ilist[i]][j]+=engv[i];
vatom[_ilist[ii++]][j]+=engv[i];
}
vstart+=_inum;
iend+=_inum;
}

View File

@ -112,6 +112,7 @@ void BuckCoulLongT::clear() {
coeff1.clear();
coeff2.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}

View File

@ -10,7 +10,7 @@
__________________________________________________________________________
begin : 8/15/2012
email : nguyentdw@ornl.gov
email : nguyentd@ornl.gov
***************************************************************************/
#if defined(USE_OPENCL)

View File

@ -10,7 +10,7 @@
__________________________________________________________________________
begin : 8/15/2012
email : nguyentdw@ornl.gov
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_LJ_DSF_H

View File

@ -16,7 +16,7 @@
#if defined(USE_OPENCL)
#include "sw_cl.h"
#elif defined(USE_CUDART)
const char *lj=0;
const char *sw=0;
#else
#include "sw_cubin.h"
#endif
@ -43,28 +43,15 @@ int SWT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
int SWT::init(const int nlocal, const int nall, const int max_nbors,
const double cell_size, const double gpu_split, FILE *_screen,
const double epsilon, const double sigma,
const double lambda, const double gamma,
const double costheta, const double biga,
const double bigb, const double powerp,
const double powerq, const double cut, const double cutsq) {
sw_epsilon=static_cast<numtyp>(epsilon);
sw_sigma=static_cast<numtyp>(sigma);
sw_lambda=static_cast<numtyp>(lambda);
sw_gamma=static_cast<numtyp>(gamma);
sw_costheta=static_cast<numtyp>(costheta);
sw_biga=static_cast<numtyp>(biga);
sw_bigb=static_cast<numtyp>(bigb);
sw_powerp=static_cast<numtyp>(powerp);
sw_powerq=static_cast<numtyp>(powerq);
sw_cut=static_cast<numtyp>(cut);
sw_cutsq=static_cast<numtyp>(cutsq);
if (sw_cutsq>=sw_cut*sw_cut)
sw_cutsq=sw_cut*sw_cut-1e-4;
int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors,
const double cell_size, const double gpu_split, FILE *_screen,
int* host_map, const int nelements, int*** host_elem2param, const int nparams,
const double* epsilon, const double* sigma,
const double* lambda, const double* gamma,
const double* costheta, const double* biga,
const double* bigb, const double* powerp,
const double* powerq, const double* cut, const double* cutsq)
{
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,sw,"k_sw","k_sw_three_center",
@ -73,10 +60,93 @@ int SWT::init(const int nlocal, const int nall, const int max_nbors,
return success;
// If atom type constants fit in shared memory use fast kernel
shared_types=true;
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
_nparams = nparams;
_nelements = nelements;
UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++) {
dview[i].x=(numtyp)0;
dview[i].y=(numtyp)0;
dview[i].z=(numtyp)0;
dview[i].w=(numtyp)0;
}
// pack coefficients into arrays
sw1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<nparams; i++) {
dview[i].x=static_cast<numtyp>(epsilon[i]);
dview[i].y=static_cast<numtyp>(sigma[i]);
dview[i].z=static_cast<numtyp>(lambda[i]);
dview[i].w=static_cast<numtyp>(gamma[i]);
}
ucl_copy(sw1,dview,false);
sw1_tex.get_texture(*(this->pair_program),"sw1_tex");
sw1_tex.bind_float(sw1,4);
sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<nparams; i++) {
dview[i].x=static_cast<numtyp>(biga[i]);
dview[i].y=static_cast<numtyp>(bigb[i]);
dview[i].z=static_cast<numtyp>(powerp[i]);
dview[i].w=static_cast<numtyp>(powerq[i]);
}
ucl_copy(sw2,dview,false);
sw2_tex.get_texture(*(this->pair_program),"sw2_tex");
sw2_tex.bind_float(sw2,4);
sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<nparams; i++) {
dview[i].x=static_cast<numtyp>(cut[i]);
dview[i].y=static_cast<numtyp>(cutsq[i]);
dview[i].z=static_cast<numtyp>(costheta[i]);
dview[i].w=(numtyp)0;
}
ucl_copy(sw3,dview,false);
sw3_tex.get_texture(*(this->pair_program),"sw3_tex");
sw3_tex.bind_float(sw3,4);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
elem2param.alloc(nelements*nelements*nelements,*(this->ucl_device),
UCL_READ_ONLY);
for (int i = 0; i < nelements; i++)
for (int j = 0; j < nelements; j++)
for (int k = 0; k < nelements; k++) {
int idx = i*nelements*nelements+j*nelements+k;
dview_elem2param[idx] = host_elem2param[i][j][k];
}
ucl_copy(elem2param,dview_elem2param,false);
UCL_H_Vec<int> dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY);
for (int i = 0; i < lj_types; i++)
dview_map[i] = host_map[i];
map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(map,dview_map,false);
_allocated=true;
this->_max_bytes=0;
this->_max_bytes=sw1.row_bytes()+sw2.row_bytes()+sw3.row_bytes()+
map.row_bytes()+elem2param.row_bytes();
return 0;
}
@ -86,6 +156,11 @@ void SWT::clear() {
return;
_allocated=false;
sw1.clear();
sw2.clear();
sw3.clear();
map.clear();
elem2param.clear();
this->clear_atomic();
}
@ -121,22 +196,23 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &sw_cut, &sw_epsilon, &sw_sigma,
&sw_biga, &sw_bigb, &sw_powerp, &sw_powerq, &sw_cutsq);
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
BX=this->block_size();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/(KTHREADS*JTHREADS))));
this->k_three_center.set_size(GX,BX);
this->k_three_center.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom,
&sw_cut, &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
&sw_costheta, &sw_cutsq);
this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
Answer<numtyp,acctyp> *end_ans;
#ifdef THREE_CONCURRENT
end_ans=this->ans2;
@ -145,20 +221,20 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
#endif
if (evatom!=0) {
this->k_three_end_vatom.set_size(GX,BX);
this->k_three_end_vatom.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &end_ans->force,
&end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &sw_cut,
&sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
&sw_costheta, &sw_cutsq);
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &end_ans->force,
&end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &sw_cut,
&sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
&sw_costheta, &sw_cutsq);
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -15,13 +15,24 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float4> sw1_tex;
texture<float4> sw2_tex;
texture<float4> sw3_tex;
#else
texture<int4,1> pos_tex;
texture<int4> sw1_tex;
texture<int4> sw2_tex;
texture<int4> sw3_tex;
#endif
#else
#define pos_tex x_
#define sw1_tex sw1
#define sw2_tex sw2
#define sw3_tex sw3
#endif
#define THIRD (numtyp)0.66666667
@ -60,15 +71,15 @@ texture<int4,1> pos_tex;
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
engv[ei]+=energy*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]+=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
acctyp4 old=ans[ii]; \
@ -97,15 +108,15 @@ texture<int4,1> pos_tex;
} \
} \
if (offset==0) { \
engv+=ii; \
int ei=ii; \
if (eflag>0) { \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
engv[ei]+=energy*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]*(acctyp)0.5; \
engv+=inum; \
engv[ei]+=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
acctyp4 old=ans[ii]; \
@ -118,34 +129,20 @@ texture<int4,1> pos_tex;
#endif
__kernel void k_sw(const __global numtyp4 *restrict x_,
__kernel void k_sw(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
const __global numtyp4 *restrict sw2,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
const numtyp sw_cut, const numtyp sw_epsilon,
const numtyp sw_sigma, const numtyp sw_biga,
const numtyp sw_bigb, const numtyp sw_powerp,
const numtyp sw_powerq, const numtyp sw_cutsq) {
const int nbor_pitch, const int t_per_atom) {
__local int n_stride;
__local numtyp pre_sw_c1, pre_sw_c2, pre_sw_c3, pre_sw_c4;
__local numtyp pre_sw_c5, pre_sw_c6;
pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
pow(sw_sigma,sw_powerp);
pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
pow(sw_sigma,sw_powerq);
pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp+(numtyp)1.0);
pre_sw_c4=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq+(numtyp)1.0);
pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp);
pre_sw_c6=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq);
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -157,7 +154,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj;
@ -165,16 +162,19 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
int itype=ix.w;
itype=map[itype];
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w;
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -182,7 +182,31 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<sw_cutsq) {
if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
numtyp sw_epsilon=sw1_ijparam.x;
numtyp sw_sigma=sw1_ijparam.y;
numtyp4 sw2_ijparam; fetch4(sw2_ijparam,ijparam,sw2_tex);
numtyp sw_biga=sw2_ijparam.x;
numtyp sw_bigb=sw2_ijparam.y;
numtyp sw_powerp=sw2_ijparam.z;
numtyp sw_powerq=sw2_ijparam.w;
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
numtyp sw_cut=sw3_ijparam.x;
numtyp sw_cutsq=sw3_ijparam.y;
numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
pow(sw_sigma,sw_powerp);
numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
pow(sw_sigma,sw_powerq);
numtyp pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp+(numtyp)1.0);
numtyp pre_sw_c4=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq+(numtyp)1.0);
numtyp pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp);
numtyp pre_sw_c6=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq);
numtyp r=ucl_sqrt(rsq);
numtyp rp=ucl_powr(r,-sw_powerp);
numtyp rq=ucl_powr(r,-sw_powerq);
@ -209,40 +233,41 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
#define threebody(delr1x, delr1y, delr1z, eflag, energy) \
{ \
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - sw_cut); \
numtyp gsrainv1 = sw_sigma_gamma * rainv1; \
numtyp rainv1 = ucl_recip(r1 - sw_cut_ij); \
numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rinvsq2 = ucl_recip(rsq2); \
numtyp rainv2 = ucl_recip(r2 - sw_cut); \
numtyp gsrainv2 = sw_sigma_gamma * rainv2; \
numtyp rainv2 = ucl_recip(r2 - sw_cut_ik); \
numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2; \
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - sw_costheta; \
numtyp delcs = cs - sw_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = sw_lambda_epsilon * facexp*delcssq; \
numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp frad2 = facrad*gsrainvsq2; \
numtyp facang = sw_lambda_epsilon2 * facexp*delcs; \
numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -273,26 +298,26 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
{ \
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - sw_cut); \
numtyp gsrainv1 = sw_sigma_gamma * rainv1; \
numtyp rainv1 = ucl_recip(r1 - sw_cut_ij); \
numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rainv2 = ucl_recip(r2 - sw_cut); \
numtyp gsrainv2 = sw_sigma_gamma * rainv2; \
numtyp rainv2 = ucl_recip(r2 - sw_cut_ik); \
numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - sw_costheta; \
numtyp delcs = cs - sw_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = sw_lambda_epsilon * facexp*delcssq; \
numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp facang = sw_lambda_epsilon2 * facexp*delcs; \
numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -303,24 +328,24 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
}
__kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
const __global numtyp4 *restrict sw2,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, const int evatom,
const numtyp sw_cut, const numtyp sw_epsilon,
const numtyp sw_sigma, const numtyp sw_lambda,
const numtyp sw_gamma, const numtyp sw_costheta,
const numtyp sw_cutsq) {
const int t_per_atom, const int evatom) {
__local int tpa_sq, n_stride;
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
__local numtyp sw_lambda_epsilon2;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
sw_sigma_gamma=sw_sigma*sw_gamma;
sw_lambda_epsilon=sw_lambda*sw_epsilon;
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
int tid, ii, offset;
atom_info(tpa_sq,ii,tid,offset);
@ -344,8 +369,8 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
int itype=ix.w;
itype=map[itype];
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
@ -353,15 +378,25 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w;
int jtype=jx.w;
jtype=map[jtype];
// Compute r12
numtyp delr1x = jx.x-ix.x;
numtyp delr1y = jx.y-ix.y;
numtyp delr1z = jx.z-ix.z;
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
if (rsq1 > sw_cutsq) continue;
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
const __global int *nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
@ -372,11 +407,31 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
k &= NEIGHMASK;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
numtyp delr2x = kx.x-ix.x;
numtyp delr2y = kx.y-ix.y;
numtyp delr2z = kx.z-ix.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw_cutsq) {
if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz, fkx, fky, fkz;
threebody(delr1x,delr1y,delr1z,eflag,energy);
@ -403,23 +458,24 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
}
__kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
const __global numtyp4 *restrict sw2,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, const numtyp sw_cut,
const numtyp sw_epsilon, const numtyp sw_sigma,
const numtyp sw_lambda, const numtyp sw_gamma,
const numtyp sw_costheta, const numtyp sw_cutsq) {
const int t_per_atom) {
__local int tpa_sq, n_stride;
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
__local numtyp sw_lambda_epsilon2;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
sw_sigma_gamma=sw_sigma*sw_gamma;
sw_lambda_epsilon=sw_lambda*sw_epsilon;
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
int tid, ii, offset;
atom_info(tpa_sq,ii,tid,offset);
@ -443,23 +499,33 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
int itype=ix.w;
itype=map[itype];
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
int j=*nbor_j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w;
int jtype=jx.w;
jtype=map[jtype];
// Compute r12
numtyp delr1x = ix.x-jx.x;
numtyp delr1y = ix.y-jx.y;
numtyp delr1z = ix.z-jx.z;
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
if (rsq1 > sw_cutsq) continue;
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
int numk=*nbor_k;
@ -478,15 +544,35 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int k=*nbor_k;
k &= NEIGHMASK;
if (k == i)
continue;
if (k == i) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
numtyp delr2x = kx.x - jx.x;
numtyp delr2y = kx.y - jx.y;
numtyp delr2z = kx.z - jx.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw_cutsq) {
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz;
//if (evatom==0) {
threebody_half(delr1x,delr1y,delr1z);
@ -512,24 +598,25 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
} // if ii
}
__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
const __global numtyp4 *restrict sw2,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom, const numtyp sw_cut,
const numtyp sw_epsilon, const numtyp sw_sigma,
const numtyp sw_lambda, const numtyp sw_gamma,
const numtyp sw_costheta, const numtyp sw_cutsq) {
const int t_per_atom) {
__local int tpa_sq, n_stride;
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
__local numtyp sw_lambda_epsilon2;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
sw_sigma_gamma=sw_sigma*sw_gamma;
sw_lambda_epsilon=sw_lambda*sw_epsilon;
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
int tid, ii, offset;
atom_info(tpa_sq,ii,tid,offset);
@ -553,26 +640,36 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
int itype=ix.w;
itype=map[itype];
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
int j=*nbor_j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w;
int jtype=jx.w;
jtype=map[jtype];
// Compute r12
numtyp delr1x = ix.x-jx.x;
numtyp delr1y = ix.y-jx.y;
numtyp delr1z = ix.z-jx.z;
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
if (rsq1 > sw_cutsq) continue;
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
int numk=*nbor_k;
int numk=*nbor_k;
if (dev_nbor==dev_packed) {
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
@ -588,15 +685,35 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int k=*nbor_k;
k &= NEIGHMASK;
if (k == i)
continue;
if (k == i) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
numtyp delr2x = kx.x - jx.x;
numtyp delr2y = kx.y - jx.y;
numtyp delr2z = kx.z - jx.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw_cutsq) {
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz, fkx, fky, fkz;
threebody(delr1x,delr1y,delr1z,eflag,energy);

View File

@ -37,13 +37,14 @@ class SW : public BaseThree<numtyp, acctyp> {
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int nlocal, const int nall, const int max_nbors,
int init(const int ntypes, const int nlocal, const int nall, const int max_nbors,
const double cell_size, const double gpu_split, FILE *screen,
const double epsilon, const double sigma,
const double lambda, const double gamma,
const double costheta, const double biga,
const double bigb, const double powerp,
const double powerq, const double cut, const double cutsq);
int* host_map, const int nelements, int*** host_elem2param, const int nparams,
const double* epsilon, const double* sigma,
const double* lambda, const double* gamma,
const double* costheta, const double* biga,
const double* bigb, const double* powerp,
const double* powerq, const double* cut, const double* cutsq);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
@ -60,11 +61,26 @@ class SW : public BaseThree<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
/// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma
UCL_D_Vec<numtyp4> sw1;
/// sw2.x = biga, sw2.y = bigb, sw2.z = powerp, sw2.w = powerq
UCL_D_Vec<numtyp4> sw2;
/// sw3.x = cut, sw3.y = cutsq, sw3.z = costheta
UCL_D_Vec<numtyp4> sw3;
UCL_D_Vec<int> elem2param;
UCL_D_Vec<int> map;
int _nparams,_nelements;
UCL_Texture sw1_tex, sw2_tex, sw3_tex;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag, const int evatom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta;
numtyp sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq;
};
}

View File

@ -27,14 +27,15 @@ static SW<PRECISION,ACC_PRECISION> SWMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int sw_gpu_init(const int inum, const int nall, const int max_nbors,
int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen,
const double sw_epsilon, const double sw_sigma,
const double sw_lambda, const double sw_gamma,
const double sw_costheta, const double sw_biga,
const double sw_bigb, const double sw_powerp,
const double sw_powerq, const double sw_cut,
const double sw_cutsq) {
int* host_map, const int nelements, int*** host_elem2param, const int nparams,
const double* sw_epsilon, const double* sw_sigma,
const double* sw_lambda, const double* sw_gamma,
const double* sw_costheta, const double* sw_biga,
const double* sw_bigb, const double* sw_powerp,
const double* sw_powerq, const double* sw_cut,
const double* sw_cutsq) {
SWMF.clear();
gpu_mode=SWMF.device->gpu_mode();
double gpu_split=SWMF.device->particle_split();
@ -55,13 +56,14 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
host_map, nelements, host_elem2param, nparams,
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq);
@ -72,14 +74,15 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
host_map, nelements, host_elem2param, nparams,
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut,
sw_cutsq);