git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11230 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2014-01-13 15:51:54 +00:00
parent 427e90c151
commit bbffbe06a2
10 changed files with 438 additions and 193 deletions

View File

@ -64,7 +64,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \ $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \ $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@ -109,7 +110,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \ $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
$(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \ $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \ $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
$(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h
all: $(OBJ_DIR) $(GPU_LIB) $(EXECS) all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@ -644,6 +646,18 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(O
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h $(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR) $(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_gromacs.cubin: lal_lj_gromacs.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_gromacs.cu
$(OBJ_DIR)/lj_gromacs_cubin.h: $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs.cubin
$(BIN2C) -c -n lj_gromacs $(OBJ_DIR)/lj_gromacs.cubin > $(OBJ_DIR)/lj_gromacs_cubin.h
$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H) $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

View File

@ -53,7 +53,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \ $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \ $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@ -75,7 +76,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \ $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \ $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \ $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
$(OBJ_DIR)/lj_gromacs_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -460,6 +462,15 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h $(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR) $(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_gromacs_cl.h: lal_lj_gromacs.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_gromacs $(PRE1_H) lal_lj_gromacs.cu $(OBJ_DIR)/lj_gromacs_cl.h;
$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

View File

@ -194,12 +194,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
virial[j]+=engv[i]; virial[j]+=engv[i];
if (_vf_atom) if (_vf_atom)
if (_ilist==NULL) if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
vatom[i][j]+=engv[i]; vatom[ii++][j]+=engv[i];
else } else {
int ii=0;
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
vatom[_ilist[i]][j]+=engv[i]; vatom[_ilist[ii++]][j]+=engv[i];
}
vstart+=_inum; vstart+=_inum;
iend+=_inum; iend+=_inum;
} }
@ -218,8 +221,9 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
return energy_virial(eatom,vatom,virial); return energy_virial(eatom,vatom,virial);
double evdwl=0.0; double evdwl=0.0;
int vstart=0, iend=_inum*2; int vstart=0, iend=_inum;
if (_eflag) { if (_eflag) {
iend=_inum*2;
for (int i=0; i<_inum; i++) for (int i=0; i<_inum; i++)
evdwl+=engv[i]; evdwl+=engv[i];
for (int i=_inum; i<iend; i++) for (int i=_inum; i<iend; i++)
@ -244,12 +248,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
virial[j]+=engv[i]; virial[j]+=engv[i];
if (_vf_atom) if (_vf_atom)
if (_ilist==NULL) if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
vatom[i][j]+=engv[i]; vatom[ii++][j]+=engv[i];
else } else {
int ii=0;
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
vatom[_ilist[i]][j]+=engv[i]; vatom[_ilist[ii++]][j]+=engv[i];
}
vstart+=_inum; vstart+=_inum;
iend+=_inum; iend+=_inum;
} }

View File

@ -112,6 +112,7 @@ void BuckCoulLongT::clear() {
coeff1.clear(); coeff1.clear();
coeff2.clear(); coeff2.clear();
cutsq.clear();
sp_lj.clear(); sp_lj.clear();
this->clear_atomic(); this->clear_atomic();
} }

View File

@ -10,7 +10,7 @@
__________________________________________________________________________ __________________________________________________________________________
begin : 8/15/2012 begin : 8/15/2012
email : nguyentdw@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
#if defined(USE_OPENCL) #if defined(USE_OPENCL)

View File

@ -10,7 +10,7 @@
__________________________________________________________________________ __________________________________________________________________________
begin : 8/15/2012 begin : 8/15/2012
email : nguyentdw@ornl.gov email : nguyentd@ornl.gov
***************************************************************************/ ***************************************************************************/
#ifndef LAL_LJ_DSF_H #ifndef LAL_LJ_DSF_H

View File

@ -16,7 +16,7 @@
#if defined(USE_OPENCL) #if defined(USE_OPENCL)
#include "sw_cl.h" #include "sw_cl.h"
#elif defined(USE_CUDART) #elif defined(USE_CUDART)
const char *lj=0; const char *sw=0;
#else #else
#include "sw_cubin.h" #include "sw_cubin.h"
#endif #endif
@ -43,28 +43,15 @@ int SWT::bytes_per_atom(const int max_nbors) const {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int SWT::init(const int nlocal, const int nall, const int max_nbors, int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors,
const double cell_size, const double gpu_split, FILE *_screen, const double cell_size, const double gpu_split, FILE *_screen,
const double epsilon, const double sigma, int* host_map, const int nelements, int*** host_elem2param, const int nparams,
const double lambda, const double gamma, const double* epsilon, const double* sigma,
const double costheta, const double biga, const double* lambda, const double* gamma,
const double bigb, const double powerp, const double* costheta, const double* biga,
const double powerq, const double cut, const double cutsq) { const double* bigb, const double* powerp,
const double* powerq, const double* cut, const double* cutsq)
sw_epsilon=static_cast<numtyp>(epsilon); {
sw_sigma=static_cast<numtyp>(sigma);
sw_lambda=static_cast<numtyp>(lambda);
sw_gamma=static_cast<numtyp>(gamma);
sw_costheta=static_cast<numtyp>(costheta);
sw_biga=static_cast<numtyp>(biga);
sw_bigb=static_cast<numtyp>(bigb);
sw_powerp=static_cast<numtyp>(powerp);
sw_powerq=static_cast<numtyp>(powerq);
sw_cut=static_cast<numtyp>(cut);
sw_cutsq=static_cast<numtyp>(cutsq);
if (sw_cutsq>=sw_cut*sw_cut)
sw_cutsq=sw_cut*sw_cut-1e-4;
int success; int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,sw,"k_sw","k_sw_three_center", _screen,sw,"k_sw","k_sw_three_center",
@ -73,10 +60,93 @@ int SWT::init(const int nlocal, const int nall, const int max_nbors,
return success; return success;
// If atom type constants fit in shared memory use fast kernel // If atom type constants fit in shared memory use fast kernel
shared_types=true; int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
_nparams = nparams;
_nelements = nelements;
UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++) {
dview[i].x=(numtyp)0;
dview[i].y=(numtyp)0;
dview[i].z=(numtyp)0;
dview[i].w=(numtyp)0;
}
// pack coefficients into arrays
sw1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<nparams; i++) {
dview[i].x=static_cast<numtyp>(epsilon[i]);
dview[i].y=static_cast<numtyp>(sigma[i]);
dview[i].z=static_cast<numtyp>(lambda[i]);
dview[i].w=static_cast<numtyp>(gamma[i]);
}
ucl_copy(sw1,dview,false);
sw1_tex.get_texture(*(this->pair_program),"sw1_tex");
sw1_tex.bind_float(sw1,4);
sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<nparams; i++) {
dview[i].x=static_cast<numtyp>(biga[i]);
dview[i].y=static_cast<numtyp>(bigb[i]);
dview[i].z=static_cast<numtyp>(powerp[i]);
dview[i].w=static_cast<numtyp>(powerq[i]);
}
ucl_copy(sw2,dview,false);
sw2_tex.get_texture(*(this->pair_program),"sw2_tex");
sw2_tex.bind_float(sw2,4);
sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<nparams; i++) {
dview[i].x=static_cast<numtyp>(cut[i]);
dview[i].y=static_cast<numtyp>(cutsq[i]);
dview[i].z=static_cast<numtyp>(costheta[i]);
dview[i].w=(numtyp)0;
}
ucl_copy(sw3,dview,false);
sw3_tex.get_texture(*(this->pair_program),"sw3_tex");
sw3_tex.bind_float(sw3,4);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
elem2param.alloc(nelements*nelements*nelements,*(this->ucl_device),
UCL_READ_ONLY);
for (int i = 0; i < nelements; i++)
for (int j = 0; j < nelements; j++)
for (int k = 0; k < nelements; k++) {
int idx = i*nelements*nelements+j*nelements+k;
dview_elem2param[idx] = host_elem2param[i][j][k];
}
ucl_copy(elem2param,dview_elem2param,false);
UCL_H_Vec<int> dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY);
for (int i = 0; i < lj_types; i++)
dview_map[i] = host_map[i];
map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(map,dview_map,false);
_allocated=true; _allocated=true;
this->_max_bytes=0; this->_max_bytes=sw1.row_bytes()+sw2.row_bytes()+sw3.row_bytes()+
map.row_bytes()+elem2param.row_bytes();
return 0; return 0;
} }
@ -86,6 +156,11 @@ void SWT::clear() {
return; return;
_allocated=false; _allocated=false;
sw1.clear();
sw2.clear();
sw3.clear();
map.clear();
elem2param.clear();
this->clear_atomic(); this->clear_atomic();
} }
@ -121,22 +196,23 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &this->nbor->dev_nbor, this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
&this->_nbor_data->begin(), &this->ans->force, &map, &elem2param, &_nelements,
&this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->_threads_per_atom, &sw_cut, &sw_epsilon, &sw_sigma, &this->ans->force, &this->ans->engv,
&sw_biga, &sw_bigb, &sw_powerp, &sw_powerq, &sw_cutsq); &eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
BX=this->block_size(); BX=this->block_size();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/(KTHREADS*JTHREADS)))); (BX/(KTHREADS*JTHREADS))));
this->k_three_center.set_size(GX,BX); this->k_three_center.set_size(GX,BX);
this->k_three_center.run(&this->atom->x, &this->nbor->dev_nbor, this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
&this->_nbor_data->begin(), &this->ans->force, &map, &elem2param, &_nelements,
&this->ans->engv, &eflag, &vflag, &ainum, &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&nbor_pitch, &this->_threads_per_atom, &evatom, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&sw_cut, &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma, &nbor_pitch, &this->_threads_per_atom, &evatom);
&sw_costheta, &sw_cutsq);
Answer<numtyp,acctyp> *end_ans; Answer<numtyp,acctyp> *end_ans;
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
end_ans=this->ans2; end_ans=this->ans2;
@ -145,20 +221,20 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
#endif #endif
if (evatom!=0) { if (evatom!=0) {
this->k_three_end_vatom.set_size(GX,BX); this->k_three_end_vatom.set_size(GX,BX);
this->k_three_end_vatom.run(&this->atom->x, &this->nbor->dev_nbor, this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&this->_nbor_data->begin(), &end_ans->force, &map, &elem2param, &_nelements,
&end_ans->engv, &eflag, &vflag, &ainum, &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&nbor_pitch, &this->_threads_per_atom, &sw_cut, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma, &nbor_pitch, &this->_threads_per_atom);
&sw_costheta, &sw_cutsq);
} else { } else {
this->k_three_end.set_size(GX,BX); this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &this->nbor->dev_nbor, this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&this->_nbor_data->begin(), &end_ans->force, &map, &elem2param, &_nelements,
&end_ans->engv, &eflag, &vflag, &ainum, &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&nbor_pitch, &this->_threads_per_atom, &sw_cut, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma, &nbor_pitch, &this->_threads_per_atom);
&sw_costheta, &sw_cutsq);
} }
this->time_pair.stop(); this->time_pair.stop();
} }

View File

@ -15,13 +15,24 @@
#ifdef NV_KERNEL #ifdef NV_KERNEL
#include "lal_aux_fun1.h" #include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE #ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex; texture<float4> pos_tex;
texture<float4> sw1_tex;
texture<float4> sw2_tex;
texture<float4> sw3_tex;
#else #else
texture<int4,1> pos_tex; texture<int4,1> pos_tex;
texture<int4> sw1_tex;
texture<int4> sw2_tex;
texture<int4> sw3_tex;
#endif #endif
#else #else
#define pos_tex x_ #define pos_tex x_
#define sw1_tex sw1
#define sw2_tex sw2
#define sw3_tex sw3
#endif #endif
#define THIRD (numtyp)0.66666667 #define THIRD (numtyp)0.66666667
@ -60,15 +71,15 @@ texture<int4,1> pos_tex;
} \ } \
} \ } \
if (offset==0) { \ if (offset==0) { \
engv+=ii; \ int ei=ii; \
if (eflag>0) { \ if (eflag>0) { \
*engv+=energy*(acctyp)0.5; \ engv[ei]+=energy*(acctyp)0.5; \
engv+=inum; \ ei+=inum; \
} \ } \
if (vflag>0) { \ if (vflag>0) { \
for (int i=0; i<6; i++) { \ for (int i=0; i<6; i++) { \
*engv+=virial[i]*(acctyp)0.5; \ engv[ei]+=virial[i]*(acctyp)0.5; \
engv+=inum; \ ei+=inum; \
} \ } \
} \ } \
acctyp4 old=ans[ii]; \ acctyp4 old=ans[ii]; \
@ -97,15 +108,15 @@ texture<int4,1> pos_tex;
} \ } \
} \ } \
if (offset==0) { \ if (offset==0) { \
engv+=ii; \ int ei=ii; \
if (eflag>0) { \ if (eflag>0) { \
*engv+=energy*(acctyp)0.5; \ engv[ei]+=energy*(acctyp)0.5; \
engv+=inum; \ ei+=inum; \
} \ } \
if (vflag>0) { \ if (vflag>0) { \
for (int i=0; i<6; i++) { \ for (int i=0; i<6; i++) { \
*engv+=virial[i]*(acctyp)0.5; \ engv[ei]+=virial[i]*(acctyp)0.5; \
engv+=inum; \ ei+=inum; \
} \ } \
} \ } \
acctyp4 old=ans[ii]; \ acctyp4 old=ans[ii]; \
@ -119,33 +130,19 @@ texture<int4,1> pos_tex;
__kernel void k_sw(const __global numtyp4 *restrict x_, __kernel void k_sw(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
const __global numtyp4 *restrict sw2,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom) {
const numtyp sw_cut, const numtyp sw_epsilon,
const numtyp sw_sigma, const numtyp sw_biga,
const numtyp sw_bigb, const numtyp sw_powerp,
const numtyp sw_powerq, const numtyp sw_cutsq) {
__local int n_stride; __local int n_stride;
__local numtyp pre_sw_c1, pre_sw_c2, pre_sw_c3, pre_sw_c4;
__local numtyp pre_sw_c5, pre_sw_c6;
pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
pow(sw_sigma,sw_powerp);
pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
pow(sw_sigma,sw_powerq);
pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp+(numtyp)1.0);
pre_sw_c4=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq+(numtyp)1.0);
pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp);
pre_sw_c6=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq);
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
@ -165,8 +162,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
n_stride,list_end,nbor); n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w; int itype=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw); itype=map[itype];
for ( ; nbor<list_end; nbor+=n_stride) { for ( ; nbor<list_end; nbor+=n_stride) {
@ -174,7 +171,10 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w; int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12 // Compute r12
numtyp delx = ix.x-jx.x; numtyp delx = ix.x-jx.x;
@ -182,7 +182,31 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z; numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<sw_cutsq) { if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
numtyp sw_epsilon=sw1_ijparam.x;
numtyp sw_sigma=sw1_ijparam.y;
numtyp4 sw2_ijparam; fetch4(sw2_ijparam,ijparam,sw2_tex);
numtyp sw_biga=sw2_ijparam.x;
numtyp sw_bigb=sw2_ijparam.y;
numtyp sw_powerp=sw2_ijparam.z;
numtyp sw_powerq=sw2_ijparam.w;
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
numtyp sw_cut=sw3_ijparam.x;
numtyp sw_cutsq=sw3_ijparam.y;
numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
pow(sw_sigma,sw_powerp);
numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
pow(sw_sigma,sw_powerq);
numtyp pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp+(numtyp)1.0);
numtyp pre_sw_c4=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq+(numtyp)1.0);
numtyp pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
pow(sw_sigma,sw_powerp);
numtyp pre_sw_c6=sw_biga*sw_epsilon*
pow(sw_sigma,sw_powerq);
numtyp r=ucl_sqrt(rsq); numtyp r=ucl_sqrt(rsq);
numtyp rp=ucl_powr(r,-sw_powerp); numtyp rp=ucl_powr(r,-sw_powerp);
numtyp rq=ucl_powr(r,-sw_powerq); numtyp rq=ucl_powr(r,-sw_powerq);
@ -209,40 +233,41 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
virial[5] += dely*delz*force; virial[5] += dely*delz*force;
} }
} }
} // for nbor } // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} // if ii } // if ii
} }
#define threebody(delr1x, delr1y, delr1z, eflag, energy) \ #define threebody(delr1x, delr1y, delr1z, eflag, energy) \
{ \ { \
numtyp r1 = ucl_sqrt(rsq1); \ numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \ numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - sw_cut); \ numtyp rainv1 = ucl_recip(r1 - sw_cut_ij); \
numtyp gsrainv1 = sw_sigma_gamma * rainv1; \ numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \ numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \ numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\ \
numtyp r2 = ucl_sqrt(rsq2); \ numtyp r2 = ucl_sqrt(rsq2); \
numtyp rinvsq2 = ucl_recip(rsq2); \ numtyp rinvsq2 = ucl_recip(rsq2); \
numtyp rainv2 = ucl_recip(r2 - sw_cut); \ numtyp rainv2 = ucl_recip(r2 - sw_cut_ik); \
numtyp gsrainv2 = sw_sigma_gamma * rainv2; \ numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2; \
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \ numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \ numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\ \
numtyp rinv12 = ucl_recip(r1*r2); \ numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \ numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - sw_costheta; \ numtyp delcs = cs - sw_costheta_ijk; \
numtyp delcssq = delcs*delcs; \ numtyp delcssq = delcs*delcs; \
\ \
numtyp facexp = expgsrainv1*expgsrainv2; \ numtyp facexp = expgsrainv1*expgsrainv2; \
\ \
numtyp facrad = sw_lambda_epsilon * facexp*delcssq; \ numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \
numtyp frad1 = facrad*gsrainvsq1; \ numtyp frad1 = facrad*gsrainvsq1; \
numtyp frad2 = facrad*gsrainvsq2; \ numtyp frad2 = facrad*gsrainvsq2; \
numtyp facang = sw_lambda_epsilon2 * facexp*delcs; \ numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \
numtyp facang12 = rinv12*facang; \ numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \ numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \ numtyp csfac1 = rinvsq1*csfacang; \
@ -273,26 +298,26 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
{ \ { \
numtyp r1 = ucl_sqrt(rsq1); \ numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \ numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - sw_cut); \ numtyp rainv1 = ucl_recip(r1 - sw_cut_ij); \
numtyp gsrainv1 = sw_sigma_gamma * rainv1; \ numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \ numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \ numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\ \
numtyp r2 = ucl_sqrt(rsq2); \ numtyp r2 = ucl_sqrt(rsq2); \
numtyp rainv2 = ucl_recip(r2 - sw_cut); \ numtyp rainv2 = ucl_recip(r2 - sw_cut_ik); \
numtyp gsrainv2 = sw_sigma_gamma * rainv2; \ numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \ numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\ \
numtyp rinv12 = ucl_recip(r1*r2); \ numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \ numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - sw_costheta; \ numtyp delcs = cs - sw_costheta_ijk; \
numtyp delcssq = delcs*delcs; \ numtyp delcssq = delcs*delcs; \
\ \
numtyp facexp = expgsrainv1*expgsrainv2; \ numtyp facexp = expgsrainv1*expgsrainv2; \
\ \
numtyp facrad = sw_lambda_epsilon * facexp*delcssq; \ numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \
numtyp frad1 = facrad*gsrainvsq1; \ numtyp frad1 = facrad*gsrainvsq1; \
numtyp facang = sw_lambda_epsilon2 * facexp*delcs; \ numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \
numtyp facang12 = rinv12*facang; \ numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \ numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \ numtyp csfac1 = rinvsq1*csfacang; \
@ -303,24 +328,24 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
} }
__kernel void k_sw_three_center(const __global numtyp4 *restrict x_, __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
const __global numtyp4 *restrict sw2,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
const int t_per_atom, const int evatom, const int t_per_atom, const int evatom) {
const numtyp sw_cut, const numtyp sw_epsilon,
const numtyp sw_sigma, const numtyp sw_lambda,
const numtyp sw_gamma, const numtyp sw_costheta,
const numtyp sw_cutsq) {
__local int tpa_sq, n_stride; __local int tpa_sq, n_stride;
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
__local numtyp sw_lambda_epsilon2;
tpa_sq=fast_mul(t_per_atom,t_per_atom); tpa_sq=fast_mul(t_per_atom,t_per_atom);
sw_sigma_gamma=sw_sigma*sw_gamma; numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
sw_lambda_epsilon=sw_lambda*sw_epsilon; numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon; numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
int tid, ii, offset; int tid, ii, offset;
atom_info(tpa_sq,ii,tid,offset); atom_info(tpa_sq,ii,tid,offset);
@ -344,8 +369,8 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
int offset_k=tid & (t_per_atom-1); int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w; int itype=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw); itype=map[itype];
for ( ; nbor_j<list_end; nbor_j+=n_stride) { for ( ; nbor_j<list_end; nbor_j+=n_stride) {
@ -353,7 +378,8 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w; int jtype=jx.w;
jtype=map[jtype];
// Compute r12 // Compute r12
numtyp delr1x = jx.x-ix.x; numtyp delr1x = jx.x-ix.x;
@ -361,7 +387,16 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
numtyp delr1z = jx.z-ix.z; numtyp delr1z = jx.z-ix.z;
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z; numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
if (rsq1 > sw_cutsq) continue; int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
const __global int *nbor_k=nbor_j-offset_j+offset_k; const __global int *nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j) if (nbor_k<=nbor_j)
@ -372,11 +407,31 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
k &= NEIGHMASK; k &= NEIGHMASK;
numtyp4 kx; fetch4(kx,k,pos_tex); numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
numtyp delr2x = kx.x-ix.x; numtyp delr2x = kx.x-ix.x;
numtyp delr2y = kx.y-ix.y; numtyp delr2y = kx.y-ix.y;
numtyp delr2z = kx.z-ix.z; numtyp delr2z = kx.z-ix.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z; numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw_cutsq) { if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz, fkx, fky, fkz; numtyp fjx, fjy, fjz, fkx, fky, fkz;
threebody(delr1x,delr1y,delr1z,eflag,energy); threebody(delr1x,delr1y,delr1z,eflag,energy);
@ -403,23 +458,24 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
} }
__kernel void k_sw_three_end(const __global numtyp4 *restrict x_, __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
const __global numtyp4 *restrict sw2,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
const int t_per_atom, const numtyp sw_cut, const int t_per_atom) {
const numtyp sw_epsilon, const numtyp sw_sigma,
const numtyp sw_lambda, const numtyp sw_gamma,
const numtyp sw_costheta, const numtyp sw_cutsq) {
__local int tpa_sq, n_stride; __local int tpa_sq, n_stride;
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
__local numtyp sw_lambda_epsilon2;
tpa_sq=fast_mul(t_per_atom,t_per_atom); tpa_sq=fast_mul(t_per_atom,t_per_atom);
sw_sigma_gamma=sw_sigma*sw_gamma; numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
sw_lambda_epsilon=sw_lambda*sw_epsilon; numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon; numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
int tid, ii, offset; int tid, ii, offset;
atom_info(tpa_sq,ii,tid,offset); atom_info(tpa_sq,ii,tid,offset);
@ -443,15 +499,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int offset_k=tid & (t_per_atom-1); int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w; int itype=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw); itype=map[itype];
for ( ; nbor_j<list_end; nbor_j+=n_stride) { for ( ; nbor_j<list_end; nbor_j+=n_stride) {
int j=*nbor_j; int j=*nbor_j;
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w; int jtype=jx.w;
jtype=map[jtype];
// Compute r12 // Compute r12
numtyp delr1x = ix.x-jx.x; numtyp delr1x = ix.x-jx.x;
@ -459,7 +516,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
numtyp delr1z = ix.z-jx.z; numtyp delr1z = ix.z-jx.z;
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z; numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
if (rsq1 > sw_cutsq) continue; int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
const __global int *nbor_k=dev_nbor+j+nbor_pitch; const __global int *nbor_k=dev_nbor+j+nbor_pitch;
int numk=*nbor_k; int numk=*nbor_k;
@ -478,15 +544,35 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int k=*nbor_k; int k=*nbor_k;
k &= NEIGHMASK; k &= NEIGHMASK;
if (k == i) if (k == i) continue;
continue;
numtyp4 kx; fetch4(kx,k,pos_tex); numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
numtyp delr2x = kx.x - jx.x; numtyp delr2x = kx.x - jx.x;
numtyp delr2y = kx.y - jx.y; numtyp delr2y = kx.y - jx.y;
numtyp delr2z = kx.z - jx.z; numtyp delr2z = kx.z - jx.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z; numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw_cutsq) { numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz; numtyp fjx, fjy, fjz;
//if (evatom==0) { //if (evatom==0) {
threebody_half(delr1x,delr1y,delr1z); threebody_half(delr1x,delr1y,delr1z);
@ -513,23 +599,24 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
} }
__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
const __global numtyp4 *restrict sw2,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
const int t_per_atom, const numtyp sw_cut, const int t_per_atom) {
const numtyp sw_epsilon, const numtyp sw_sigma,
const numtyp sw_lambda, const numtyp sw_gamma,
const numtyp sw_costheta, const numtyp sw_cutsq) {
__local int tpa_sq, n_stride; __local int tpa_sq, n_stride;
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
__local numtyp sw_lambda_epsilon2;
tpa_sq=fast_mul(t_per_atom,t_per_atom); tpa_sq=fast_mul(t_per_atom,t_per_atom);
sw_sigma_gamma=sw_sigma*sw_gamma; numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
sw_lambda_epsilon=sw_lambda*sw_epsilon; numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon; numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
int tid, ii, offset; int tid, ii, offset;
atom_info(tpa_sq,ii,tid,offset); atom_info(tpa_sq,ii,tid,offset);
@ -553,15 +640,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int offset_k=tid & (t_per_atom-1); int offset_k=tid & (t_per_atom-1);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
//int iw=ix.w; int itype=ix.w;
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw); itype=map[itype];
for ( ; nbor_j<list_end; nbor_j+=n_stride) { for ( ; nbor_j<list_end; nbor_j+=n_stride) {
int j=*nbor_j; int j=*nbor_j;
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
//int mtype=itype+jx.w; int jtype=jx.w;
jtype=map[jtype];
// Compute r12 // Compute r12
numtyp delr1x = ix.x-jx.x; numtyp delr1x = ix.x-jx.x;
@ -569,7 +657,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
numtyp delr1z = ix.z-jx.z; numtyp delr1z = ix.z-jx.z;
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z; numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
if (rsq1 > sw_cutsq) continue; int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
const __global int *nbor_k=dev_nbor+j+nbor_pitch; const __global int *nbor_k=dev_nbor+j+nbor_pitch;
int numk=*nbor_k; int numk=*nbor_k;
@ -588,15 +685,35 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int k=*nbor_k; int k=*nbor_k;
k &= NEIGHMASK; k &= NEIGHMASK;
if (k == i) if (k == i) continue;
continue;
numtyp4 kx; fetch4(kx,k,pos_tex); numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
numtyp delr2x = kx.x - jx.x; numtyp delr2x = kx.x - jx.x;
numtyp delr2y = kx.y - jx.y; numtyp delr2y = kx.y - jx.y;
numtyp delr2z = kx.z - jx.z; numtyp delr2z = kx.z - jx.z;
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z; numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw_cutsq) {
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
sw_costheta_ijk=sw3_ijkparam.z;
numtyp fjx, fjy, fjz, fkx, fky, fkz; numtyp fjx, fjy, fjz, fkx, fky, fkz;
threebody(delr1x,delr1y,delr1z,eflag,energy); threebody(delr1x,delr1y,delr1z,eflag,energy);

View File

@ -37,13 +37,14 @@ class SW : public BaseThree<numtyp, acctyp> {
* - -3 if there is an out of memory error * - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU * - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/ * - -5 Double precision is not supported on card **/
int init(const int nlocal, const int nall, const int max_nbors, int init(const int ntypes, const int nlocal, const int nall, const int max_nbors,
const double cell_size, const double gpu_split, FILE *screen, const double cell_size, const double gpu_split, FILE *screen,
const double epsilon, const double sigma, int* host_map, const int nelements, int*** host_elem2param, const int nparams,
const double lambda, const double gamma, const double* epsilon, const double* sigma,
const double costheta, const double biga, const double* lambda, const double* gamma,
const double bigb, const double powerp, const double* costheta, const double* biga,
const double powerq, const double cut, const double cutsq); const double* bigb, const double* powerp,
const double* powerq, const double* cut, const double* cutsq);
/// Clear all host and device data /// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/ /** \note This is called at the beginning of the init() routine **/
@ -60,11 +61,26 @@ class SW : public BaseThree<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels /// If atom type constants fit in shared memory, use fast kernels
bool shared_types; bool shared_types;
/// Number of atom types
int _lj_types;
/// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma
UCL_D_Vec<numtyp4> sw1;
/// sw2.x = biga, sw2.y = bigb, sw2.z = powerp, sw2.w = powerq
UCL_D_Vec<numtyp4> sw2;
/// sw3.x = cut, sw3.y = cutsq, sw3.z = costheta
UCL_D_Vec<numtyp4> sw3;
UCL_D_Vec<int> elem2param;
UCL_D_Vec<int> map;
int _nparams,_nelements;
UCL_Texture sw1_tex, sw2_tex, sw3_tex;
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag, const int evatom); void loop(const bool _eflag, const bool _vflag, const int evatom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta;
numtyp sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq;
}; };
} }

View File

@ -27,14 +27,15 @@ static SW<PRECISION,ACC_PRECISION> SWMF;
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device // Allocate memory on host and device and copy constants to device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
int sw_gpu_init(const int inum, const int nall, const int max_nbors, int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen, const double cell_size, int &gpu_mode, FILE *screen,
const double sw_epsilon, const double sw_sigma, int* host_map, const int nelements, int*** host_elem2param, const int nparams,
const double sw_lambda, const double sw_gamma, const double* sw_epsilon, const double* sw_sigma,
const double sw_costheta, const double sw_biga, const double* sw_lambda, const double* sw_gamma,
const double sw_bigb, const double sw_powerp, const double* sw_costheta, const double* sw_biga,
const double sw_powerq, const double sw_cut, const double* sw_bigb, const double* sw_powerp,
const double sw_cutsq) { const double* sw_powerq, const double* sw_cut,
const double* sw_cutsq) {
SWMF.clear(); SWMF.clear();
gpu_mode=SWMF.device->gpu_mode(); gpu_mode=SWMF.device->gpu_mode();
double gpu_split=SWMF.device->particle_split(); double gpu_split=SWMF.device->particle_split();
@ -55,13 +56,14 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
message=true; message=true;
if (message) { if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0..."); fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen); fflush(screen);
} }
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen, init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
host_map, nelements, host_elem2param, nparams,
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq);
@ -72,14 +74,15 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
for (int i=0; i<procs_per_gpu; i++) { for (int i=0; i<procs_per_gpu; i++) {
if (message) { if (message) {
if (last_gpu-first_gpu==0) if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i); last_gpu,i);
fflush(screen); fflush(screen);
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen, init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
host_map, nelements, host_elem2param, nparams,
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut,
sw_cutsq); sw_cutsq);