forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11230 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
427e90c151
commit
bbffbe06a2
|
@ -64,7 +64,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
|
||||||
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
|
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
|
||||||
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
|
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
|
||||||
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
|
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
|
||||||
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
|
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
|
||||||
|
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
|
||||||
|
|
||||||
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
|
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
|
||||||
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
|
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
|
||||||
|
@ -109,7 +110,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
|
||||||
$(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
|
$(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
|
||||||
$(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
|
$(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
|
||||||
$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
|
$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
|
||||||
$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h
|
$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
|
||||||
|
$(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h
|
||||||
|
|
||||||
all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
|
all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
|
||||||
|
|
||||||
|
@ -644,6 +646,18 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(O
|
||||||
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
|
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
|
||||||
$(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
|
$(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lj_gromacs.cubin: lal_lj_gromacs.cu lal_precision.h lal_preprocessor.h
|
||||||
|
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_gromacs.cu
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lj_gromacs_cubin.h: $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs.cubin
|
||||||
|
$(BIN2C) -c -n lj_gromacs $(OBJ_DIR)/lj_gromacs.cubin > $(OBJ_DIR)/lj_gromacs_cubin.h
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cubin.h $(OBJ_DIR)/lal_base_atomic.o
|
||||||
|
$(CUDR) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
|
||||||
|
$(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
|
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
|
||||||
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda
|
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda
|
||||||
|
|
||||||
|
|
|
@ -53,7 +53,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
|
||||||
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
|
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
|
||||||
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
|
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
|
||||||
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
|
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
|
||||||
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
|
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
|
||||||
|
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o
|
||||||
|
|
||||||
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
|
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
|
||||||
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
|
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
|
||||||
|
@ -75,7 +76,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
|
||||||
$(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
|
$(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
|
||||||
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
|
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
|
||||||
$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
|
$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
|
||||||
$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h
|
$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
|
||||||
|
$(OBJ_DIR)/lj_gromacs_cl.h
|
||||||
|
|
||||||
|
|
||||||
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
|
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
|
||||||
|
@ -460,6 +462,15 @@ $(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(
|
||||||
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
|
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
|
||||||
$(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
|
$(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lj_gromacs_cl.h: lal_lj_gromacs.cu $(PRE1_H)
|
||||||
|
$(BSH) ./geryon/file_to_cstr.sh lj_gromacs $(PRE1_H) lal_lj_gromacs.cu $(OBJ_DIR)/lj_gromacs_cl.h;
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lal_base_atomic.o
|
||||||
|
$(OCL) -o $@ -c lal_lj_gromacs.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
|
$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
|
||||||
|
$(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
|
||||||
|
|
||||||
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
|
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
|
||||||
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
|
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
|
||||||
|
|
||||||
|
|
|
@ -194,12 +194,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||||
for (int i=vstart; i<iend; i++)
|
for (int i=vstart; i<iend; i++)
|
||||||
virial[j]+=engv[i];
|
virial[j]+=engv[i];
|
||||||
if (_vf_atom)
|
if (_vf_atom)
|
||||||
if (_ilist==NULL)
|
if (_ilist==NULL) {
|
||||||
|
int ii=0;
|
||||||
for (int i=vstart; i<iend; i++)
|
for (int i=vstart; i<iend; i++)
|
||||||
vatom[i][j]+=engv[i];
|
vatom[ii++][j]+=engv[i];
|
||||||
else
|
} else {
|
||||||
|
int ii=0;
|
||||||
for (int i=vstart; i<iend; i++)
|
for (int i=vstart; i<iend; i++)
|
||||||
vatom[_ilist[i]][j]+=engv[i];
|
vatom[_ilist[ii++]][j]+=engv[i];
|
||||||
|
}
|
||||||
vstart+=_inum;
|
vstart+=_inum;
|
||||||
iend+=_inum;
|
iend+=_inum;
|
||||||
}
|
}
|
||||||
|
@ -218,8 +221,9 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||||
return energy_virial(eatom,vatom,virial);
|
return energy_virial(eatom,vatom,virial);
|
||||||
|
|
||||||
double evdwl=0.0;
|
double evdwl=0.0;
|
||||||
int vstart=0, iend=_inum*2;
|
int vstart=0, iend=_inum;
|
||||||
if (_eflag) {
|
if (_eflag) {
|
||||||
|
iend=_inum*2;
|
||||||
for (int i=0; i<_inum; i++)
|
for (int i=0; i<_inum; i++)
|
||||||
evdwl+=engv[i];
|
evdwl+=engv[i];
|
||||||
for (int i=_inum; i<iend; i++)
|
for (int i=_inum; i<iend; i++)
|
||||||
|
@ -244,12 +248,15 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||||
for (int i=vstart; i<iend; i++)
|
for (int i=vstart; i<iend; i++)
|
||||||
virial[j]+=engv[i];
|
virial[j]+=engv[i];
|
||||||
if (_vf_atom)
|
if (_vf_atom)
|
||||||
if (_ilist==NULL)
|
if (_ilist==NULL) {
|
||||||
|
int ii=0;
|
||||||
for (int i=vstart; i<iend; i++)
|
for (int i=vstart; i<iend; i++)
|
||||||
vatom[i][j]+=engv[i];
|
vatom[ii++][j]+=engv[i];
|
||||||
else
|
} else {
|
||||||
|
int ii=0;
|
||||||
for (int i=vstart; i<iend; i++)
|
for (int i=vstart; i<iend; i++)
|
||||||
vatom[_ilist[i]][j]+=engv[i];
|
vatom[_ilist[ii++]][j]+=engv[i];
|
||||||
|
}
|
||||||
vstart+=_inum;
|
vstart+=_inum;
|
||||||
iend+=_inum;
|
iend+=_inum;
|
||||||
}
|
}
|
||||||
|
|
|
@ -112,6 +112,7 @@ void BuckCoulLongT::clear() {
|
||||||
|
|
||||||
coeff1.clear();
|
coeff1.clear();
|
||||||
coeff2.clear();
|
coeff2.clear();
|
||||||
|
cutsq.clear();
|
||||||
sp_lj.clear();
|
sp_lj.clear();
|
||||||
this->clear_atomic();
|
this->clear_atomic();
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
__________________________________________________________________________
|
__________________________________________________________________________
|
||||||
|
|
||||||
begin : 8/15/2012
|
begin : 8/15/2012
|
||||||
email : nguyentdw@ornl.gov
|
email : nguyentd@ornl.gov
|
||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
#if defined(USE_OPENCL)
|
#if defined(USE_OPENCL)
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
__________________________________________________________________________
|
__________________________________________________________________________
|
||||||
|
|
||||||
begin : 8/15/2012
|
begin : 8/15/2012
|
||||||
email : nguyentdw@ornl.gov
|
email : nguyentd@ornl.gov
|
||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
#ifndef LAL_LJ_DSF_H
|
#ifndef LAL_LJ_DSF_H
|
||||||
|
|
|
@ -16,7 +16,7 @@
|
||||||
#if defined(USE_OPENCL)
|
#if defined(USE_OPENCL)
|
||||||
#include "sw_cl.h"
|
#include "sw_cl.h"
|
||||||
#elif defined(USE_CUDART)
|
#elif defined(USE_CUDART)
|
||||||
const char *lj=0;
|
const char *sw=0;
|
||||||
#else
|
#else
|
||||||
#include "sw_cubin.h"
|
#include "sw_cubin.h"
|
||||||
#endif
|
#endif
|
||||||
|
@ -43,28 +43,15 @@ int SWT::bytes_per_atom(const int max_nbors) const {
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
int SWT::init(const int nlocal, const int nall, const int max_nbors,
|
int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors,
|
||||||
const double cell_size, const double gpu_split, FILE *_screen,
|
const double cell_size, const double gpu_split, FILE *_screen,
|
||||||
const double epsilon, const double sigma,
|
int* host_map, const int nelements, int*** host_elem2param, const int nparams,
|
||||||
const double lambda, const double gamma,
|
const double* epsilon, const double* sigma,
|
||||||
const double costheta, const double biga,
|
const double* lambda, const double* gamma,
|
||||||
const double bigb, const double powerp,
|
const double* costheta, const double* biga,
|
||||||
const double powerq, const double cut, const double cutsq) {
|
const double* bigb, const double* powerp,
|
||||||
|
const double* powerq, const double* cut, const double* cutsq)
|
||||||
sw_epsilon=static_cast<numtyp>(epsilon);
|
{
|
||||||
sw_sigma=static_cast<numtyp>(sigma);
|
|
||||||
sw_lambda=static_cast<numtyp>(lambda);
|
|
||||||
sw_gamma=static_cast<numtyp>(gamma);
|
|
||||||
sw_costheta=static_cast<numtyp>(costheta);
|
|
||||||
sw_biga=static_cast<numtyp>(biga);
|
|
||||||
sw_bigb=static_cast<numtyp>(bigb);
|
|
||||||
sw_powerp=static_cast<numtyp>(powerp);
|
|
||||||
sw_powerq=static_cast<numtyp>(powerq);
|
|
||||||
sw_cut=static_cast<numtyp>(cut);
|
|
||||||
sw_cutsq=static_cast<numtyp>(cutsq);
|
|
||||||
if (sw_cutsq>=sw_cut*sw_cut)
|
|
||||||
sw_cutsq=sw_cut*sw_cut-1e-4;
|
|
||||||
|
|
||||||
int success;
|
int success;
|
||||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||||
_screen,sw,"k_sw","k_sw_three_center",
|
_screen,sw,"k_sw","k_sw_three_center",
|
||||||
|
@ -73,10 +60,93 @@ int SWT::init(const int nlocal, const int nall, const int max_nbors,
|
||||||
return success;
|
return success;
|
||||||
|
|
||||||
// If atom type constants fit in shared memory use fast kernel
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
shared_types=true;
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
_nparams = nparams;
|
||||||
|
_nelements = nelements;
|
||||||
|
|
||||||
|
UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
|
||||||
|
UCL_WRITE_ONLY);
|
||||||
|
|
||||||
|
for (int i=0; i<nparams; i++) {
|
||||||
|
dview[i].x=(numtyp)0;
|
||||||
|
dview[i].y=(numtyp)0;
|
||||||
|
dview[i].z=(numtyp)0;
|
||||||
|
dview[i].w=(numtyp)0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// pack coefficients into arrays
|
||||||
|
sw1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
|
||||||
|
for (int i=0; i<nparams; i++) {
|
||||||
|
dview[i].x=static_cast<numtyp>(epsilon[i]);
|
||||||
|
dview[i].y=static_cast<numtyp>(sigma[i]);
|
||||||
|
dview[i].z=static_cast<numtyp>(lambda[i]);
|
||||||
|
dview[i].w=static_cast<numtyp>(gamma[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
ucl_copy(sw1,dview,false);
|
||||||
|
sw1_tex.get_texture(*(this->pair_program),"sw1_tex");
|
||||||
|
sw1_tex.bind_float(sw1,4);
|
||||||
|
|
||||||
|
sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
|
||||||
|
for (int i=0; i<nparams; i++) {
|
||||||
|
dview[i].x=static_cast<numtyp>(biga[i]);
|
||||||
|
dview[i].y=static_cast<numtyp>(bigb[i]);
|
||||||
|
dview[i].z=static_cast<numtyp>(powerp[i]);
|
||||||
|
dview[i].w=static_cast<numtyp>(powerq[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
ucl_copy(sw2,dview,false);
|
||||||
|
sw2_tex.get_texture(*(this->pair_program),"sw2_tex");
|
||||||
|
sw2_tex.bind_float(sw2,4);
|
||||||
|
|
||||||
|
sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
|
||||||
|
for (int i=0; i<nparams; i++) {
|
||||||
|
dview[i].x=static_cast<numtyp>(cut[i]);
|
||||||
|
dview[i].y=static_cast<numtyp>(cutsq[i]);
|
||||||
|
dview[i].z=static_cast<numtyp>(costheta[i]);
|
||||||
|
dview[i].w=(numtyp)0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ucl_copy(sw3,dview,false);
|
||||||
|
sw3_tex.get_texture(*(this->pair_program),"sw3_tex");
|
||||||
|
sw3_tex.bind_float(sw3,4);
|
||||||
|
|
||||||
|
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
|
||||||
|
*(this->ucl_device), UCL_WRITE_ONLY);
|
||||||
|
|
||||||
|
elem2param.alloc(nelements*nelements*nelements,*(this->ucl_device),
|
||||||
|
UCL_READ_ONLY);
|
||||||
|
|
||||||
|
for (int i = 0; i < nelements; i++)
|
||||||
|
for (int j = 0; j < nelements; j++)
|
||||||
|
for (int k = 0; k < nelements; k++) {
|
||||||
|
int idx = i*nelements*nelements+j*nelements+k;
|
||||||
|
dview_elem2param[idx] = host_elem2param[i][j][k];
|
||||||
|
}
|
||||||
|
|
||||||
|
ucl_copy(elem2param,dview_elem2param,false);
|
||||||
|
|
||||||
|
UCL_H_Vec<int> dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||||
|
for (int i = 0; i < lj_types; i++)
|
||||||
|
dview_map[i] = host_map[i];
|
||||||
|
|
||||||
|
map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY);
|
||||||
|
ucl_copy(map,dview_map,false);
|
||||||
|
|
||||||
_allocated=true;
|
_allocated=true;
|
||||||
this->_max_bytes=0;
|
this->_max_bytes=sw1.row_bytes()+sw2.row_bytes()+sw3.row_bytes()+
|
||||||
|
map.row_bytes()+elem2param.row_bytes();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -86,6 +156,11 @@ void SWT::clear() {
|
||||||
return;
|
return;
|
||||||
_allocated=false;
|
_allocated=false;
|
||||||
|
|
||||||
|
sw1.clear();
|
||||||
|
sw2.clear();
|
||||||
|
sw3.clear();
|
||||||
|
map.clear();
|
||||||
|
elem2param.clear();
|
||||||
this->clear_atomic();
|
this->clear_atomic();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -121,22 +196,23 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||||
int nbor_pitch=this->nbor->nbor_pitch();
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
this->time_pair.start();
|
this->time_pair.start();
|
||||||
this->k_pair.set_size(GX,BX);
|
this->k_pair.set_size(GX,BX);
|
||||||
this->k_pair.run(&this->atom->x, &this->nbor->dev_nbor,
|
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||||
&this->_nbor_data->begin(), &this->ans->force,
|
&map, &elem2param, &_nelements,
|
||||||
&this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch,
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
&this->_threads_per_atom, &sw_cut, &sw_epsilon, &sw_sigma,
|
&this->ans->force, &this->ans->engv,
|
||||||
&sw_biga, &sw_bigb, &sw_powerp, &sw_powerq, &sw_cutsq);
|
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||||
|
&this->_threads_per_atom);
|
||||||
|
|
||||||
BX=this->block_size();
|
BX=this->block_size();
|
||||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
(BX/(KTHREADS*JTHREADS))));
|
(BX/(KTHREADS*JTHREADS))));
|
||||||
this->k_three_center.set_size(GX,BX);
|
this->k_three_center.set_size(GX,BX);
|
||||||
this->k_three_center.run(&this->atom->x, &this->nbor->dev_nbor,
|
this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||||
&this->_nbor_data->begin(), &this->ans->force,
|
&map, &elem2param, &_nelements,
|
||||||
&this->ans->engv, &eflag, &vflag, &ainum,
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
&nbor_pitch, &this->_threads_per_atom, &evatom,
|
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||||
&sw_cut, &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
|
&nbor_pitch, &this->_threads_per_atom, &evatom);
|
||||||
&sw_costheta, &sw_cutsq);
|
|
||||||
Answer<numtyp,acctyp> *end_ans;
|
Answer<numtyp,acctyp> *end_ans;
|
||||||
#ifdef THREE_CONCURRENT
|
#ifdef THREE_CONCURRENT
|
||||||
end_ans=this->ans2;
|
end_ans=this->ans2;
|
||||||
|
@ -145,20 +221,20 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||||
#endif
|
#endif
|
||||||
if (evatom!=0) {
|
if (evatom!=0) {
|
||||||
this->k_three_end_vatom.set_size(GX,BX);
|
this->k_three_end_vatom.set_size(GX,BX);
|
||||||
this->k_three_end_vatom.run(&this->atom->x, &this->nbor->dev_nbor,
|
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||||
&this->_nbor_data->begin(), &end_ans->force,
|
&map, &elem2param, &_nelements,
|
||||||
&end_ans->engv, &eflag, &vflag, &ainum,
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
&nbor_pitch, &this->_threads_per_atom, &sw_cut,
|
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||||
&sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
|
&nbor_pitch, &this->_threads_per_atom);
|
||||||
&sw_costheta, &sw_cutsq);
|
|
||||||
} else {
|
} else {
|
||||||
this->k_three_end.set_size(GX,BX);
|
this->k_three_end.set_size(GX,BX);
|
||||||
this->k_three_end.run(&this->atom->x, &this->nbor->dev_nbor,
|
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||||
&this->_nbor_data->begin(), &end_ans->force,
|
&map, &elem2param, &_nelements,
|
||||||
&end_ans->engv, &eflag, &vflag, &ainum,
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
&nbor_pitch, &this->_threads_per_atom, &sw_cut,
|
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||||
&sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
|
&nbor_pitch, &this->_threads_per_atom);
|
||||||
&sw_costheta, &sw_cutsq);
|
|
||||||
}
|
}
|
||||||
this->time_pair.stop();
|
this->time_pair.stop();
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,13 +15,24 @@
|
||||||
|
|
||||||
#ifdef NV_KERNEL
|
#ifdef NV_KERNEL
|
||||||
#include "lal_aux_fun1.h"
|
#include "lal_aux_fun1.h"
|
||||||
|
|
||||||
#ifndef _DOUBLE_DOUBLE
|
#ifndef _DOUBLE_DOUBLE
|
||||||
texture<float4> pos_tex;
|
texture<float4> pos_tex;
|
||||||
|
texture<float4> sw1_tex;
|
||||||
|
texture<float4> sw2_tex;
|
||||||
|
texture<float4> sw3_tex;
|
||||||
#else
|
#else
|
||||||
texture<int4,1> pos_tex;
|
texture<int4,1> pos_tex;
|
||||||
|
texture<int4> sw1_tex;
|
||||||
|
texture<int4> sw2_tex;
|
||||||
|
texture<int4> sw3_tex;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#define pos_tex x_
|
#define pos_tex x_
|
||||||
|
#define sw1_tex sw1
|
||||||
|
#define sw2_tex sw2
|
||||||
|
#define sw3_tex sw3
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define THIRD (numtyp)0.66666667
|
#define THIRD (numtyp)0.66666667
|
||||||
|
@ -60,15 +71,15 @@ texture<int4,1> pos_tex;
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
if (offset==0) { \
|
if (offset==0) { \
|
||||||
engv+=ii; \
|
int ei=ii; \
|
||||||
if (eflag>0) { \
|
if (eflag>0) { \
|
||||||
*engv+=energy*(acctyp)0.5; \
|
engv[ei]+=energy*(acctyp)0.5; \
|
||||||
engv+=inum; \
|
ei+=inum; \
|
||||||
} \
|
} \
|
||||||
if (vflag>0) { \
|
if (vflag>0) { \
|
||||||
for (int i=0; i<6; i++) { \
|
for (int i=0; i<6; i++) { \
|
||||||
*engv+=virial[i]*(acctyp)0.5; \
|
engv[ei]+=virial[i]*(acctyp)0.5; \
|
||||||
engv+=inum; \
|
ei+=inum; \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
acctyp4 old=ans[ii]; \
|
acctyp4 old=ans[ii]; \
|
||||||
|
@ -97,15 +108,15 @@ texture<int4,1> pos_tex;
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
if (offset==0) { \
|
if (offset==0) { \
|
||||||
engv+=ii; \
|
int ei=ii; \
|
||||||
if (eflag>0) { \
|
if (eflag>0) { \
|
||||||
*engv+=energy*(acctyp)0.5; \
|
engv[ei]+=energy*(acctyp)0.5; \
|
||||||
engv+=inum; \
|
ei+=inum; \
|
||||||
} \
|
} \
|
||||||
if (vflag>0) { \
|
if (vflag>0) { \
|
||||||
for (int i=0; i<6; i++) { \
|
for (int i=0; i<6; i++) { \
|
||||||
*engv+=virial[i]*(acctyp)0.5; \
|
engv[ei]+=virial[i]*(acctyp)0.5; \
|
||||||
engv+=inum; \
|
ei+=inum; \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
acctyp4 old=ans[ii]; \
|
acctyp4 old=ans[ii]; \
|
||||||
|
@ -119,33 +130,19 @@ texture<int4,1> pos_tex;
|
||||||
|
|
||||||
|
|
||||||
__kernel void k_sw(const __global numtyp4 *restrict x_,
|
__kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||||
|
const __global numtyp4 *restrict sw1,
|
||||||
|
const __global numtyp4 *restrict sw2,
|
||||||
|
const __global numtyp4 *restrict sw3,
|
||||||
|
const __global int *restrict map,
|
||||||
|
const __global int *restrict elem2param,
|
||||||
|
const int nelements,
|
||||||
const __global int * dev_nbor,
|
const __global int * dev_nbor,
|
||||||
const __global int * dev_packed,
|
const __global int * dev_packed,
|
||||||
__global acctyp4 *restrict ans,
|
__global acctyp4 *restrict ans,
|
||||||
__global acctyp *restrict engv,
|
__global acctyp *restrict engv,
|
||||||
const int eflag, const int vflag, const int inum,
|
const int eflag, const int vflag, const int inum,
|
||||||
const int nbor_pitch, const int t_per_atom,
|
const int nbor_pitch, const int t_per_atom) {
|
||||||
const numtyp sw_cut, const numtyp sw_epsilon,
|
|
||||||
const numtyp sw_sigma, const numtyp sw_biga,
|
|
||||||
const numtyp sw_bigb, const numtyp sw_powerp,
|
|
||||||
const numtyp sw_powerq, const numtyp sw_cutsq) {
|
|
||||||
|
|
||||||
__local int n_stride;
|
__local int n_stride;
|
||||||
__local numtyp pre_sw_c1, pre_sw_c2, pre_sw_c3, pre_sw_c4;
|
|
||||||
__local numtyp pre_sw_c5, pre_sw_c6;
|
|
||||||
pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
|
|
||||||
pow(sw_sigma,sw_powerp);
|
|
||||||
pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
|
|
||||||
pow(sw_sigma,sw_powerq);
|
|
||||||
pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
|
|
||||||
pow(sw_sigma,sw_powerp+(numtyp)1.0);
|
|
||||||
pre_sw_c4=sw_biga*sw_epsilon*
|
|
||||||
pow(sw_sigma,sw_powerq+(numtyp)1.0);
|
|
||||||
pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
|
|
||||||
pow(sw_sigma,sw_powerp);
|
|
||||||
pre_sw_c6=sw_biga*sw_epsilon*
|
|
||||||
pow(sw_sigma,sw_powerq);
|
|
||||||
|
|
||||||
int tid, ii, offset;
|
int tid, ii, offset;
|
||||||
atom_info(t_per_atom,ii,tid,offset);
|
atom_info(t_per_atom,ii,tid,offset);
|
||||||
|
|
||||||
|
@ -165,8 +162,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||||
n_stride,list_end,nbor);
|
n_stride,list_end,nbor);
|
||||||
|
|
||||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||||
//int iw=ix.w;
|
int itype=ix.w;
|
||||||
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
itype=map[itype];
|
||||||
|
|
||||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||||
|
|
||||||
|
@ -174,7 +171,10 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
//int mtype=itype+jx.w;
|
int jtype=jx.w;
|
||||||
|
jtype=map[jtype];
|
||||||
|
|
||||||
|
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||||
|
|
||||||
// Compute r12
|
// Compute r12
|
||||||
numtyp delx = ix.x-jx.x;
|
numtyp delx = ix.x-jx.x;
|
||||||
|
@ -182,7 +182,31 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||||
numtyp delz = ix.z-jx.z;
|
numtyp delz = ix.z-jx.z;
|
||||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||||
|
|
||||||
if (rsq<sw_cutsq) {
|
if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
|
||||||
|
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||||
|
numtyp sw_epsilon=sw1_ijparam.x;
|
||||||
|
numtyp sw_sigma=sw1_ijparam.y;
|
||||||
|
numtyp4 sw2_ijparam; fetch4(sw2_ijparam,ijparam,sw2_tex);
|
||||||
|
numtyp sw_biga=sw2_ijparam.x;
|
||||||
|
numtyp sw_bigb=sw2_ijparam.y;
|
||||||
|
numtyp sw_powerp=sw2_ijparam.z;
|
||||||
|
numtyp sw_powerq=sw2_ijparam.w;
|
||||||
|
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
|
||||||
|
numtyp sw_cut=sw3_ijparam.x;
|
||||||
|
numtyp sw_cutsq=sw3_ijparam.y;
|
||||||
|
numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
|
||||||
|
pow(sw_sigma,sw_powerp);
|
||||||
|
numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
|
||||||
|
pow(sw_sigma,sw_powerq);
|
||||||
|
numtyp pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
|
||||||
|
pow(sw_sigma,sw_powerp+(numtyp)1.0);
|
||||||
|
numtyp pre_sw_c4=sw_biga*sw_epsilon*
|
||||||
|
pow(sw_sigma,sw_powerq+(numtyp)1.0);
|
||||||
|
numtyp pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
|
||||||
|
pow(sw_sigma,sw_powerp);
|
||||||
|
numtyp pre_sw_c6=sw_biga*sw_epsilon*
|
||||||
|
pow(sw_sigma,sw_powerq);
|
||||||
|
|
||||||
numtyp r=ucl_sqrt(rsq);
|
numtyp r=ucl_sqrt(rsq);
|
||||||
numtyp rp=ucl_powr(r,-sw_powerp);
|
numtyp rp=ucl_powr(r,-sw_powerp);
|
||||||
numtyp rq=ucl_powr(r,-sw_powerq);
|
numtyp rq=ucl_powr(r,-sw_powerq);
|
||||||
|
@ -209,40 +233,41 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||||
virial[5] += dely*delz*force;
|
virial[5] += dely*delz*force;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // for nbor
|
} // for nbor
|
||||||
|
|
||||||
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
|
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
|
||||||
ans,engv);
|
ans,engv);
|
||||||
} // if ii
|
} // if ii
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define threebody(delr1x, delr1y, delr1z, eflag, energy) \
|
#define threebody(delr1x, delr1y, delr1z, eflag, energy) \
|
||||||
{ \
|
{ \
|
||||||
numtyp r1 = ucl_sqrt(rsq1); \
|
numtyp r1 = ucl_sqrt(rsq1); \
|
||||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||||
numtyp rainv1 = ucl_recip(r1 - sw_cut); \
|
numtyp rainv1 = ucl_recip(r1 - sw_cut_ij); \
|
||||||
numtyp gsrainv1 = sw_sigma_gamma * rainv1; \
|
numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1; \
|
||||||
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
|
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
|
||||||
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
|
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
|
||||||
\
|
\
|
||||||
numtyp r2 = ucl_sqrt(rsq2); \
|
numtyp r2 = ucl_sqrt(rsq2); \
|
||||||
numtyp rinvsq2 = ucl_recip(rsq2); \
|
numtyp rinvsq2 = ucl_recip(rsq2); \
|
||||||
numtyp rainv2 = ucl_recip(r2 - sw_cut); \
|
numtyp rainv2 = ucl_recip(r2 - sw_cut_ik); \
|
||||||
numtyp gsrainv2 = sw_sigma_gamma * rainv2; \
|
numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2; \
|
||||||
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
|
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
|
||||||
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
|
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
|
||||||
\
|
\
|
||||||
numtyp rinv12 = ucl_recip(r1*r2); \
|
numtyp rinv12 = ucl_recip(r1*r2); \
|
||||||
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
|
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
|
||||||
numtyp delcs = cs - sw_costheta; \
|
numtyp delcs = cs - sw_costheta_ijk; \
|
||||||
numtyp delcssq = delcs*delcs; \
|
numtyp delcssq = delcs*delcs; \
|
||||||
\
|
\
|
||||||
numtyp facexp = expgsrainv1*expgsrainv2; \
|
numtyp facexp = expgsrainv1*expgsrainv2; \
|
||||||
\
|
\
|
||||||
numtyp facrad = sw_lambda_epsilon * facexp*delcssq; \
|
numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \
|
||||||
numtyp frad1 = facrad*gsrainvsq1; \
|
numtyp frad1 = facrad*gsrainvsq1; \
|
||||||
numtyp frad2 = facrad*gsrainvsq2; \
|
numtyp frad2 = facrad*gsrainvsq2; \
|
||||||
numtyp facang = sw_lambda_epsilon2 * facexp*delcs; \
|
numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \
|
||||||
numtyp facang12 = rinv12*facang; \
|
numtyp facang12 = rinv12*facang; \
|
||||||
numtyp csfacang = cs*facang; \
|
numtyp csfacang = cs*facang; \
|
||||||
numtyp csfac1 = rinvsq1*csfacang; \
|
numtyp csfac1 = rinvsq1*csfacang; \
|
||||||
|
@ -273,26 +298,26 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||||
{ \
|
{ \
|
||||||
numtyp r1 = ucl_sqrt(rsq1); \
|
numtyp r1 = ucl_sqrt(rsq1); \
|
||||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||||
numtyp rainv1 = ucl_recip(r1 - sw_cut); \
|
numtyp rainv1 = ucl_recip(r1 - sw_cut_ij); \
|
||||||
numtyp gsrainv1 = sw_sigma_gamma * rainv1; \
|
numtyp gsrainv1 = sw_sigma_gamma_ij * rainv1; \
|
||||||
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
|
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
|
||||||
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
|
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
|
||||||
\
|
\
|
||||||
numtyp r2 = ucl_sqrt(rsq2); \
|
numtyp r2 = ucl_sqrt(rsq2); \
|
||||||
numtyp rainv2 = ucl_recip(r2 - sw_cut); \
|
numtyp rainv2 = ucl_recip(r2 - sw_cut_ik); \
|
||||||
numtyp gsrainv2 = sw_sigma_gamma * rainv2; \
|
numtyp gsrainv2 = sw_sigma_gamma_ik * rainv2; \
|
||||||
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
|
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
|
||||||
\
|
\
|
||||||
numtyp rinv12 = ucl_recip(r1*r2); \
|
numtyp rinv12 = ucl_recip(r1*r2); \
|
||||||
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
|
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
|
||||||
numtyp delcs = cs - sw_costheta; \
|
numtyp delcs = cs - sw_costheta_ijk; \
|
||||||
numtyp delcssq = delcs*delcs; \
|
numtyp delcssq = delcs*delcs; \
|
||||||
\
|
\
|
||||||
numtyp facexp = expgsrainv1*expgsrainv2; \
|
numtyp facexp = expgsrainv1*expgsrainv2; \
|
||||||
\
|
\
|
||||||
numtyp facrad = sw_lambda_epsilon * facexp*delcssq; \
|
numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \
|
||||||
numtyp frad1 = facrad*gsrainvsq1; \
|
numtyp frad1 = facrad*gsrainvsq1; \
|
||||||
numtyp facang = sw_lambda_epsilon2 * facexp*delcs; \
|
numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \
|
||||||
numtyp facang12 = rinv12*facang; \
|
numtyp facang12 = rinv12*facang; \
|
||||||
numtyp csfacang = cs*facang; \
|
numtyp csfacang = cs*facang; \
|
||||||
numtyp csfac1 = rinvsq1*csfacang; \
|
numtyp csfac1 = rinvsq1*csfacang; \
|
||||||
|
@ -303,24 +328,24 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
__kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||||
|
const __global numtyp4 *restrict sw1,
|
||||||
|
const __global numtyp4 *restrict sw2,
|
||||||
|
const __global numtyp4 *restrict sw3,
|
||||||
|
const __global int *restrict map,
|
||||||
|
const __global int *restrict elem2param,
|
||||||
|
const int nelements,
|
||||||
const __global int * dev_nbor,
|
const __global int * dev_nbor,
|
||||||
const __global int * dev_packed,
|
const __global int * dev_packed,
|
||||||
__global acctyp4 *restrict ans,
|
__global acctyp4 *restrict ans,
|
||||||
__global acctyp *restrict engv,
|
__global acctyp *restrict engv,
|
||||||
const int eflag, const int vflag,
|
const int eflag, const int vflag,
|
||||||
const int inum, const int nbor_pitch,
|
const int inum, const int nbor_pitch,
|
||||||
const int t_per_atom, const int evatom,
|
const int t_per_atom, const int evatom) {
|
||||||
const numtyp sw_cut, const numtyp sw_epsilon,
|
|
||||||
const numtyp sw_sigma, const numtyp sw_lambda,
|
|
||||||
const numtyp sw_gamma, const numtyp sw_costheta,
|
|
||||||
const numtyp sw_cutsq) {
|
|
||||||
__local int tpa_sq, n_stride;
|
__local int tpa_sq, n_stride;
|
||||||
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
|
|
||||||
__local numtyp sw_lambda_epsilon2;
|
|
||||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||||
sw_sigma_gamma=sw_sigma*sw_gamma;
|
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
|
||||||
sw_lambda_epsilon=sw_lambda*sw_epsilon;
|
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
|
||||||
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
|
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
|
||||||
|
|
||||||
int tid, ii, offset;
|
int tid, ii, offset;
|
||||||
atom_info(tpa_sq,ii,tid,offset);
|
atom_info(tpa_sq,ii,tid,offset);
|
||||||
|
@ -344,8 +369,8 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||||
int offset_k=tid & (t_per_atom-1);
|
int offset_k=tid & (t_per_atom-1);
|
||||||
|
|
||||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||||
//int iw=ix.w;
|
int itype=ix.w;
|
||||||
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
itype=map[itype];
|
||||||
|
|
||||||
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
|
@ -353,7 +378,8 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
//int mtype=itype+jx.w;
|
int jtype=jx.w;
|
||||||
|
jtype=map[jtype];
|
||||||
|
|
||||||
// Compute r12
|
// Compute r12
|
||||||
numtyp delr1x = jx.x-ix.x;
|
numtyp delr1x = jx.x-ix.x;
|
||||||
|
@ -361,7 +387,16 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||||
numtyp delr1z = jx.z-ix.z;
|
numtyp delr1z = jx.z-ix.z;
|
||||||
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
|
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
|
||||||
|
|
||||||
if (rsq1 > sw_cutsq) continue;
|
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||||
|
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
|
||||||
|
|
||||||
|
if (rsq1 > sw3_ijparam.y) continue;
|
||||||
|
|
||||||
|
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||||
|
sw_sigma=sw1_ijparam.y;
|
||||||
|
sw_gamma=sw1_ijparam.w;
|
||||||
|
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||||
|
sw_cut_ij=sw3_ijparam.x;
|
||||||
|
|
||||||
const __global int *nbor_k=nbor_j-offset_j+offset_k;
|
const __global int *nbor_k=nbor_j-offset_j+offset_k;
|
||||||
if (nbor_k<=nbor_j)
|
if (nbor_k<=nbor_j)
|
||||||
|
@ -372,11 +407,31 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 kx; fetch4(kx,k,pos_tex);
|
numtyp4 kx; fetch4(kx,k,pos_tex);
|
||||||
|
int ktype=kx.w;
|
||||||
|
ktype=map[ktype];
|
||||||
|
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
|
||||||
|
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
|
||||||
|
|
||||||
numtyp delr2x = kx.x-ix.x;
|
numtyp delr2x = kx.x-ix.x;
|
||||||
numtyp delr2y = kx.y-ix.y;
|
numtyp delr2y = kx.y-ix.y;
|
||||||
numtyp delr2z = kx.z-ix.z;
|
numtyp delr2z = kx.z-ix.z;
|
||||||
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
|
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
|
||||||
if (rsq2 < sw_cutsq) {
|
if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
|
||||||
|
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
|
||||||
|
sw_sigma=sw1_ikparam.y;
|
||||||
|
sw_gamma=sw1_ikparam.w;
|
||||||
|
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
|
||||||
|
sw_cut_ik=sw3_ikparam.x;
|
||||||
|
|
||||||
|
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
|
||||||
|
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
|
||||||
|
sw_epsilon=sw1_ijkparam.x;
|
||||||
|
sw_lambda=sw1_ijkparam.z;
|
||||||
|
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
|
||||||
|
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
|
||||||
|
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
|
||||||
|
sw_costheta_ijk=sw3_ijkparam.z;
|
||||||
|
|
||||||
numtyp fjx, fjy, fjz, fkx, fky, fkz;
|
numtyp fjx, fjy, fjz, fkx, fky, fkz;
|
||||||
threebody(delr1x,delr1y,delr1z,eflag,energy);
|
threebody(delr1x,delr1y,delr1z,eflag,energy);
|
||||||
|
|
||||||
|
@ -403,23 +458,24 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
__kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||||
|
const __global numtyp4 *restrict sw1,
|
||||||
|
const __global numtyp4 *restrict sw2,
|
||||||
|
const __global numtyp4 *restrict sw3,
|
||||||
|
const __global int *restrict map,
|
||||||
|
const __global int *restrict elem2param,
|
||||||
|
const int nelements,
|
||||||
const __global int * dev_nbor,
|
const __global int * dev_nbor,
|
||||||
const __global int * dev_packed,
|
const __global int * dev_packed,
|
||||||
__global acctyp4 *restrict ans,
|
__global acctyp4 *restrict ans,
|
||||||
__global acctyp *restrict engv,
|
__global acctyp *restrict engv,
|
||||||
const int eflag, const int vflag,
|
const int eflag, const int vflag,
|
||||||
const int inum, const int nbor_pitch,
|
const int inum, const int nbor_pitch,
|
||||||
const int t_per_atom, const numtyp sw_cut,
|
const int t_per_atom) {
|
||||||
const numtyp sw_epsilon, const numtyp sw_sigma,
|
|
||||||
const numtyp sw_lambda, const numtyp sw_gamma,
|
|
||||||
const numtyp sw_costheta, const numtyp sw_cutsq) {
|
|
||||||
__local int tpa_sq, n_stride;
|
__local int tpa_sq, n_stride;
|
||||||
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
|
|
||||||
__local numtyp sw_lambda_epsilon2;
|
|
||||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||||
sw_sigma_gamma=sw_sigma*sw_gamma;
|
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
|
||||||
sw_lambda_epsilon=sw_lambda*sw_epsilon;
|
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
|
||||||
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
|
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
|
||||||
|
|
||||||
int tid, ii, offset;
|
int tid, ii, offset;
|
||||||
atom_info(tpa_sq,ii,tid,offset);
|
atom_info(tpa_sq,ii,tid,offset);
|
||||||
|
@ -443,15 +499,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||||
int offset_k=tid & (t_per_atom-1);
|
int offset_k=tid & (t_per_atom-1);
|
||||||
|
|
||||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||||
//int iw=ix.w;
|
int itype=ix.w;
|
||||||
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
itype=map[itype];
|
||||||
|
|
||||||
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
|
||||||
int j=*nbor_j;
|
int j=*nbor_j;
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
//int mtype=itype+jx.w;
|
int jtype=jx.w;
|
||||||
|
jtype=map[jtype];
|
||||||
|
|
||||||
// Compute r12
|
// Compute r12
|
||||||
numtyp delr1x = ix.x-jx.x;
|
numtyp delr1x = ix.x-jx.x;
|
||||||
|
@ -459,7 +516,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||||
numtyp delr1z = ix.z-jx.z;
|
numtyp delr1z = ix.z-jx.z;
|
||||||
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
|
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
|
||||||
|
|
||||||
if (rsq1 > sw_cutsq) continue;
|
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||||
|
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
|
||||||
|
|
||||||
|
if (rsq1 > sw3_ijparam.y) continue;
|
||||||
|
|
||||||
|
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||||
|
sw_sigma=sw1_ijparam.y;
|
||||||
|
sw_gamma=sw1_ijparam.w;
|
||||||
|
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||||
|
sw_cut_ij=sw3_ijparam.x;
|
||||||
|
|
||||||
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
|
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
|
||||||
int numk=*nbor_k;
|
int numk=*nbor_k;
|
||||||
|
@ -478,15 +544,35 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||||
int k=*nbor_k;
|
int k=*nbor_k;
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i)
|
if (k == i) continue;
|
||||||
continue;
|
|
||||||
|
|
||||||
numtyp4 kx; fetch4(kx,k,pos_tex);
|
numtyp4 kx; fetch4(kx,k,pos_tex);
|
||||||
|
int ktype=kx.w;
|
||||||
|
ktype=map[ktype];
|
||||||
|
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
|
||||||
|
|
||||||
numtyp delr2x = kx.x - jx.x;
|
numtyp delr2x = kx.x - jx.x;
|
||||||
numtyp delr2y = kx.y - jx.y;
|
numtyp delr2y = kx.y - jx.y;
|
||||||
numtyp delr2z = kx.z - jx.z;
|
numtyp delr2z = kx.z - jx.z;
|
||||||
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
|
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
|
||||||
if (rsq2 < sw_cutsq) {
|
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
|
||||||
|
|
||||||
|
if (rsq2 < sw3_ikparam.y) {
|
||||||
|
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
|
||||||
|
sw_sigma=sw1_ikparam.y;
|
||||||
|
sw_gamma=sw1_ikparam.w;
|
||||||
|
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
|
||||||
|
sw_cut_ik=sw3_ikparam.x;
|
||||||
|
|
||||||
|
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
|
||||||
|
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
|
||||||
|
sw_epsilon=sw1_ijkparam.x;
|
||||||
|
sw_lambda=sw1_ijkparam.z;
|
||||||
|
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
|
||||||
|
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
|
||||||
|
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
|
||||||
|
sw_costheta_ijk=sw3_ijkparam.z;
|
||||||
|
|
||||||
numtyp fjx, fjy, fjz;
|
numtyp fjx, fjy, fjz;
|
||||||
//if (evatom==0) {
|
//if (evatom==0) {
|
||||||
threebody_half(delr1x,delr1y,delr1z);
|
threebody_half(delr1x,delr1y,delr1z);
|
||||||
|
@ -513,23 +599,24 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
__kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||||
|
const __global numtyp4 *restrict sw1,
|
||||||
|
const __global numtyp4 *restrict sw2,
|
||||||
|
const __global numtyp4 *restrict sw3,
|
||||||
|
const __global int *restrict map,
|
||||||
|
const __global int *restrict elem2param,
|
||||||
|
const int nelements,
|
||||||
const __global int * dev_nbor,
|
const __global int * dev_nbor,
|
||||||
const __global int * dev_packed,
|
const __global int * dev_packed,
|
||||||
__global acctyp4 *restrict ans,
|
__global acctyp4 *restrict ans,
|
||||||
__global acctyp *restrict engv,
|
__global acctyp *restrict engv,
|
||||||
const int eflag, const int vflag,
|
const int eflag, const int vflag,
|
||||||
const int inum, const int nbor_pitch,
|
const int inum, const int nbor_pitch,
|
||||||
const int t_per_atom, const numtyp sw_cut,
|
const int t_per_atom) {
|
||||||
const numtyp sw_epsilon, const numtyp sw_sigma,
|
|
||||||
const numtyp sw_lambda, const numtyp sw_gamma,
|
|
||||||
const numtyp sw_costheta, const numtyp sw_cutsq) {
|
|
||||||
__local int tpa_sq, n_stride;
|
__local int tpa_sq, n_stride;
|
||||||
__local numtyp sw_sigma_gamma, sw_lambda_epsilon;
|
|
||||||
__local numtyp sw_lambda_epsilon2;
|
|
||||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||||
sw_sigma_gamma=sw_sigma*sw_gamma;
|
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
|
||||||
sw_lambda_epsilon=sw_lambda*sw_epsilon;
|
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
|
||||||
sw_lambda_epsilon2=(numtyp)2.0*sw_lambda_epsilon;
|
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
|
||||||
|
|
||||||
int tid, ii, offset;
|
int tid, ii, offset;
|
||||||
atom_info(tpa_sq,ii,tid,offset);
|
atom_info(tpa_sq,ii,tid,offset);
|
||||||
|
@ -553,15 +640,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||||
int offset_k=tid & (t_per_atom-1);
|
int offset_k=tid & (t_per_atom-1);
|
||||||
|
|
||||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||||
//int iw=ix.w;
|
int itype=ix.w;
|
||||||
//int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
itype=map[itype];
|
||||||
|
|
||||||
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<list_end; nbor_j+=n_stride) {
|
||||||
int j=*nbor_j;
|
int j=*nbor_j;
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
//int mtype=itype+jx.w;
|
int jtype=jx.w;
|
||||||
|
jtype=map[jtype];
|
||||||
|
|
||||||
// Compute r12
|
// Compute r12
|
||||||
numtyp delr1x = ix.x-jx.x;
|
numtyp delr1x = ix.x-jx.x;
|
||||||
|
@ -569,7 +657,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||||
numtyp delr1z = ix.z-jx.z;
|
numtyp delr1z = ix.z-jx.z;
|
||||||
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
|
numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;
|
||||||
|
|
||||||
if (rsq1 > sw_cutsq) continue;
|
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||||
|
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
|
||||||
|
|
||||||
|
if (rsq1 > sw3_ijparam.y) continue;
|
||||||
|
|
||||||
|
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||||
|
sw_sigma=sw1_ijparam.y;
|
||||||
|
sw_gamma=sw1_ijparam.w;
|
||||||
|
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||||
|
sw_cut_ij=sw3_ijparam.x;
|
||||||
|
|
||||||
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
|
const __global int *nbor_k=dev_nbor+j+nbor_pitch;
|
||||||
int numk=*nbor_k;
|
int numk=*nbor_k;
|
||||||
|
@ -588,15 +685,35 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||||
int k=*nbor_k;
|
int k=*nbor_k;
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i)
|
if (k == i) continue;
|
||||||
continue;
|
|
||||||
|
|
||||||
numtyp4 kx; fetch4(kx,k,pos_tex);
|
numtyp4 kx; fetch4(kx,k,pos_tex);
|
||||||
|
int ktype=kx.w;
|
||||||
|
ktype=map[ktype];
|
||||||
|
int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
|
||||||
|
numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
|
||||||
|
|
||||||
numtyp delr2x = kx.x - jx.x;
|
numtyp delr2x = kx.x - jx.x;
|
||||||
numtyp delr2y = kx.y - jx.y;
|
numtyp delr2y = kx.y - jx.y;
|
||||||
numtyp delr2z = kx.z - jx.z;
|
numtyp delr2z = kx.z - jx.z;
|
||||||
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
|
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
|
||||||
if (rsq2 < sw_cutsq) {
|
|
||||||
|
if (rsq2 < sw3_ikparam.y) {
|
||||||
|
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
|
||||||
|
sw_sigma=sw1_ikparam.y;
|
||||||
|
sw_gamma=sw1_ikparam.w;
|
||||||
|
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
|
||||||
|
sw_cut_ik=sw3_ikparam.x;
|
||||||
|
|
||||||
|
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
|
||||||
|
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
|
||||||
|
sw_epsilon=sw1_ijkparam.x;
|
||||||
|
sw_lambda=sw1_ijkparam.z;
|
||||||
|
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
|
||||||
|
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
|
||||||
|
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
|
||||||
|
sw_costheta_ijk=sw3_ijkparam.z;
|
||||||
|
|
||||||
numtyp fjx, fjy, fjz, fkx, fky, fkz;
|
numtyp fjx, fjy, fjz, fkx, fky, fkz;
|
||||||
threebody(delr1x,delr1y,delr1z,eflag,energy);
|
threebody(delr1x,delr1y,delr1z,eflag,energy);
|
||||||
|
|
||||||
|
|
|
@ -37,13 +37,14 @@ class SW : public BaseThree<numtyp, acctyp> {
|
||||||
* - -3 if there is an out of memory error
|
* - -3 if there is an out of memory error
|
||||||
* - -4 if the GPU library was not compiled for GPU
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
* - -5 Double precision is not supported on card **/
|
* - -5 Double precision is not supported on card **/
|
||||||
int init(const int nlocal, const int nall, const int max_nbors,
|
int init(const int ntypes, const int nlocal, const int nall, const int max_nbors,
|
||||||
const double cell_size, const double gpu_split, FILE *screen,
|
const double cell_size, const double gpu_split, FILE *screen,
|
||||||
const double epsilon, const double sigma,
|
int* host_map, const int nelements, int*** host_elem2param, const int nparams,
|
||||||
const double lambda, const double gamma,
|
const double* epsilon, const double* sigma,
|
||||||
const double costheta, const double biga,
|
const double* lambda, const double* gamma,
|
||||||
const double bigb, const double powerp,
|
const double* costheta, const double* biga,
|
||||||
const double powerq, const double cut, const double cutsq);
|
const double* bigb, const double* powerp,
|
||||||
|
const double* powerq, const double* cut, const double* cutsq);
|
||||||
|
|
||||||
/// Clear all host and device data
|
/// Clear all host and device data
|
||||||
/** \note This is called at the beginning of the init() routine **/
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
@ -60,11 +61,26 @@ class SW : public BaseThree<numtyp, acctyp> {
|
||||||
/// If atom type constants fit in shared memory, use fast kernels
|
/// If atom type constants fit in shared memory, use fast kernels
|
||||||
bool shared_types;
|
bool shared_types;
|
||||||
|
|
||||||
|
/// Number of atom types
|
||||||
|
int _lj_types;
|
||||||
|
|
||||||
|
/// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma
|
||||||
|
UCL_D_Vec<numtyp4> sw1;
|
||||||
|
/// sw2.x = biga, sw2.y = bigb, sw2.z = powerp, sw2.w = powerq
|
||||||
|
UCL_D_Vec<numtyp4> sw2;
|
||||||
|
/// sw3.x = cut, sw3.y = cutsq, sw3.z = costheta
|
||||||
|
UCL_D_Vec<numtyp4> sw3;
|
||||||
|
|
||||||
|
UCL_D_Vec<int> elem2param;
|
||||||
|
UCL_D_Vec<int> map;
|
||||||
|
int _nparams,_nelements;
|
||||||
|
|
||||||
|
UCL_Texture sw1_tex, sw2_tex, sw3_tex;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool _allocated;
|
bool _allocated;
|
||||||
void loop(const bool _eflag, const bool _vflag, const int evatom);
|
void loop(const bool _eflag, const bool _vflag, const int evatom);
|
||||||
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta;
|
|
||||||
numtyp sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,14 +27,15 @@ static SW<PRECISION,ACC_PRECISION> SWMF;
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Allocate memory on host and device and copy constants to device
|
// Allocate memory on host and device and copy constants to device
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
int sw_gpu_init(const int inum, const int nall, const int max_nbors,
|
int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors,
|
||||||
const double cell_size, int &gpu_mode, FILE *screen,
|
const double cell_size, int &gpu_mode, FILE *screen,
|
||||||
const double sw_epsilon, const double sw_sigma,
|
int* host_map, const int nelements, int*** host_elem2param, const int nparams,
|
||||||
const double sw_lambda, const double sw_gamma,
|
const double* sw_epsilon, const double* sw_sigma,
|
||||||
const double sw_costheta, const double sw_biga,
|
const double* sw_lambda, const double* sw_gamma,
|
||||||
const double sw_bigb, const double sw_powerp,
|
const double* sw_costheta, const double* sw_biga,
|
||||||
const double sw_powerq, const double sw_cut,
|
const double* sw_bigb, const double* sw_powerp,
|
||||||
const double sw_cutsq) {
|
const double* sw_powerq, const double* sw_cut,
|
||||||
|
const double* sw_cutsq) {
|
||||||
SWMF.clear();
|
SWMF.clear();
|
||||||
gpu_mode=SWMF.device->gpu_mode();
|
gpu_mode=SWMF.device->gpu_mode();
|
||||||
double gpu_split=SWMF.device->particle_split();
|
double gpu_split=SWMF.device->particle_split();
|
||||||
|
@ -55,13 +56,14 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
|
||||||
message=true;
|
message=true;
|
||||||
|
|
||||||
if (message) {
|
if (message) {
|
||||||
fprintf(screen,"Initializing GPU and compiling on process 0...");
|
fprintf(screen,"Initializing Device and compiling on process 0...");
|
||||||
fflush(screen);
|
fflush(screen);
|
||||||
}
|
}
|
||||||
|
|
||||||
int init_ok=0;
|
int init_ok=0;
|
||||||
if (world_me==0)
|
if (world_me==0)
|
||||||
init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
|
init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
|
||||||
|
host_map, nelements, host_elem2param, nparams,
|
||||||
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
|
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
|
||||||
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq);
|
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq);
|
||||||
|
|
||||||
|
@ -72,14 +74,15 @@ int sw_gpu_init(const int inum, const int nall, const int max_nbors,
|
||||||
for (int i=0; i<procs_per_gpu; i++) {
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
if (message) {
|
if (message) {
|
||||||
if (last_gpu-first_gpu==0)
|
if (last_gpu-first_gpu==0)
|
||||||
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
|
||||||
else
|
else
|
||||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
|
||||||
last_gpu,i);
|
last_gpu,i);
|
||||||
fflush(screen);
|
fflush(screen);
|
||||||
}
|
}
|
||||||
if (gpu_rank==i && world_me!=0)
|
if (gpu_rank==i && world_me!=0)
|
||||||
init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
|
init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen,
|
||||||
|
host_map, nelements, host_elem2param, nparams,
|
||||||
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
|
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
|
||||||
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut,
|
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut,
|
||||||
sw_cutsq);
|
sw_cutsq);
|
||||||
|
|
Loading…
Reference in New Issue