Separate the computation of hneigh into another kernel

Simplify the main GPU kernel and add another kernel 'k_pair_reneigh'. It works good on GTX1070 (Pascal), but still there is a problem with non-deterministic results on Volta.

I reimplement BaseCharge::compute methods in the child class LJ_TIP4PLong to correctly embed a new kernel in the code.

Also commit includes some codestyle fixes.
This commit is contained in:
Vsevak 2019-12-06 21:41:02 +03:00
parent 66a076b819
commit a2f9fa8e78
5 changed files with 252 additions and 105 deletions

View File

@ -24,26 +24,26 @@ const char *lj_tip4p=0;
#include "lal_lj_tip4p_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJ_TIP4PLong_T LJ_TIP4PLong<numtyp, acctyp>
#define LJTIP4PLongT LJ_TIP4PLong<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJ_TIP4PLong<numtyp, acctyp>::LJ_TIP4PLong(): BaseCharge<numtyp,acctyp>(), _allocated(false) {
LJTIP4PLongT::LJ_TIP4PLong(): BaseCharge<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJ_TIP4PLong<numtyp, acctyp>::~LJ_TIP4PLong() {
LJTIP4PLongT::~LJ_TIP4PLong() {
clear();
}
template <class numtyp, class acctyp>
int LJ_TIP4PLong<numtyp, acctyp>::bytes_per_atom(const int max_nbors) const {
int LJTIP4PLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJ_TIP4PLong<numtyp, acctyp>::init(const int ntypes,
int LJTIP4PLongT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
@ -65,6 +65,7 @@ int LJ_TIP4PLong<numtyp, acctyp>::init(const int ntypes,
if (success!=0)
return success;
k_pair_distrib.set_function(*this->pair_program,"k_lj_tip4p_long_distrib");
k_pair_reneigh.set_function(*this->pair_program,"k_lj_tip4p_reneigh");
TypeH = tH;
TypeO = tO;
@ -143,7 +144,7 @@ int LJ_TIP4PLong<numtyp, acctyp>::init(const int ntypes,
template <class numtyp, class acctyp>
void LJ_TIP4PLong<numtyp, acctyp>::clear() {
void LJTIP4PLongT::clear() {
if (!_allocated)
return;
_allocated=false;
@ -161,12 +162,13 @@ void LJ_TIP4PLong<numtyp, acctyp>::clear() {
//force_comp.clear();
k_pair_distrib.clear();
k_pair_reneigh.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJ_TIP4PLong<numtyp, acctyp>::host_memory_usage() const {
double LJTIP4PLongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJ_TIP4PLong<numtyp,acctyp>);
}
@ -174,7 +176,7 @@ double LJ_TIP4PLong<numtyp, acctyp>::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJ_TIP4PLong<numtyp, acctyp>::loop(const bool _eflag, const bool _vflag) {
void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
@ -188,13 +190,24 @@ void LJ_TIP4PLong<numtyp, acctyp>::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
const int nall = this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
int GX;
if (t_ago == 0) {
GX=static_cast<int>(ceil(static_cast<double>(nall)/BX));
this->k_pair_reneigh.set_size(GX,BX);
this->k_pair_reneigh.run(&this->atom->x,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&nall, &ainum,&nbor_pitch, &this->_threads_per_atom,
&hneight, &m, &TypeO, &TypeH,
&tag, &map_array, &atom_sametag);
}
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->k_pair.set_size(GX,BX);
if (vflag){
this->ansO.resize_ib(ainum*3);
@ -213,17 +226,16 @@ void LJ_TIP4PLong<numtyp, acctyp>::loop(const bool _eflag, const bool _vflag) {
&atom_sametag, &this->ansO);
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
this->k_pair_distrib.set_size(GX,BX);
this->k_pair_distrib.run(&this->atom->x, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&hneight, &m, &TypeO, &TypeH, &alpha,
&this->atom->q, &this->ansO);
this->k_pair_distrib.run(&this->atom->x, &this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom,
&hneight, &m, &TypeO, &TypeH, &alpha,&this->atom->q, &this->ansO);
this->time_pair.stop();
}
template <class numtyp, class acctyp>
void LJ_TIP4PLong<numtyp, acctyp>::copy_relations_data(int **hn, double **newsite, int n,
int* tag, int *map_array, int map_size, int *sametag, int max_same, int ago){
void LJTIP4PLongT::copy_relations_data(int n, int* tag, int *map_array,
int map_size, int *sametag, int max_same, int ago){
int nall = n;
const int hn_sz = n*4; // matrix size = col size * col number
hneight.resize_ib(hn_sz);
@ -250,4 +262,118 @@ void LJ_TIP4PLong<numtyp, acctyp>::copy_relations_data(int **hn, double **newsit
host_tag_write.clear();
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJTIP4PLongT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
this->atom->cast_x_data(host_x,host_type);
this->atom->cast_q_data(host_q);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
this->atom->add_q_data();
this->device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);
t_ago = ago;
loop(eflag,vflag);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** LJTIP4PLongT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int *map_array, int map_size, int *sametag, int max_same,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double *boxlo, double *prd) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->atom->cast_q_data(host_q);
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->atom->cast_q_data(host_q);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
this->atom->add_q_data();
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
copy_relations_data(nall, tag, map_array, map_size, sametag, max_same, ago);
this->device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
t_ago = ago;
loop(eflag,vflag);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
template class LJ_TIP4PLong<PRECISION,ACC_PRECISION>;

View File

@ -158,6 +158,62 @@ __kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_,
} // if ii
}
__kernel void k_lj_tip4p_reneigh(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const int nall, const int inum,
const int nbor_pitch, const int t_per_atom,
__global int *restrict hneigh,
__global numtyp4 *restrict m,
const int typeO, const int typeH,
const __global int *restrict tag, const __global int *restrict map,
const __global int *restrict sametag) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
int i = BLOCK_ID_X*(BLOCK_SIZE_X)+tid;
if (i<nall) {
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iH1, iH2, iO;
int itype = ix.w;
if(itype == typeO) {
iO = i;
if (hneigh[i*4+2] != -1) {
iH1 = atom_mapping(map, tag[i] + 1);
iH2 = atom_mapping(map, tag[i] + 2);
// set iH1,iH2 to closest image to O
iH1 = closest_image(i, iH1, sametag, x_);
iH2 = closest_image(i, iH2, sametag, x_);
hneigh[i*4 ] = iH1;
hneigh[i*4+1] = iH2;
hneigh[i*4+2] = -1;
}
} else {
if (hneigh[i*4+2] != -1) {
int iI, iH;
iI = atom_mapping(map,tag[i] - 1);
numtyp4 iIx; fetch4(iIx,iI,pos_tex); //x_[iI];
if ((int)iIx.w == typeH) {
iO = atom_mapping(map,tag[i] - 2);
iO = closest_image(i, iO, sametag, x_);
iH1 = closest_image(i, iI, sametag, x_);
iH2 = i;
} else { //if ((int)iIx.w == typeO)
iH = atom_mapping(map, tag[i] + 1);
iO = closest_image(i,iI,sametag, x_);
iH1 = i;
iH2 = closest_image(i,iH,sametag, x_);
}
hneigh[i*4+0] = iO;
hneigh[i*4+1] += -1;
hneigh[i*4+2] = -1;
}
}
}
}
__kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
@ -211,63 +267,17 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
if(itype == typeO) {
iO = i;
if (hneigh[i*4+2] != -1) {
iH1 = atom_mapping(map, tag[i] + 1);
iH2 = atom_mapping(map, tag[i] + 2);
// set iH1,iH2 to closest image to O
iH1 = closest_image(i, iH1, sametag, x_);
iH2 = closest_image(i, iH2, sametag, x_);
hneigh[ i*4 ] = iH1;
hneigh[ i*4+1] = iH2;
hneigh[ i*4+2] = -1;
hneigh[iH1*4 ] = i;
hneigh[iH1*4+1] += -1;
hneigh[iH1*4+2] = -1;
hneigh[iH2*4 ] = i;
hneigh[iH2*4+1] += -1;
hneigh[iH2*4+2] = -1;
} else {
iH1 = hneigh[i*4 ];
iH2 = hneigh[i*4+1];
}
if(fabs(m[iO].w) <= eq_zero) {
compute_newsite(iO,iH1,iH2, &m[iO], alpha, x_);
m[iO].w = qtmp;
}
x1 = m[iO];
} else {
if (hneigh[i*4+2] != -1) {
int iI, iH;
iI = atom_mapping(map,tag[i] - 1);
numtyp4 iIx; fetch4(iIx,iI,pos_tex); //x_[iI];
if ((int)iIx.w == typeH) {
iO = atom_mapping(map,tag[i] - 2);
iO = closest_image(i, iO, sametag, x_);
iH1 = closest_image(i, iI, sametag, x_);
iH2 = i;
} else { //if ((int)iIx.w == typeO)
iH = atom_mapping(map, tag[i] + 1);
iO = closest_image(i,iI,sametag, x_);
iH1 = i;
iH2 = closest_image(i,iH,sametag, x_);
}
hneigh[iH1*4+0] = iO;
hneigh[iH1*4+1] += -1;
hneigh[iH1*4+2] = -1;
hneigh[iH2*4+0] = iO;
hneigh[iH2*4+1] += -1;
hneigh[iH2*4+2] = -1;
hneigh[iO*4+0] = iH1;
hneigh[iO*4+1] = iH2;
hneigh[iO*4+2] = -1;
} else {
iO = hneigh[i *4 ];
iH1 = hneigh[iO*4 ];
iH2 = hneigh[iO*4+1];
}
if (iO >= inum) {
non_local_oxy = 1;
if(fabs(m[iO].w) <= eq_zero) {
@ -327,25 +337,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
if(itype == typeO || jtype == typeO) {
if (jtype == typeO) {
jO = j;
if (hneigh[j*4+2] != -1) {
jH1 = atom_mapping(map,tag[j] + 1);
jH2 = atom_mapping(map,tag[j] + 2);
// set iH1,iH2 to closest image to O
jH1 = closest_image(j, jH1, sametag, x_);
jH2 = closest_image(j, jH2, sametag, x_);
hneigh[j*4 ] = jH1;
hneigh[j*4+1] = jH2;
hneigh[j*4+2] = -1;
hneigh[jH1*4 ] = j;
hneigh[jH1*4+1] += -1;
hneigh[jH1*4+2] = -1;
hneigh[jH2*4 ] = j;
hneigh[jH2*4+1] += -1;
hneigh[jH2*4+2] = -1;
} else {
jH1 = hneigh[j*4 ];
jH2 = hneigh[j*4+1];
}
if (fabs(m[j].w) <= eq_zero) {
compute_newsite(j, jH1, jH2, &m[j], alpha, x_);
m[j].w = qj;
@ -405,7 +398,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH;
vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH;
vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH;
vdj.w = vdj.w;
//vdj.w = vdj.w;
virial[0] += (ix.x - vdj.x)*fd.x;
virial[1] += (ix.y - vdj.y)*fd.y;
virial[2] += (ix.z - vdj.z)*fd.z;
@ -422,7 +415,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
vdi.x = xO.x*cO + xH1.x*cH + xH2.x*cH;
vdi.y = xO.y*cO + xH1.y*cH + xH2.y*cH;
vdi.z = xO.z*cO + xH1.z*cH + xH2.z*cH;
vdi.w = vdi.w;
//vdi.w = vdi.w;
if (jtype != typeH){
numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex);
@ -430,7 +423,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH;
vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH;
vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH;
vdj.w = vdj.w;
//vdj.w = vdj.w;
} else vdj = jx;
vO[0] += 0.5*(vdi.x - vdj.x)*fd.x;
vO[1] += 0.5*(vdi.y - vdj.y)*fd.y;

View File

@ -62,10 +62,29 @@ public:
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Copy data which is easier to compute in LAMMPS_NS
void copy_relations_data(int **hn, double **m, int n,int* tag, int *map_array, int map_size,
/// Copy data from LAMMPS_NS
void copy_relations_data(int n,int* tag, int *map_array, int map_size,
int *sametag, int max_same, int ago);
/// Reimplement BaseCharge pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge,
const int nlocal, double *boxlo, double *prd);
/// Reimplement BaseCharge pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag,int *map_array, int map_size, int *sametag, int max_same,
int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd);
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw
@ -77,7 +96,6 @@ public:
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
@ -98,10 +116,11 @@ public:
UCL_D_Vec<int> map_array;
UCL_D_Vec<int> atom_sametag;
UCL_Kernel k_pair_distrib;
UCL_Kernel k_pair_distrib, k_pair_reneigh;
private:
bool _allocated;
int t_ago;
void loop(const bool _eflag, const bool _vflag);
};

View File

@ -112,14 +112,18 @@ void ljtip4p_long_gpu_clear() {
int ** ljtip4p_long_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
double *sublo, double *subhi,
tagint *tag, int *map_array, int map_size,
int *sametag, int max_same,
int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJTIP4PLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
subhi, tag, map_array, map_size, sametag, max_same,
nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q,boxlo, prd);
}
@ -139,10 +143,10 @@ double ljtip4p_long_gpu_bytes() {
return LJTIP4PLMF.host_memory_usage();
}
void ljtip4p_long_copy_molecule_data(int **hn, double **m, int n, int* tag,
void ljtip4p_long_copy_molecule_data(int n, int* tag,
int *map_array, int map_size,
int *sametag, int max_same, int ago){
LJTIP4PLMF.copy_relations_data(hn, m, n, tag,map_array,map_size,sametag, max_same, ago);
LJTIP4PLMF.copy_relations_data(n, tag, map_array, map_size, sametag, max_same, ago);
}

View File

@ -64,7 +64,10 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
void ljtip4p_long_gpu_clear();
int ** ljtip4p_long_gpu_compute_n(const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
double *sublo, double *subhi,
tagint *tag, int *map_array, int map_size,
int *sametag, int max_same,
int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum,
@ -78,7 +81,7 @@ void ljtip4p_long_gpu_compute(const int ago, const int inum, const int nall,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd);
double ljtip4p_long_gpu_bytes();
void ljtip4p_long_copy_molecule_data(int **, double **, int, int* , int *,
void ljtip4p_long_copy_molecule_data(int, int* , int *,
int, int *, int , int);
/* ---------------------------------------------------------------------- */
@ -112,17 +115,16 @@ void PairLJCutTIP4PLongGPU::compute(int eflag, int vflag)
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
ljtip4p_long_copy_molecule_data(hneigh, newsite, nall, atom->tag,
atom->get_map_array(), atom->get_map_size(),
atom->sametag, atom->get_max_same(), neighbor->ago);
bool success = true;
int *ilist, *numneigh, **firstneigh;
if (gpu_mode != GPU_FORCE) {
inum = atom->nlocal;
firstneigh = ljtip4p_long_gpu_compute_n(neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
domain->subhi,
atom->tag, atom->get_map_array(), atom->get_map_size(),
atom->sametag, atom->get_max_same(),
atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, &ilist, &numneigh,
cpu_time, success, atom->q, domain->boxlo,
@ -132,6 +134,9 @@ void PairLJCutTIP4PLongGPU::compute(int eflag, int vflag)
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
ljtip4p_long_copy_molecule_data(nall, atom->tag,
atom->get_map_array(), atom->get_map_size(),
atom->sametag, atom->get_max_same(), neighbor->ago);
ljtip4p_long_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success, atom->q,