From 96fb599b2d3d5814e74b19f49475d22e5e686f5c Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:03:20 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6055 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/GPU/Install.sh | 25 +++ src/GPU/fix_gpu.cpp | 48 ++-- src/GPU/fix_gpu.h | 2 + src/GPU/pair_cg_cmm_coul_long_gpu.cpp | 274 +++++------------------ src/GPU/pair_cg_cmm_coul_long_gpu.h | 3 +- src/GPU/pair_cg_cmm_gpu.cpp | 203 ++++------------- src/GPU/pair_cg_cmm_gpu.h | 3 +- src/GPU/pair_gayberne_gpu.cpp | 268 ++++++---------------- src/GPU/pair_gayberne_gpu.h | 5 +- src/GPU/pair_lj96_cut_gpu.cpp | 166 ++++---------- src/GPU/pair_lj96_cut_gpu.h | 3 +- src/GPU/pair_lj_charmm_coul_long_gpu.cpp | 264 ++++++---------------- src/GPU/pair_lj_charmm_coul_long_gpu.h | 3 +- src/GPU/pair_lj_cut_coul_cut_gpu.cpp | 192 ++++------------ src/GPU/pair_lj_cut_coul_cut_gpu.h | 3 +- src/GPU/pair_lj_cut_coul_long_gpu.cpp | 225 ++++--------------- src/GPU/pair_lj_cut_coul_long_gpu.h | 3 +- src/GPU/pair_lj_cut_gpu.cpp | 165 ++++---------- src/GPU/pair_lj_cut_gpu.h | 3 +- 19 files changed, 491 insertions(+), 1367 deletions(-) diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 29504865b4..a17dc9ffd5 100644 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -14,6 +14,15 @@ if (test $1 = 1) then sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package fi + if (test -e ../pppm.cpp) then + cp pppm_gpu.cpp .. + cp pppm_gpu_single.cpp .. + cp pppm_gpu_double.cpp .. + cp pppm_gpu.h .. + cp pppm_gpu_single.h .. + cp pppm_gpu_double.h .. + fi + if (test -e ../pair_gayberne.cpp) then cp pair_gayberne_gpu.cpp .. cp pair_gayberne_gpu.h .. @@ -40,14 +49,19 @@ if (test $1 = 1) then fi cp pair_lj_cut_gpu.cpp .. + cp pair_morse_gpu.cpp .. cp pair_lj96_cut_gpu.cpp .. + cp pair_lj_expand_gpu.cpp .. cp pair_lj_cut_coul_cut_gpu.cpp .. cp pair_lj_cut_gpu.h .. + cp pair_morse_gpu.h .. cp pair_lj96_cut_gpu.h .. + cp pair_lj_expand_gpu.h .. cp pair_lj_cut_coul_cut_gpu.h .. cp fix_gpu.cpp .. cp fix_gpu.h .. + cp gpu_extra.h .. elif (test $1 = 0) then @@ -56,9 +70,14 @@ elif (test $1 = 0) then sed -i -e 's/[^ \t]*gpu_[^ \t]*) //' ../Makefile.package fi + rm ../pppm_gpu.cpp + rm ../pppm_gpu_single.cpp + rm ../pppm_gpu_double.cpp rm ../pair_gayberne_gpu.cpp rm ../pair_lj_cut_gpu.cpp + rm ../pair_morse_gpu.cpp rm ../pair_lj96_cut_gpu.cpp + rm ../pair_lj_expand_gpu.cpp rm ../pair_lj_cut_coul_cut_gpu.cpp rm ../pair_lj_cut_coul_long_gpu.cpp rm ../pair_lj_charmm_coul_long_gpu.cpp @@ -66,15 +85,21 @@ elif (test $1 = 0) then rm ../pair_cg_cmm_coul_long_gpu.cpp rm ../fix_gpu.cpp + rm ../pppm_gpu.h + rm ../pppm_gpu_single.cpp + rm ../pppm_gpu_double.h rm ../pair_gayberne_gpu.h rm ../pair_lj_cut_gpu.h + rm ../pair_morse_gpu.h rm ../pair_lj96_cut_gpu.h + rm ../pair_lj_expand_gpu.h rm ../pair_lj_cut_coul_cut_gpu.h rm ../pair_lj_cut_coul_long_gpu.h rm ../pair_lj_charmm_coul_long_gpu.h rm ../pair_cg_cmm_gpu.h rm ../pair_cg_cmm_coul_long_gpu.h rm ../fix_gpu.h + rm ../gpu_extra.h fi diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 75ce1e83f3..54721900e6 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -24,15 +24,16 @@ #include "modify.h" #include "domain.h" #include "universe.h" +#include "gpu_extra.h" using namespace LAMMPS_NS; enum{GPU_FORCE, GPU_NEIGH}; -extern bool lmp_init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, - const int gpu_mode, const double particle_split, - const int nthreads); +extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, + const int first_gpu, const int last_gpu, + const int gpu_mode, const double particle_split, + const int nthreads, const int t_per_atom); extern void lmp_clear_device(); extern double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul); @@ -42,18 +43,17 @@ extern double lmp_gpu_forces(double **f, double **tor, double *eatom, FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) { - if (narg != 7) error->all("Illegal fix gpu command"); + if (narg < 7) error->all("Illegal fix gpu command"); if (strcmp(arg[1],"all") != 0) error->all("Illegal fix gpu command"); - int gpu_mode, first_gpu, last_gpu; - double particle_split; + int first_gpu, last_gpu; if (strcmp(arg[3],"force") == 0) - gpu_mode = GPU_FORCE; + _gpu_mode = GPU_FORCE; else if (strcmp(arg[3],"force/neigh") == 0) { - gpu_mode = GPU_NEIGH; + _gpu_mode = GPU_NEIGH; if (domain->triclinic) error->all("Cannot use force/neigh with triclinic box."); } else @@ -62,13 +62,24 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : first_gpu = atoi(arg[4]); last_gpu = atoi(arg[5]); - particle_split = force->numeric(arg[6]); - if (particle_split==0 || particle_split>1) + _particle_split = force->numeric(arg[6]); + if (_particle_split==0 || _particle_split>1) error->all("Illegal fix gpu command."); - if (!lmp_init_device(universe->uworld,world,first_gpu,last_gpu,gpu_mode, - particle_split,1)) - error->one("Could not find or initialize a specified accelerator device."); + int nthreads = 1; + int threads_per_atom = -1; + if (narg == 9) { + if (strcmp(arg[7],"threads_per_atom") == 0) + threads_per_atom = atoi(arg[8]); + else + error->all("Illegal fix gpu command."); + } else if (narg != 7) + error->all("Illegal fix gpu command."); + + int gpu_flag = lmp_init_device(universe->uworld, world, first_gpu, last_gpu, + _gpu_mode, _particle_split, nthreads, + threads_per_atom); + GPU_EXTRA::check_flag(gpu_flag,error,world); } /* ---------------------------------------------------------------------- */ @@ -95,6 +106,15 @@ void FixGPU::init() // Can only have 1 gpu fix that must be the first fix for a run if ((void*)modify->fix[0] != (void*)this) error->all("GPU is not the first fix for this run."); + // Hybrid cannot be used with force/neigh option + if (_gpu_mode == GPU_NEIGH) + if (force->pair_match("hybrid",1) != NULL || + force->pair_match("hybrid/overlay",1) != NULL) + error->all("Cannot use pair hybrid with GPU neighbor builds."); + if (_particle_split < 0) + if (force->pair_match("hybrid",1) != NULL || + force->pair_match("hybrid/overlay",1) != NULL) + error->all("Fix gpu split must be positive for hybrid pair styles."); } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/fix_gpu.h b/src/GPU/fix_gpu.h index 35c8dea324..30b53ac879 100644 --- a/src/GPU/fix_gpu.h +++ b/src/GPU/fix_gpu.h @@ -37,6 +37,8 @@ class FixGPU : public Fix { double memory_usage(); private: + int _gpu_mode; + double _particle_split; }; } diff --git a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp index 6d11692d5c..153cb98a9e 100644 --- a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp +++ b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp @@ -35,6 +35,7 @@ #include "domain.h" #include "string.h" #include "kspace.h" +#include "gpu_extra.h" #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) @@ -49,27 +50,29 @@ // External functions from cuda library for atom decomposition -bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); +int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); void cmml_gpu_clear(); -int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void cmml_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); +int ** cmml_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); +void cmml_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double cmml_gpu_bytes(); using namespace LAMMPS_NS; @@ -95,8 +98,6 @@ PairCGCMMCoulLongGPU::~PairCGCMMCoulLongGPU() void PairCGCMMCoulLongGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -104,31 +105,32 @@ void PairCGCMMCoulLongGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = cmml_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = cmml_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - cmml_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + cmml_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style cg/cmm/coul/long requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU cg/cmm pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -176,17 +178,13 @@ void PairCGCMMCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3, - lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, - cut_coulsq_global, force->special_coul, - force->qqrd2e, g_ewald); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU cg/cmm pair style"); + int success = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3, + lj4, offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq_global, force->special_coul, + force->qqrd2e, g_ewald); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -205,14 +203,16 @@ double PairCGCMMCoulLongGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +void PairCGCMMCoulLongGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, int *numneigh, + int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype,itable; + int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double rsq; double **x = atom->x; @@ -225,11 +225,6 @@ void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -244,13 +239,9 @@ void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; - - if (j < nall) factor_coul = factor_lj = 1.0; - else { - factor_coul = special_coul[j/nall]; - factor_lj = special_lj[j/nall]; - j %= nall; - } + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; const double delx = xtmp - x[j][0]; const double dely = ytmp - x[j][1]; @@ -347,156 +338,3 @@ void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairCGCMMCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double rsq; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int stride = nlocal-start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor>= ncoulshiftbits; - const double fraction = (rsq_lookup.f - rtable[itable]) * - drtable[itable]; - const double table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (eflag) { - const double table2 = etable[itable] + fraction*detable[itable]; - ecoul = qtmp*q[j] * table2; - } - if (factor_coul < 1.0) { - const double table2 = ctable[itable] + fraction*dctable[itable]; - const double prefactor = qtmp*q[j] * table2; - forcecoul -= (1.0-factor_coul)*prefactor; - if (eflag) ecoul -= (1.0-factor_coul)*prefactor; - } - } - } - fpair = (forcecoul + forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += delz*fpair; - - if (j (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen); +int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen); void cmm_gpu_clear(); -int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); -void cmm_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success); +int ** cmm_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void cmm_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double cmm_gpu_bytes(); using namespace LAMMPS_NS; @@ -84,8 +85,6 @@ PairCGCMMGPU::~PairCGCMMGPU() void PairCGCMMGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -93,30 +92,30 @@ void PairCGCMMGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = cmm_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + firstneigh = cmm_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success); } else { inum = list->inum; - cmm_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + cmm_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one("Out of memory on GPGPU"); if (host_startpair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU CGCMM pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -152,15 +151,11 @@ void PairCGCMMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU CGCMM pair style"); + int success = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -179,11 +174,13 @@ double PairCGCMMGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { - int i,j,ii,jj,inum,jnum,itype,jtype; +void PairCGCMMGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r6inv,forcelj,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double **x = atom->x; double **f = atom->f; @@ -192,11 +189,6 @@ void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { int nall = nlocal + atom->nghost; double *special_lj = force->special_lj; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -210,12 +202,8 @@ void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; - - if (j < nall) factor_lj = 1.0; - else { - factor_lj = special_lj[j/nall]; - j %= nall; - } + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; @@ -266,100 +254,3 @@ void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { } } } - -/* ---------------------------------------------------------------------- */ - -void PairCGCMMGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { - int i,j,itype,jtype; - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int stride = nlocal-start; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r2inv,r6inv,forcelj,factor_lj; - double *special_lj = force->special_lj; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - int jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool gb_gpu_init(const int ntypes, const double gamma, const double upsilon, - const double mu, double **shape, double **well, double **cutsq, - double **sigma, double **epsilon, double *host_lshape, - int **form, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, - double *special_lj, const int nlocal, const int nall, - const int max_nbors, const double cell_size, - int &gpu_mode, FILE *screen); +int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, + const double mu, double **shape, double **well, double **cutsq, + double **sigma, double **epsilon, double *host_lshape, + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + int &gpu_mode, FILE *screen); void gb_gpu_clear(); -int * gb_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success, - double **host_quat); -int * gb_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double **host_quat); +int ** gb_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_quat); +int * gb_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat); double gb_gpu_bytes(); using namespace LAMMPS_NS; @@ -77,6 +77,8 @@ PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid"); if (!avec) error->all("Pair gayberne requires atom style ellipsoid"); + quat_nmax = 0; + quat = NULL; } /* ---------------------------------------------------------------------- @@ -87,14 +89,13 @@ PairGayBerneGPU::~PairGayBerneGPU() { gb_gpu_clear(); cpu_time = 0.0; + memory->destroy(quat); } /* ---------------------------------------------------------------------- */ void PairGayBerneGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -102,34 +103,47 @@ void PairGayBerneGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; + int *ilist, *numneigh, **firstneigh; + + if (nall > quat_nmax) { + quat_nmax = static_cast(1.1 * nall); + memory->grow(quat, quat_nmax, 4, "pair:quat"); + } + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + int *ellipsoid = atom->ellipsoid; + for (int i=0; i -1) { + quat[i][0] = bonus[qi].quat[0]; + quat[i][1] = bonus[qi].quat[1]; + quat[i][2] = bonus[qi].quat[2]; + quat[i][3] = bonus[qi].quat[3]; + } + } if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - /* MIKE: this arg of atom->quat needs to be modified - gpulist = gb_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, domain->subhi, - eflag, vflag, eflag_atom, vflag_atom, host_start, - cpu_time, success, atom->quat); - */ + firstneigh = gb_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, quat); } else { inum = list->inum; - /* MIKE: this arg of atom->quat needs to be modified - olist = gb_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, - list->firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->quat); - */ + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + olist = gb_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, + eflag_atom, vflag_atom, host_start, + cpu_time, success, quat); } if (!success) error->one("Out of memory on GPGPU"); if (host_start < inum) { cpu_time = MPI_Wtime(); - if (gpu_mode == GPU_NEIGH) - cpu_compute(gpulist,host_start,eflag,vflag); - else - cpu_compute(host_start,eflag,vflag); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time = MPI_Wtime() - cpu_time; } } @@ -140,8 +154,8 @@ void PairGayBerneGPU::compute(int eflag, int vflag) void PairGayBerneGPU::init_style() { - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU Gay-Berne pair style"); if (!atom->ellipsoid_flag) error->all("Pair gayberne requires atom style ellipsoid"); @@ -179,22 +193,20 @@ void PairGayBerneGPU::init_style() double cell_size = sqrt(maxcut) + neighbor->skin; - bool init_ok = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, - shape1, well, cutsq, sigma, epsilon, lshape, form, - lj1, lj2, lj3, lj4, offset, force->special_lj, - atom->nlocal, atom->nlocal+atom->nghost, 300, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu)."); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU Gay-Berne pair style"); + int success = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, + shape2, well, cutsq, sigma, epsilon, lshape, form, + lj1, lj2, lj3, lj4, offset, force->special_lj, + atom->nlocal, atom->nlocal+atom->nghost, 300, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } + quat_nmax = static_cast(1.1 * (atom->nlocal + atom->nghost)); + memory->grow(quat, quat_nmax, 4, "pair:quat"); } /* ---------------------------------------------------------------------- */ @@ -202,18 +214,19 @@ void PairGayBerneGPU::init_style() double PairGayBerneGPU::memory_usage() { double bytes = Pair::memory_usage(); - return bytes + gb_gpu_bytes(); + return bytes + memory->usage(quat,quat_nmax)+gb_gpu_bytes(); } /* ---------------------------------------------------------------------- */ -void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) +void PairGayBerneGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype; + int i,j,ii,jj,jnum,itype,jtype; double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj; double fforce[3],ttor[3],rtor[3],r12[3]; double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3]; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double *iquat,*jquat; AtomVecEllipsoid::Bonus *bonus = avec->bonus; @@ -225,11 +238,6 @@ void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) int nlocal = atom->nlocal; double *special_lj = force->special_lj; - inum = list->inum; - ilist = olist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -331,143 +339,3 @@ void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairGayBerneGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) -{ - int i,j,itype,jtype; - double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj; - double fforce[3],ttor[3],rtor[3],r12[3]; - double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3]; - double *iquat,*jquat; - - AtomVecEllipsoid::Bonus *bonus = avec->bonus; - int *ellipsoid = atom->ellipsoid; - double **x = atom->x; - double **f = atom->f; - double **tor = atom->torque; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double *special_lj = force->special_lj; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - itype = type[i]; - - if (form[itype][itype] == ELLIPSE_ELLIPSE) { - iquat = bonus[ellipsoid[j]].quat; - MathExtra::quat_to_mat_trans(iquat,a1); - MathExtra::diag_times3(well[itype],a1,temp); - MathExtra::transpose_times3(a1,temp,b1); - MathExtra::diag_times3(shape2[itype],a1,temp); - MathExtra::transpose_times3(a1,temp,g1); - } - - int *nbor = nbors+i-start; - int jnum =* nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for ( ; nbor < nbor_end; nbor += stride) { - j = *nbor; - factor_lj = special_lj[sbmask(j)]; - j &= NEIGHMASK; - - // r12 = center to center vector - - r12[0] = x[j][0]-x[i][0]; - r12[1] = x[j][1]-x[i][1]; - r12[2] = x[j][2]-x[i][2]; - rsq = MathExtra::dot3(r12,r12); - jtype = type[j]; - - // compute if less than cutoff - - if (rsq < cutsq[itype][jtype]) { - - switch (form[itype][jtype]) { - case SPHERE_SPHERE: - r2inv = 1.0/rsq; - r6inv = r2inv*r2inv*r2inv; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - forcelj *= -r2inv; - if (eflag) one_eng = - r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) - - offset[itype][jtype]; - fforce[0] = r12[0]*forcelj; - fforce[1] = r12[1]*forcelj; - fforce[2] = r12[2]*forcelj; - ttor[0] = ttor[1] = ttor[2] = 0.0; - rtor[0] = rtor[1] = rtor[2] = 0.0; - break; - - case SPHERE_ELLIPSE: - jquat = bonus[ellipsoid[j]].quat; - MathExtra::quat_to_mat_trans(jquat,a2); - MathExtra::diag_times3(well[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,b2); - MathExtra::diag_times3(shape2[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,g2); - one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor); - ttor[0] = ttor[1] = ttor[2] = 0.0; - break; - - case ELLIPSE_SPHERE: - one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor); - rtor[0] = rtor[1] = rtor[2] = 0.0; - break; - - default: - jquat = bonus[ellipsoid[j]].quat; - MathExtra::quat_to_mat_trans(jquat,a2); - MathExtra::diag_times3(well[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,b2); - MathExtra::diag_times3(shape2[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,g2); - one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq, - fforce,ttor,rtor); - break; - } - - fforce[0] *= factor_lj; - fforce[1] *= factor_lj; - fforce[2] *= factor_lj; - ttor[0] *= factor_lj; - ttor[1] *= factor_lj; - ttor[2] *= factor_lj; - - f[i][0] += fforce[0]; - f[i][1] += fforce[1]; - f[i][2] += fforce[2]; - tor[i][0] += ttor[0]; - tor[i][1] += ttor[1]; - tor[i][2] += ttor[2]; - - if (eflag) evdwl = factor_lj*one_eng; - - if (j (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); +int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void lj96_gpu_clear(); -int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); -void lj96_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success); +int ** lj96_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void lj96_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double lj96_gpu_bytes(); using namespace LAMMPS_NS; @@ -83,8 +84,6 @@ PairLJ96CutGPU::~PairLJ96CutGPU() void PairLJ96CutGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -92,30 +91,30 @@ void PairLJ96CutGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = lj96_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + firstneigh = lj96_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success); } else { inum = list->inum; - lj96_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + lj96_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one("Out of memory on GPGPU"); if (host_startpair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ96 pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -151,15 +150,11 @@ void PairLJ96CutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ96 pair style"); + int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -178,11 +173,13 @@ double PairLJ96CutGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { - int i,j,ii,jj,inum,jnum,itype,jtype; +void PairLJ96CutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double **x = atom->x; double **f = atom->f; @@ -190,11 +187,6 @@ void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { int nlocal = atom->nlocal; double *special_lj = force->special_lj; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -239,73 +231,3 @@ void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJ96CutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { - int i,j,itype,jtype; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj; - double *special_lj = force->special_lj; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - int jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor (b) ? (a) : (b)) @@ -49,35 +50,35 @@ // External functions from cuda library for atom decomposition -bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, double **sigma, - const bool mix_arithmetic); +int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald, const double cut_lj_innersq, + const double denom_lj, double **epsilon, double **sigma, + const bool mix_arithmetic); void crml_gpu_clear(); -int * crml_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void crml_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); +int ** crml_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); +void crml_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double crml_gpu_bytes(); using namespace LAMMPS_NS; -enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER}; - /* ---------------------------------------------------------------------- */ PairLJCharmmCoulLongGPU::PairLJCharmmCoulLongGPU(LAMMPS *lmp) : @@ -100,8 +101,6 @@ PairLJCharmmCoulLongGPU::~PairLJCharmmCoulLongGPU() void PairLJCharmmCoulLongGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -109,31 +108,32 @@ void PairLJCharmmCoulLongGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = crml_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = crml_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - crml_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + crml_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style lj/charmm/coul/long requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU CHARMM pair style"); // Repeat cutsq calculation because done after call to init_style double cut; @@ -183,18 +183,24 @@ void PairLJCharmmCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, - cut_coulsq, force->special_coul, force->qqrd2e, - g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, - mix_flag == ARITHMETIC); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - if (force->newton_pair) - error->all("Cannot use newton pair with GPU CHARMM pair style"); + bool arithmetic = true; + for (int i = 1; i < atom->ntypes + 1; i++) + for (int j = i + 1; j < atom->ntypes + 1; j++) { + if (epsilon[i][j] != sqrt(epsilon[i][i] * epsilon[j][j])) + arithmetic = false; + if (sigma[i][j] != 0.5 * (sigma[i][i] + sigma[j][j])) + arithmetic = false; + } + + int success = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq, force->special_coul, force->qqrd2e, + g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, + arithmetic); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -213,15 +219,17 @@ double PairLJCharmmCoulLongGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCharmmCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +void PairLJCharmmCoulLongGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, + int *numneigh, int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype,itable; + int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; double philj,switch1,switch2; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double rsq; evdwl = ecoul = 0.0; @@ -235,11 +243,6 @@ void PairLJCharmmCoulLongGPU::cpu_compute(int start, int eflag, int vflag) double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -339,140 +342,3 @@ void PairLJCharmmCoulLongGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCharmmCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double philj,switch1,switch2; - double rsq; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal - start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor>= ncoulshiftbits; - fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; - table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (factor_coul < 1.0) { - table = ctable[itable] + fraction*dctable[itable]; - prefactor = qtmp*q[j] * table; - forcecoul -= (1.0-factor_coul)*prefactor; - } - } - } else forcecoul = 0.0; - - if (rsq < cut_ljsq) { - r6inv = r2inv*r2inv*r2inv; - jtype = type[j]; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - if (rsq > cut_lj_innersq) { - switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * - (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; - switch2 = 12.0*rsq * (cut_ljsq-rsq) * - (rsq-cut_lj_innersq) / denom_lj; - philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]); - forcelj = forcelj*switch1 + philj*switch2; - } - } else forcelj = 0.0; - - fpair = (forcecoul + factor_lj*forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += delz*fpair; - - if (eflag) { - if (rsq < cut_coulsq) { - if (!ncoultablebits || rsq <= tabinnersq) - ecoul = prefactor*erfc; - else { - table = etable[itable] + fraction*detable[itable]; - ecoul = qtmp*q[j] * table; - } - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } else ecoul = 0.0; - - if (rsq < cut_ljsq) { - evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]); - if (rsq > cut_lj_innersq) { - switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * - (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; - evdwl *= switch1; - } - evdwl *= factor_lj; - } else evdwl = 0.0; - } - - if (j (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e); +int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e); void ljc_gpu_clear(); -int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void ljc_gpu_compute(const int timestep, const int ago, const int inum, +int ** ljc_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); +void ljc_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd); double ljc_gpu_bytes(); using namespace LAMMPS_NS; @@ -85,8 +89,6 @@ PairLJCutCoulCutGPU::~PairLJCutCoulCutGPU() void PairLJCutCoulCutGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -94,31 +96,32 @@ void PairLJCutCoulCutGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = ljc_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = ljc_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - ljc_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljc_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style lj/cut/coul/cut requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -154,16 +158,12 @@ void PairLJCutCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, - force->special_coul, force->qqrd2e); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ pair style"); + int success = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, + force->special_coul, force->qqrd2e); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -182,12 +182,14 @@ double PairLJCutCoulCutGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) +void PairLJCutCoulCutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, + int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype; + int i,j,ii,jj,jnum,itype,jtype; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; evdwl = ecoul = 0.0; @@ -201,11 +203,6 @@ void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) int newton_pair = force->newton_pair; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -264,94 +261,3 @@ void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCutCoulCutGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor (b) ? (a) : (b)) @@ -49,27 +50,29 @@ // External functions from cuda library for atom decomposition -bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); +int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); void ljcl_gpu_clear(); -int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void ljcl_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); +int ** ljcl_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); +void ljcl_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double ljcl_gpu_bytes(); using namespace LAMMPS_NS; @@ -96,8 +99,6 @@ PairLJCutCoulLongGPU::~PairLJCutCoulLongGPU() void PairLJCutCoulLongGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -105,31 +106,32 @@ void PairLJCutCoulLongGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = ljcl_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = ljcl_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - ljcl_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljcl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style lj/cut/coul/cut requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -179,16 +181,12 @@ void PairLJCutCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + int success = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, 300, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ pair style"); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -207,14 +205,16 @@ double PairLJCutCoulLongGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +void PairLJCutCoulLongGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, int *numneigh, + int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype,itable; + int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double rsq; evdwl = ecoul = 0.0; @@ -228,11 +228,6 @@ void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag) double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -320,127 +315,3 @@ void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCutCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double rsq; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor>= ncoulshiftbits; - fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; - table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (factor_coul < 1.0) { - table = ctable[itable] + fraction*dctable[itable]; - prefactor = qtmp*q[j] * table; - forcecoul -= (1.0-factor_coul)*prefactor; - } - } - } else forcecoul = 0.0; - - if (rsq < cut_ljsq[itype][jtype]) { - r6inv = r2inv*r2inv*r2inv; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - } else forcelj = 0.0; - - fpair = (forcecoul + factor_lj*forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += delz*fpair; - - if (eflag) { - if (rsq < cut_coulsq) { - if (!ncoultablebits || rsq <= tabinnersq) - ecoul = prefactor*erfc; - else { - table = etable[itable] + fraction*detable[itable]; - ecoul = qtmp*q[j] * table; - } - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } else ecoul = 0.0; - - if (rsq < cut_ljsq[itype][jtype]) { - evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - - offset[itype][jtype]; - evdwl *= factor_lj; - } else evdwl = 0.0; - } - - if (j (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); +int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void ljl_gpu_clear(); -int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); -void ljl_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success); +int ** ljl_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void ljl_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double ljl_gpu_bytes(); using namespace LAMMPS_NS; @@ -83,8 +84,6 @@ PairLJCutGPU::~PairLJCutGPU() void PairLJCutGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -92,30 +91,30 @@ void PairLJCutGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = ljl_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + firstneigh = ljl_gpu_compute_n(neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, + &ilist, &numneigh, cpu_time, success); } else { inum = list->inum; - ljl_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one("Out of memory on GPGPU"); if (host_startpair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -151,15 +150,11 @@ void PairLJCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ pair style"); + int success = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -178,11 +173,12 @@ double PairLJCutGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { - int i,j,ii,jj,inum,jnum,itype,jtype; +void PairLJCutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) { + int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r6inv,forcelj,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double **x = atom->x; double **f = atom->f; @@ -190,11 +186,6 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { int nlocal = atom->nlocal; double *special_lj = force->special_lj; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -238,73 +229,3 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { - int i,j,itype,jtype; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r2inv,r6inv,forcelj,factor_lj; - double *special_lj = force->special_lj; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - int jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor