From 4e52d8d9274ec14911bbca0558122c97e15de0df Mon Sep 17 00:00:00 2001 From: pscrozi Date: Tue, 23 Nov 2010 00:41:14 +0000 Subject: [PATCH] Changes from Mike Brown. git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5278 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/GPU/Install.sh | 41 +- src/GPU/Package.sh | 34 +- src/GPU/fix_gpu.cpp | 148 ++++++ src/GPU/fix_gpu.h | 45 ++ src/GPU/pair_cg_cmm_coul_long_gpu.cpp | 499 +++++++++++++++++++ src/GPU/pair_cg_cmm_coul_long_gpu.h | 48 ++ src/GPU/pair_cg_cmm_gpu.cpp | 362 ++++++++++++++ src/GPU/pair_cg_cmm_gpu.h | 47 ++ src/GPU/pair_gayberne_gpu.cpp | 681 +++++++++++++------------- src/GPU/pair_gayberne_gpu.h | 20 +- src/GPU/pair_lj96_cut_gpu.cpp | 318 ++++++++++++ src/GPU/pair_lj96_cut_gpu.h | 48 ++ src/GPU/pair_lj_cut_coul_cut_gpu.cpp | 364 ++++++++++++++ src/GPU/pair_lj_cut_coul_cut_gpu.h | 48 ++ src/GPU/pair_lj_cut_coul_long_gpu.cpp | 452 +++++++++++++++++ src/GPU/pair_lj_cut_coul_long_gpu.h | 48 ++ src/GPU/pair_lj_cut_gpu.cpp | 418 +++++++++------- src/GPU/pair_lj_cut_gpu.h | 19 +- 18 files changed, 3104 insertions(+), 536 deletions(-) create mode 100644 src/GPU/fix_gpu.cpp create mode 100644 src/GPU/fix_gpu.h create mode 100644 src/GPU/pair_cg_cmm_coul_long_gpu.cpp create mode 100644 src/GPU/pair_cg_cmm_coul_long_gpu.h create mode 100644 src/GPU/pair_cg_cmm_gpu.cpp create mode 100644 src/GPU/pair_cg_cmm_gpu.h create mode 100644 src/GPU/pair_lj96_cut_gpu.cpp create mode 100644 src/GPU/pair_lj96_cut_gpu.h create mode 100644 src/GPU/pair_lj_cut_coul_cut_gpu.cpp create mode 100644 src/GPU/pair_lj_cut_coul_cut_gpu.h create mode 100644 src/GPU/pair_lj_cut_coul_long_gpu.cpp create mode 100644 src/GPU/pair_lj_cut_coul_long_gpu.h diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index fd76c7c98c..0d28d679b8 100644 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -1,6 +1,7 @@ # Install/unInstall package files in LAMMPS # edit Makefile.package to include/exclude GPU library # do not copy gayberne files if non-GPU version does not exist +# do not copy charmm files if non-GPU version does not exist if (test $1 = 1) then @@ -12,14 +13,36 @@ if (test $1 = 1) then sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(gpu_SYSPATH) |' ../Makefile.package sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package fi - + if (test -e ../pair_gayberne.cpp) then cp pair_gayberne_gpu.cpp .. cp pair_gayberne_gpu.h .. fi + + if (test -e ../pair_lj_cut_coul_long.cpp) then + cp pair_lj_cut_coul_long_gpu.cpp .. + cp pair_lj_cut_coul_long_gpu.h .. + fi + + if (test -e ../pair_cg_cmm.cpp) then + cp pair_cg_cmm_gpu.cpp .. + cp pair_cg_cmm_gpu.h .. + fi + + if (test -e ../pair_cg_cmm_coul_long.cpp) then + cp pair_cg_cmm_coul_long_gpu.cpp .. + cp pair_cg_cmm_coul_long_gpu.h .. + fi cp pair_lj_cut_gpu.cpp .. + cp pair_lj96_cut_gpu.cpp .. + cp pair_lj_cut_coul_cut_gpu.cpp .. cp pair_lj_cut_gpu.h .. + cp pair_lj96_cut_gpu.h .. + cp pair_lj_cut_coul_cut_gpu.h .. + + cp fix_gpu.cpp .. + cp fix_gpu.h .. elif (test $1 = 0) then @@ -27,11 +50,23 @@ elif (test $1 = 0) then sed -i -e 's/[^ \t]*gpu //' ../Makefile.package sed -i -e 's/[^ \t]*gpu_[^ \t]*) //' ../Makefile.package fi - + rm ../pair_gayberne_gpu.cpp rm ../pair_lj_cut_gpu.cpp + rm ../pair_lj96_cut_gpu.cpp + rm ../pair_lj_cut_coul_cut_gpu.cpp + rm ../pair_lj_cut_coul_long_gpu.cpp + rm ../pair_cg_cmm_gpu.cpp + rm ../pair_cg_cmm_coul_long_gpu.cpp + rm ../fix_gpu.cpp rm ../pair_gayberne_gpu.h rm ../pair_lj_cut_gpu.h - + rm ../pair_lj96_cut_gpu.h + rm ../pair_lj_cut_coul_cut_gpu.h + rm ../pair_lj_cut_coul_long_gpu.h + rm ../pair_cg_cmm_gpu.h + rm ../pair_cg_cmm_coul_long_gpu.h + rm ../fix_gpu.h + fi diff --git a/src/GPU/Package.sh b/src/GPU/Package.sh index d3d43271cf..60aa6820ca 100644 --- a/src/GPU/Package.sh +++ b/src/GPU/Package.sh @@ -3,10 +3,40 @@ # do not copy gayberne files if non-GPU version does not exist for file in *.cpp *.h; do - if (test $file == pair_gayberne_gpu.cpp -a ! -e ../pair_gayberne.cpp) then + if (test $file = pair_gayberne_gpu.cpp -a ! -e ../pair_gayberne.cpp) then continue fi - if (test $file == pair_gayberne_gpu.h -a ! -e ../pair_gayberne.cpp) then + if (test $file = pair_gayberne_gpu.h -a ! -e ../pair_gayberne.cpp) then + continue + fi + if (test $file = pair_lj_cut_coul_long_gpu.cpp -a ! -e ../pair_lj_cut_coul_long.cpp) then + continue + fi + if (test $file = pair_lj_cut_coul_long_gpu.h -a ! -e ../pair_lj_cut_coul_long.cpp) then + continue + fi + if (test $file = pair_cg_cmm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then + continue + fi + if (test $file = pair_cg_cmm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then + continue + fi + if (test $file = pair_cg_cmm_coul_long_gpu.cpp -a ! -e ../pair_cg_cmm_coul_long.cpp) then + continue + fi + if (test $file = pair_cg_cmm_coul_long_gpu.h -a ! -e ../pair_cg_cmm_coul_long.cpp) then + continue + fi + if (test $file = pair_cg_cmm_coul_msm.cpp -a ! -e ../pair_cg_cmm.cpp) then + continue + fi + if (test $file = pair_cg_cmm_coul_msm.h -a ! -e ../pair_cg_cmm.cpp) then + continue + fi + if (test $file = pair_cg_cmm_coul_msm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then + continue + fi + if (test $file = pair_cg_cmm_coul_msm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then continue fi diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp new file mode 100644 index 0000000000..01a79ffef1 --- /dev/null +++ b/src/GPU/fix_gpu.cpp @@ -0,0 +1,148 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "string.h" +#include "stdlib.h" +#include "fix_gpu.h" +#include "atom.h" +#include "force.h" +#include "pair.h" +#include "respa.h" +#include "input.h" +#include "error.h" +#include "timer.h" +#include "modify.h" +#include "domain.h" + +using namespace LAMMPS_NS; + +enum{GPU_FORCE, GPU_NEIGH}; + +extern bool lmp_init_device(const int first_gpu, const int last_gpu, + const int gpu_mode, const double particle_split); +extern void lmp_clear_device(); +extern double lmp_gpu_forces(double **f, double **tor, double *eatom, + double **vatom, double *virial, double &ecoul); + +/* ---------------------------------------------------------------------- */ + +FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : + Fix(lmp, narg, arg) +{ + if (narg < 7) error->all("Illegal fix gpu command"); + + if (strcmp(arg[1],"all") != 0) + error->all("Illegal fix gpu command"); + + int gpu_mode, first_gpu, last_gpu; + double particle_split; + + if (strcmp(arg[3],"force") == 0) + gpu_mode = GPU_FORCE; + else if (strcmp(arg[3],"force/neigh") == 0) { + gpu_mode = GPU_NEIGH; + if (domain->triclinic) + error->all("Cannot use force/neigh with triclinic box."); + } else + error->all("Illegal fix gpu command."); + + first_gpu = atoi(arg[4]); + last_gpu = atoi(arg[5]); + + particle_split = force->numeric(arg[6]); + if (particle_split==0 || particle_split>1) + error->all("Illegal fix gpu command."); + + if (!lmp_init_device(first_gpu,last_gpu,gpu_mode,particle_split)) + error->one("Could not find or initialize a specified accelerator device."); +} + +/* ---------------------------------------------------------------------- */ + +FixGPU::~FixGPU() +{ + lmp_clear_device(); +} + +/* ---------------------------------------------------------------------- */ + +int FixGPU::setmask() +{ + int mask = 0; + mask |= POST_FORCE; + mask |= MIN_POST_FORCE; + return mask; +} + +/* ---------------------------------------------------------------------- */ + +void FixGPU::init() +{ + // Can only have 1 gpu fix that must be the first fix for a run + if ((void*)modify->fix[0] != (void*)this) + error->all("GPU is not the first fix for this run."); +} + +/* ---------------------------------------------------------------------- */ + +void FixGPU::setup(int vflag) +{ + post_force(vflag); +} + +/* ---------------------------------------------------------------------- */ + +void FixGPU::min_setup(int vflag) +{ + post_force(vflag); +} + +/* ---------------------------------------------------------------------- */ + +void FixGPU::post_force(int vflag) +{ + timer->stamp(); + double lvirial[6]; + for (int i = 0; i < 6; i++) lvirial[i] = 0.0; + double my_eng = lmp_gpu_forces(atom->f, atom->torque, force->pair->eatom, + force->pair->vatom, lvirial, + force->pair->eng_coul); + + force->pair->eng_vdwl += my_eng; + force->pair->virial[0] += lvirial[0]; + force->pair->virial[1] += lvirial[1]; + force->pair->virial[2] += lvirial[2]; + force->pair->virial[3] += lvirial[3]; + force->pair->virial[4] += lvirial[4]; + force->pair->virial[5] += lvirial[5]; + + if (force->pair->vflag_fdotr) force->pair->virial_compute(); + timer->stamp(TIME_PAIR); +} + +/* ---------------------------------------------------------------------- */ + +void FixGPU::min_post_force(int vflag) +{ + post_force(vflag); +} + +/* ---------------------------------------------------------------------- */ + +double FixGPU::memory_usage() +{ + double bytes = 0.0; + // Memory usage currently returned by pair routine + return bytes; +} + diff --git a/src/GPU/fix_gpu.h b/src/GPU/fix_gpu.h new file mode 100644 index 0000000000..35c8dea324 --- /dev/null +++ b/src/GPU/fix_gpu.h @@ -0,0 +1,45 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(gpu,FixGPU) + +#else + +#ifndef LMP_FIX_GPU_H +#define LMP_FIX_GPU_H + +#include "fix.h" + +namespace LAMMPS_NS { + +class FixGPU : public Fix { + public: + FixGPU(class LAMMPS *, int, char **); + ~FixGPU(); + int setmask(); + void init(); + void setup(int); + void min_setup(int); + void post_force(int); + void min_post_force(int); + double memory_usage(); + + private: +}; + +} + +#endif +#endif diff --git a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp new file mode 100644 index 0000000000..8c7825f316 --- /dev/null +++ b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp @@ -0,0 +1,499 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Mike Brown (SNL) +------------------------------------------------------------------------- */ + +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "pair_cg_cmm_coul_long_gpu.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "integrate.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "universe.h" +#include "update.h" +#include "domain.h" +#include "string.h" +#include "kspace.h" + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 + +// External functions from cuda library for atom decomposition + +bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); +void cmml_gpu_clear(); +int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q); +void cmml_gpu_compute(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q); +double cmml_gpu_bytes(); + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairCGCMMCoulLongGPU::PairCGCMMCoulLongGPU(LAMMPS *lmp) : PairCGCMMCoulLong(lmp), gpu_mode(GPU_PAIR) +{ + respa_enable = 0; + cpu_time = 0.0; +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairCGCMMCoulLongGPU::~PairCGCMMCoulLongGPU() +{ + cmml_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairCGCMMCoulLongGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + gpulist = cmml_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, + atom->q); + } else { + inum = list->inum; + cmml_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x, + atom->type, list->ilist, list->numneigh, list->firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, + success, atom->q); + } + if (!success) + error->one("Out of memory on GPGPU"); + + if (host_startq_flag) + error->all("Pair style cg/cmm/coul/long requires atom attribute q"); + if (force->pair_match("gpu",0) == NULL) + error->all("Cannot use pair hybrid with multiple GPU pair styles"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + // insure use of KSpace long-range solver, set g_ewald + + if (force->kspace == NULL) + error->all("Pair style is incompatible with KSpace style"); + g_ewald = force->kspace->g_ewald; + + // setup force tables + + if (ncoultablebits) init_tables(); + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + bool init_ok = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3, + lj4, offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq_global, force->special_coul, + force->qqrd2e, g_ewald); + if (!init_ok) + error->one("Insufficient memory on accelerator (or no fix gpu).\n"); + + if (force->newton_pair) + error->all("Cannot use newton pair with GPU cg/cmm pair style"); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairCGCMMCoulLongGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + cmml_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +{ + int i,j,ii,jj,inum,jnum,itype,jtype,itable; + double qtmp,xtmp,ytmp,ztmp,delx,dely,delz; + double fraction,table; + double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; + double grij,expm2,prefactor,t,erfc; + int *ilist,*jlist,*numneigh,**firstneigh; + double rsq; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + + if (j < nall) factor_coul = factor_lj = 1.0; + else { + factor_coul = special_coul[j/nall]; + factor_lj = special_lj[j/nall]; + j %= nall; + } + + const double delx = xtmp - x[j][0]; + const double dely = ytmp - x[j][1]; + const double delz = ztmp - x[j][2]; + const double rsq = delx*delx + dely*dely + delz*delz; + const int jtype = type[j]; + + double evdwl = 0.0; + double ecoul = 0.0; + double fpair = 0.0; + + if (rsq < cutsq[itype][jtype]) { + const double r2inv = 1.0/rsq; + const int cgt=cg_type[itype][jtype]; + + double forcelj = 0.0; + double forcecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + forcelj=factor_lj; + if (eflag) evdwl=factor_lj; + + if (cgt == CG_LJ12_4) { + const double r4inv=r2inv*r2inv; + forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv + - lj2[itype][jtype]); + if (eflag) { + evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv + - lj4[itype][jtype]) - offset[itype][jtype]; + } + } else if (cgt == CG_LJ9_6) { + const double r3inv = r2inv*sqrt(r2inv); + const double r6inv = r3inv*r3inv; + forcelj *= r6inv*(lj1[itype][jtype]*r3inv + - lj2[itype][jtype]); + if (eflag) { + evdwl *= r6inv*(lj3[itype][jtype]*r3inv + - lj4[itype][jtype]) - offset[itype][jtype]; + } + } else { + const double r6inv = r2inv*r2inv*r2inv; + forcelj *= r6inv*(lj1[itype][jtype]*r6inv + - lj2[itype][jtype]); + if (eflag) { + evdwl *= r6inv*(lj3[itype][jtype]*r6inv + - lj4[itype][jtype]) - offset[itype][jtype]; + } + } + } + + if (rsq < cut_coulsq_global) { + if (!ncoultablebits || rsq <= tabinnersq) { + const double r = sqrt(rsq); + const double grij = g_ewald * r; + const double expm2 = exp(-grij*grij); + const double t = 1.0 / (1.0 + EWALD_P*grij); + const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const double prefactor = qqrd2e * qtmp*q[j]/r; + forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (eflag) ecoul = prefactor*erfc; + if (factor_coul < 1.0) { + forcecoul -= (1.0-factor_coul)*prefactor; + if (eflag) ecoul -= (1.0-factor_coul)*prefactor; + } + } else { + union_int_float_t rsq_lookup; + rsq_lookup.f = rsq; + int itable = rsq_lookup.i & ncoulmask; + itable >>= ncoulshiftbits; + const double fraction = (rsq_lookup.f - rtable[itable]) * + drtable[itable]; + const double table = ftable[itable] + fraction*dftable[itable]; + forcecoul = qtmp*q[j] * table; + if (eflag) { + const double table2 = etable[itable] + fraction*detable[itable]; + ecoul = qtmp*q[j] * table2; + } + if (factor_coul < 1.0) { + const double table2 = ctable[itable] + fraction*dctable[itable]; + const double prefactor = qtmp*q[j] * table2; + forcecoul -= (1.0-factor_coul)*prefactor; + if (eflag) ecoul -= (1.0-factor_coul)*prefactor; + } + } + } + fpair = (forcecoul + forcelj) * r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void PairCGCMMCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, + int vflag) +{ + int i,j,jnum,itype,jtype,itable; + double qtmp,xtmp,ytmp,ztmp,delx,dely,delz; + double fraction,table; + double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; + double grij,expm2,prefactor,t,erfc; + double rsq; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + int stride = nlocal-start; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (i = start; i < nlocal; i++) { + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + int *nbor = nbors + i - start; + jnum = *nbor; + nbor += stride; + int *nbor_end = nbor + stride * jnum; + + for (; nbor>= ncoulshiftbits; + const double fraction = (rsq_lookup.f - rtable[itable]) * + drtable[itable]; + const double table = ftable[itable] + fraction*dftable[itable]; + forcecoul = qtmp*q[j] * table; + if (eflag) { + const double table2 = etable[itable] + fraction*detable[itable]; + ecoul = qtmp*q[j] * table2; + } + if (factor_coul < 1.0) { + const double table2 = ctable[itable] + fraction*dctable[itable]; + const double prefactor = qtmp*q[j] * table2; + forcecoul -= (1.0-factor_coul)*prefactor; + if (eflag) ecoul -= (1.0-factor_coul)*prefactor; + } + } + } + fpair = (forcecoul + forcelj) * r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (j (b) ? (a) : (b)) + +// External functions from cuda library for atom decomposition + +bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen); +void cmm_gpu_clear(); +int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); +void cmm_gpu_compute(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success); +double cmm_gpu_bytes(); + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairCGCMMGPU::PairCGCMMGPU(LAMMPS *lmp) : PairCGCMM(lmp), gpu_mode(GPU_PAIR) +{ + respa_enable = 0; + cpu_time = 0.0; +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairCGCMMGPU::~PairCGCMMGPU() +{ + cmm_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairCGCMMGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + gpulist = cmm_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); + } else { + inum = list->inum; + cmm_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x, + atom->type, list->ilist, list->numneigh, list->firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, + success); + } + if (!success) + error->one("Out of memory on GPGPU"); + + if (host_startpair_match("gpu",0) == NULL) + error->all("Cannot use pair hybrid with multiple GPU pair styles"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + bool init_ok = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + if (!init_ok) + error->one("Insufficient memory on accelerator (or no fix gpu).\n"); + + if (force->newton_pair) + error->all("Cannot use newton pair with GPU CGCMM pair style"); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairCGCMMGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + cmm_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { + int i,j,ii,jj,inum,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r2inv,r6inv,forcelj,factor_lj; + int *ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_lj = force->special_lj; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + + if (j < nall) factor_lj = 1.0; + else { + factor_lj = special_lj[j/nall]; + j %= nall; + } + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + const int cgt=cg_type[itype][jtype]; + r2inv = 1.0/rsq; + + fpair = factor_lj; + if (eflag) evdwl = factor_lj; + if (cgt == CG_LJ12_4) { + const double r4inv = r2inv*r2inv; + fpair *= r4inv*(lj1[itype][jtype]*r4inv*r4inv + - lj2[itype][jtype]); + if (eflag) { + evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv + - lj4[itype][jtype]) - offset[itype][jtype]; + } + } else if (cgt == CG_LJ9_6) { + const double r3inv = r2inv*sqrt(r2inv); + const double r6inv = r3inv*r3inv; + fpair *= r6inv*(lj1[itype][jtype]*r3inv + - lj2[itype][jtype]); + if (eflag) { + evdwl *= r6inv*(lj3[itype][jtype]*r3inv + - lj4[itype][jtype]) - offset[itype][jtype]; + } + } else { + const double r6inv = r2inv*r2inv*r2inv; + fpair *= r6inv*(lj1[itype][jtype]*r6inv + - lj2[itype][jtype]); + if (eflag) { + evdwl *= r6inv*(lj3[itype][jtype]*r6inv + - lj4[itype][jtype]) - offset[itype][jtype]; + } + } + fpair *= r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void PairCGCMMGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { + int i,j,itype,jtype; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + int stride = nlocal-start; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r2inv,r6inv,forcelj,factor_lj; + double *special_lj = force->special_lj; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + + // loop over neighbors of my atoms + + for (i = start; i < nlocal; i++) { + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + int *nbor = nbors + i - start; + int jnum = *nbor; + nbor += stride; + int *nbor_end = nbor + stride * jnum; + + for (; nbor - -#ifdef GB_GPU_OMP -#include "omp.h" -#endif +#include "domain.h" +#include "update.h" +#include "string.h" #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) -#ifndef WINDLL - // External functions from cuda library for atom decomposition -bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, - const int thread, const int gpu_id); -void gb_gpu_clear(const int thread); -int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, - int *ilist, const int *numj, const int *type, - const int thread, bool &success); -void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag, - const int thread); -void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type, - const bool rebuild, const int thread); -void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, - const int thread); -double gb_gpu_forces(double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial, const int thread); -void gb_gpu_name(const int gpu_id, const int max_nbors, char * name); -void gb_gpu_time(const int thread); -int gb_gpu_num_devices(); +bool gb_gpu_init(const int ntypes, const double gamma, const double upsilon, + const double mu, double **shape, double **well, double **cutsq, + double **sigma, double **epsilon, double *host_lshape, + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + int &gpu_mode, FILE *screen); +void gb_gpu_clear(); +int * gb_gpu_compute_n(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success, + double **host_quat); +int * gb_gpu_compute(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double **host_quat); double gb_gpu_bytes(); -#else -#include - -typedef bool (*_gb_gpu_init)(int &ij_size, const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, - const int thread, const int gpu_id); -typedef void (*_gb_gpu_clear)(const int thread); -typedef int * (*_gb_gpu_reset_nbors)(const int nall, const int nlocal, - const int inum, int *ilist, const int *numj, const int *type, - const int thread, bool &success); -typedef void (*_gb_gpu_nbors)(const int *ij, const int num_ij, const bool eflag, - const int thread); -typedef void (*_gb_gpu_atom)(double **host_x, double **host_quat, - const int *host_type, const bool rebuild, const int thread); -typedef void (*_gb_gpu_gayberne)(const bool eflag, const bool vflag, - const bool rebuild, const int thread); -typedef double (*_gb_gpu_forces)(double **f, double **tor, const int *ilist, - const bool eflag, const bool vflag, const bool eflag_atom, - const bool vflag_atom, double *eatom, double **vatom, - double *virial, const int thread); -typedef void (*_gb_gpu_name)(const int gpu_id, const int max_nbors, - char * name); -typedef void (*_gb_gpu_time)(const int thread); -typedef int (*_gb_gpu_num_devices)(); -typedef double (*_gb_gpu_bytes)(); - -_gb_gpu_init gb_gpu_init; -_gb_gpu_clear gb_gpu_clear; -_gb_gpu_reset_nbors gb_gpu_reset_nbors; -_gb_gpu_nbors gb_gpu_nbors; -_gb_gpu_atom gb_gpu_atom; -_gb_gpu_gayberne gb_gpu_gayberne; -_gb_gpu_forces gb_gpu_forces; -_gb_gpu_name gb_gpu_name; -_gb_gpu_time gb_gpu_time; -_gb_gpu_num_devices gb_gpu_num_devices; -_gb_gpu_bytes gb_gpu_bytes; - -#endif - using namespace LAMMPS_NS; +enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; + /* ---------------------------------------------------------------------- */ -PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0), - omp_chunk(0), nthreads(1), - multi_gpu_mode(ONE_NODE), - multi_gpu_param(0), - output_time(false) +PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), + gpu_mode(GPU_PAIR) { - ij_new[0]=NULL; - -#ifdef WINDLL - HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll")); - if (hinstLib == NULL) { - printf("\nUnable to load gpu.dll\n"); - exit(1); - } - - gb_gpu_init=(_gb_gpu_init)GetProcAddress(hinstLib,"gb_gpu_init"); - gb_gpu_clear=(_gb_gpu_clear)GetProcAddress(hinstLib,"gb_gpu_clear"); - gb_gpu_reset_nbors=(_gb_gpu_reset_nbors)GetProcAddress(hinstLib,"gb_gpu_reset_nbors"); - gb_gpu_nbors=(_gb_gpu_nbors)GetProcAddress(hinstLib,"gb_gpu_nbors"); - gb_gpu_atom=(_gb_gpu_atom)GetProcAddress(hinstLib,"gb_gpu_atom"); - gb_gpu_gayberne=(_gb_gpu_gayberne)GetProcAddress(hinstLib,"gb_gpu_gayberne"); - gb_gpu_forces=(_gb_gpu_forces)GetProcAddress(hinstLib,"gb_gpu_forces"); - gb_gpu_name=(_gb_gpu_name)GetProcAddress(hinstLib,"gb_gpu_name"); - gb_gpu_time=(_gb_gpu_time)GetProcAddress(hinstLib,"gb_gpu_time"); - gb_gpu_num_devices=(_gb_gpu_num_devices)GetProcAddress(hinstLib,"gb_gpu_num_devices"); - gb_gpu_bytes=(_gb_gpu_bytes)GetProcAddress(hinstLib,"gb_gpu_bytes"); -#endif } /* ---------------------------------------------------------------------- @@ -156,26 +80,8 @@ PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0), PairGayBerneGPU::~PairGayBerneGPU() { - if (output_time) { - printf("\n\n-------------------------------------"); - printf("--------------------------------\n"); - printf(" GPU Time Stamps (on proc 0): "); - printf("\n-------------------------------------"); - printf("--------------------------------\n"); - gb_gpu_time(my_thread); - printf("-------------------------------------"); - printf("--------------------------------\n\n"); - } - - #pragma omp parallel - { - #ifdef GB_GPU_OMP - int my_thread=omp_get_thread_num(); - #endif - gb_gpu_clear(my_thread); - if (ij_new[my_thread]!=NULL) - delete [] ij_new[my_thread]; - } + gb_gpu_clear(); + cpu_time = 0.0; } /* ---------------------------------------------------------------------- */ @@ -183,141 +89,38 @@ PairGayBerneGPU::~PairGayBerneGPU() void PairGayBerneGPU::compute(int eflag, int vflag) { if (eflag || vflag) ev_setup(eflag,vflag); - else evflag = vflag_fdotr = eflag_atom = vflag_atom = 0; - if (vflag_atom) - error->all("Per-atom virial not available with GPU Gay-Berne"); + else evflag = vflag_fdotr = 0; - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int inum = list->inum; - - bool rebuild=false; - if (neighbor->ncalls > last_neighbor) { - last_neighbor=neighbor->ncalls; - rebuild=true; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + gpulist = gb_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, domain->subhi, + eflag, vflag, eflag_atom, vflag_atom, host_start, + cpu_time, success, atom->quat); + } else { + inum = list->inum; + olist = gb_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x, + atom->type, list->ilist, list->numneigh, + list->firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, + atom->quat); } - - #pragma omp parallel - { - - bool success=true; - #ifdef GB_GPU_OMP - int my_thread=omp_get_thread_num(); - if (rebuild) { - omp_chunk=static_cast(ceil(static_cast(inum)/nthreads)); - if (my_thread==nthreads-1) - thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk; - else - thread_inum[my_thread]=omp_chunk; - olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, - thread_inum[my_thread], - list->ilist+omp_chunk*my_thread, - list->numneigh, atom->type, my_thread, - success); - } - #else - if (rebuild) - olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist, - list->numneigh, atom->type, my_thread, - success); - #endif if (!success) error->one("Out of memory on GPGPU"); - - // copy atom data to GPU - gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread); - int i,j,ii,jj,jnum; - double factor_lj; - int *jlist; - - if (rebuild==true) { - int num_ij = 0; - - // loop over neighbors of my atoms - int *ijp=ij_new[my_thread]; - #ifdef GB_GPU_OMP - int mgo=my_thread*omp_chunk; - int mgot=mgo+thread_inum[my_thread]; - #else - int mgo=0, mgot=inum; - #endif - for (ii = mgo; iifirstneigh[i]; - jnum = list->numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - - *ijp=j; - ijp++; - num_ij++; - - if (num_ij==ij_size) { - gb_gpu_nbors(ij_new[my_thread],num_ij,eflag,my_thread); - ijp=ij_new[my_thread]; - num_ij=0; - } - } - } - if (num_ij>0) - gb_gpu_nbors(ij_new[my_thread],num_ij,eflag,my_thread); + if (host_start < inum) { + cpu_time = MPI_Wtime(); + if (gpu_mode == GPU_NEIGH) + cpu_compute(gpulist,host_start,eflag,vflag); + else + cpu_compute(host_start,eflag,vflag); + cpu_time = MPI_Wtime() - cpu_time; } - - gb_gpu_gayberne(eflag,vflag,rebuild,my_thread); - double lvirial[6]; - for (int i=0; i<6; i++) lvirial[i]=0.0; - double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag, - eflag_atom, vflag_atom, eatom, vatom, lvirial, - my_thread); - #pragma omp critical - { - eng_vdwl+=my_eng; - virial[0]+=lvirial[0]; - virial[1]+=lvirial[1]; - virial[2]+=lvirial[2]; - virial[3]+=lvirial[3]; - virial[4]+=lvirial[4]; - virial[5]+=lvirial[5]; - } - - } //End parallel - - if (vflag_fdotr) virial_compute(); -} - -/* ---------------------------------------------------------------------- - global settings -------------------------------------------------------------------------- */ - -void PairGayBerneGPU::settings(int narg, char **arg) -{ - // strip off GPU keyword/value and send remaining args to parent - - if (narg < 2) error->all("Illegal pair_style command"); - - // set multi_gpu_mode to one/node for multiple gpus on 1 node - // -- param is starting gpu id - // set multi_gpu_mode to one/gpu to select the same gpu id on every node - // -- param is id of gpu - // set multi_gpu_mode to multi/gpu to get ma - // -- param is number of gpus per node - - if (strcmp(arg[0],"one/node") == 0) - multi_gpu_mode = ONE_NODE; - else if (strcmp(arg[0],"one/gpu") == 0) - multi_gpu_mode = ONE_GPU; - else if (strcmp(arg[0],"multi/gpu") == 0) - multi_gpu_mode = MULTI_GPU; - else error->all("Illegal pair_style command"); - - multi_gpu_param = atoi(arg[1]); - - if (multi_gpu_mode == MULTI_GPU && multi_gpu_param < 1) - error->all("Illegal pair_style command"); - - PairGayBerne::settings(narg-2,&arg[2]); } /* ---------------------------------------------------------------------- @@ -326,8 +129,6 @@ void PairGayBerneGPU::settings(int narg, char **arg) void PairGayBerneGPU::init_style() { - if (comm->me == 0) - output_time=true; if (force->pair_match("gpu",0) == NULL) error->all("Cannot use pair hybrid with multiple GPU pair styles"); if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type) @@ -335,24 +136,7 @@ void PairGayBerneGPU::init_style() if (atom->radius_flag) error->all("Pair gayberne cannot be used with atom attribute diameter"); - // set the GPU ID - - int my_gpu=comm->me+multi_gpu_param; - int ngpus=universe->nprocs; - if (multi_gpu_mode==ONE_GPU) { - my_gpu=multi_gpu_param; - ngpus=1; - } else if (multi_gpu_mode==MULTI_GPU) { - ngpus=multi_gpu_param; - my_gpu=comm->me%ngpus; - if (ngpus>universe->nprocs) - ngpus=universe->nprocs; - } - - int irequest = neighbor->request(this); - // per-type shape precalculations - for (int i = 1; i <= atom->ntypes; i++) { if (setwell[i]) { double *one = atom->shape[i]; @@ -364,72 +148,38 @@ void PairGayBerneGPU::init_style() } // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; double cut; - for (int i = 1; i <= atom->ntypes; i++) + for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - cut = init_one(i,j); - cutsq[i][j] = cutsq[j][i] = cut*cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; } + } - // If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads - #ifdef GB_GPU_OMP - if (multi_gpu_mode!=ONE_GPU) - nthreads=ngpus=gb_gpu_num_devices(); - else - nthreads=ngpus=1; - if (nthreads>MAX_GPU_THREADS) - nthreads=MAX_GPU_THREADS; - omp_set_num_threads(nthreads); - #endif - - #pragma omp parallel firstprivate(my_gpu) - { - #ifdef GB_GPU_OMP - int my_thread = omp_get_thread_num(); - if (multi_gpu_mode!=ONE_GPU) - my_gpu=my_thread; - if (multi_gpu_mode==ONE_NODE) - my_gpu+=multi_gpu_param; - #endif - - bool init_ok=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu, + double cell_size = sqrt(maxcut) + neighbor->skin; + + bool init_ok = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, shape, well, cutsq, sigma, epsilon, lshape, form, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, 300, - my_thread, my_gpu); - if (!init_ok) - error->one("At least 1 proc could not allocate a CUDA gpu or memory"); - - if (ij_new[my_thread]!=NULL) - delete [] ij_new[my_thread]; - ij_new[my_thread]=new int[ij_size]; - } - - last_neighbor = -1; - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; + cell_size, gpu_mode, screen); + if (!init_ok) + error->one("Insufficient memory on accelerator (or no fix gpu)."); + if (force->newton_pair) - error->all("Cannot use newton pair with GPU GayBerne pair style"); + error->all("Cannot use newton pair with GPU Gay-Berne pair style"); - if (comm->me == 0 && screen) { - printf("\n-------------------------------------"); - printf("-------------------------------------\n"); - printf("- Using GPGPU acceleration for Gay-Berne:\n"); - printf("-------------------------------------"); - printf("-------------------------------------\n"); - - for (int i=0; ioneatom,gpu_string); - printf("GPU %d: %s\n",gpui,gpu_string); - } - printf("-------------------------------------"); - printf("-------------------------------------\n\n"); + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; } } @@ -437,6 +187,275 @@ void PairGayBerneGPU::init_style() double PairGayBerneGPU::memory_usage() { - double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int); - return bytes+gb_gpu_bytes(); + double bytes = Pair::memory_usage(); + return bytes + gb_gpu_bytes(); } + +/* ---------------------------------------------------------------------- */ + +void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) +{ + int i,j,ii,jj,inum,jnum,itype,jtype; + double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj; + double fforce[3],ttor[3],rtor[3],r12[3]; + double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3]; + int *ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + double **f = atom->f; + double **quat = atom->quat; + double **tor = atom->torque; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_lj = force->special_lj; + + inum = list->inum; + ilist = olist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + itype = type[i]; + + if (form[itype][itype] == ELLIPSE_ELLIPSE) { + MathExtra::quat_to_mat_trans(quat[i],a1); + MathExtra::diag_times3(well[itype],a1,temp); + MathExtra::transpose_times3(a1,temp,b1); + MathExtra::diag_times3(shape[itype],a1,temp); + MathExtra::transpose_times3(a1,temp,g1); + } + + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + + if (j < nall) factor_lj = 1.0; + else { + factor_lj = special_lj[j/nall]; + j %= nall; + } + + // r12 = center to center vector + + r12[0] = x[j][0]-x[i][0]; + r12[1] = x[j][1]-x[i][1]; + r12[2] = x[j][2]-x[i][2]; + rsq = MathExtra::dot3(r12,r12); + jtype = type[j]; + + // compute if less than cutoff + + if (rsq < cutsq[itype][jtype]) { + + switch (form[itype][jtype]) { + case SPHERE_SPHERE: + r2inv = 1.0/rsq; + r6inv = r2inv*r2inv*r2inv; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + forcelj *= -r2inv; + if (eflag) one_eng = + r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) - + offset[itype][jtype]; + fforce[0] = r12[0]*forcelj; + fforce[1] = r12[1]*forcelj; + fforce[2] = r12[2]*forcelj; + ttor[0] = ttor[1] = ttor[2] = 0.0; + rtor[0] = rtor[1] = rtor[2] = 0.0; + break; + + case SPHERE_ELLIPSE: + MathExtra::quat_to_mat_trans(quat[j],a2); + MathExtra::diag_times3(well[jtype],a2,temp); + MathExtra::transpose_times3(a2,temp,b2); + MathExtra::diag_times3(shape[jtype],a2,temp); + MathExtra::transpose_times3(a2,temp,g2); + one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor); + ttor[0] = ttor[1] = ttor[2] = 0.0; + break; + + case ELLIPSE_SPHERE: + one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor); + rtor[0] = rtor[1] = rtor[2] = 0.0; + break; + + default: + MathExtra::quat_to_mat_trans(quat[j],a2); + MathExtra::diag_times3(well[jtype],a2,temp); + MathExtra::transpose_times3(a2,temp,b2); + MathExtra::diag_times3(shape[jtype],a2,temp); + MathExtra::transpose_times3(a2,temp,g2); + one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq, + fforce,ttor,rtor); + break; + } + + fforce[0] *= factor_lj; + fforce[1] *= factor_lj; + fforce[2] *= factor_lj; + ttor[0] *= factor_lj; + ttor[1] *= factor_lj; + ttor[2] *= factor_lj; + + f[i][0] += fforce[0]; + f[i][1] += fforce[1]; + f[i][2] += fforce[2]; + tor[i][0] += ttor[0]; + tor[i][1] += ttor[1]; + tor[i][2] += ttor[2]; + + if (eflag) evdwl = factor_lj*one_eng; + + if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],fforce[2], + -r12[0],-r12[1],-r12[2]); + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void PairGayBerneGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) +{ + int i,j,itype,jtype; + double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj; + double fforce[3],ttor[3],rtor[3],r12[3]; + double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3]; + + double **x = atom->x; + double **f = atom->f; + double **quat = atom->quat; + double **tor = atom->torque; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + int stride = nlocal-start; + double *special_lj = force->special_lj; + + // loop over neighbors of my atoms + + for (i = start; i < nlocal; i++) { + itype = type[i]; + + if (form[itype][itype] == ELLIPSE_ELLIPSE) { + MathExtra::quat_to_mat_trans(quat[i],a1); + MathExtra::diag_times3(well[itype],a1,temp); + MathExtra::transpose_times3(a1,temp,b1); + MathExtra::diag_times3(shape[itype],a1,temp); + MathExtra::transpose_times3(a1,temp,g1); + } + + int *nbor = nbors+i-start; + int jnum =* nbor; + nbor += stride; + int *nbor_end = nbor + stride * jnum; + + for ( ; nbor < nbor_end; nbor += stride) { + j = *nbor; + + if (j < nall) factor_lj = 1.0; + else { + factor_lj = special_lj[j/nall]; + j %= nall; + } + + // r12 = center to center vector + + r12[0] = x[j][0]-x[i][0]; + r12[1] = x[j][1]-x[i][1]; + r12[2] = x[j][2]-x[i][2]; + rsq = MathExtra::dot3(r12,r12); + jtype = type[j]; + + // compute if less than cutoff + + if (rsq < cutsq[itype][jtype]) { + + switch (form[itype][jtype]) { + case SPHERE_SPHERE: + r2inv = 1.0/rsq; + r6inv = r2inv*r2inv*r2inv; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + forcelj *= -r2inv; + if (eflag) one_eng = + r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) - + offset[itype][jtype]; + fforce[0] = r12[0]*forcelj; + fforce[1] = r12[1]*forcelj; + fforce[2] = r12[2]*forcelj; + ttor[0] = ttor[1] = ttor[2] = 0.0; + rtor[0] = rtor[1] = rtor[2] = 0.0; + break; + + case SPHERE_ELLIPSE: + MathExtra::quat_to_mat_trans(quat[j],a2); + MathExtra::diag_times3(well[jtype],a2,temp); + MathExtra::transpose_times3(a2,temp,b2); + MathExtra::diag_times3(shape[jtype],a2,temp); + MathExtra::transpose_times3(a2,temp,g2); + one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor); + ttor[0] = ttor[1] = ttor[2] = 0.0; + break; + + case ELLIPSE_SPHERE: + one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor); + rtor[0] = rtor[1] = rtor[2] = 0.0; + break; + + default: + MathExtra::quat_to_mat_trans(quat[j],a2); + MathExtra::diag_times3(well[jtype],a2,temp); + MathExtra::transpose_times3(a2,temp,b2); + MathExtra::diag_times3(shape[jtype],a2,temp); + MathExtra::transpose_times3(a2,temp,g2); + one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq, + fforce,ttor,rtor); + break; + } + + fforce[0] *= factor_lj; + fforce[1] *= factor_lj; + fforce[2] *= factor_lj; + ttor[0] *= factor_lj; + ttor[1] *= factor_lj; + ttor[2] *= factor_lj; + + f[i][0] += fforce[0]; + f[i][1] += fforce[1]; + f[i][2] += fforce[2]; + tor[i][0] += ttor[0]; + tor[i][1] += ttor[1]; + tor[i][2] += ttor[2]; + + if (eflag) evdwl = factor_lj*one_eng; + + if (j (b) ? (a) : (b)) + +// External functions from cuda library for atom decomposition + +bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void lj96_gpu_clear(); +int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); +void lj96_gpu_compute(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success); +double lj96_gpu_bytes(); + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairLJ96CutGPU::PairLJ96CutGPU(LAMMPS *lmp) : PairLJ96Cut(lmp), gpu_mode(GPU_PAIR) +{ + respa_enable = 0; + cpu_time = 0.0; +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJ96CutGPU::~PairLJ96CutGPU() +{ + lj96_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJ96CutGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + gpulist = lj96_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); + } else { + inum = list->inum; + lj96_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x, + atom->type, list->ilist, list->numneigh, list->firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, + success); + } + if (!success) + error->one("Out of memory on GPGPU"); + + if (host_startpair_match("gpu",0) == NULL) + error->all("Cannot use pair hybrid with multiple GPU pair styles"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + bool init_ok = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + if (!init_ok) + error->one("Insufficient memory on accelerator (or no fix gpu).\n"); + + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ96 pair style"); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairLJ96CutGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + lj96_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { + int i,j,ii,jj,inum,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj; + int *ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_lj = force->special_lj; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + + if (j < nall) factor_lj = 1.0; + else { + factor_lj = special_lj[j/nall]; + j %= nall; + } + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r2inv = 1.0/rsq; + r6inv = r2inv*r2inv*r2inv; + r3inv = sqrt(r6inv); + forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]); + fpair = factor_lj*forcelj*r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) - + offset[itype][jtype]; + evdwl *= factor_lj; + } + + if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJ96CutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { + int i,j,itype,jtype; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + int stride = nlocal-start; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj; + double *special_lj = force->special_lj; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + + // loop over neighbors of my atoms + + for (i = start; i < nlocal; i++) { + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + int *nbor = nbors + i - start; + int jnum = *nbor; + nbor += stride; + int *nbor_end = nbor + stride * jnum; + + for (; nbor (b) ? (a) : (b)) + +// External functions from cuda library for atom decomposition + +bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e); +void ljc_gpu_clear(); +int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q); +void ljc_gpu_compute(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q); +double ljc_gpu_bytes(); + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulCutGPU::PairLJCutCoulCutGPU(LAMMPS *lmp) : PairLJCutCoulCut(lmp), gpu_mode(GPU_PAIR) +{ + respa_enable = 0; + cpu_time = 0.0; +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJCutCoulCutGPU::~PairLJCutCoulCutGPU() +{ + ljc_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + gpulist = ljc_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, + atom->q); + } else { + inum = list->inum; + ljc_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x, + atom->type, list->ilist, list->numneigh, list->firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, + success, atom->q); + } + if (!success) + error->one("Out of memory on GPGPU"); + + if (host_startq_flag) + error->all("Pair style lj/cut/coul/cut requires atom attribute q"); + if (force->pair_match("gpu",0) == NULL) + error->all("Cannot use pair hybrid with multiple GPU pair styles"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + bool init_ok = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, + force->special_coul, force->qqrd2e); + if (!init_ok) + error->one("Insufficient memory on accelerator (or no fix gpu).\n"); + + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairLJCutCoulCutGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + ljc_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) +{ + int i,j,ii,jj,inum,jnum,itype,jtype; + double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; + double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; + int *ilist,*jlist,*numneigh,**firstneigh; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + int newton_pair = force->newton_pair; + double qqrd2e = force->qqrd2e; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + + if (j < nall) factor_coul = factor_lj = 1.0; + else { + factor_coul = special_coul[j/nall]; + factor_lj = special_lj[j/nall]; + j %= nall; + } + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r2inv = 1.0/rsq; + + if (rsq < cut_coulsq[itype][jtype]) + forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv); + else forcecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + r6inv = r2inv*r2inv*r2inv; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + } else forcelj = 0.0; + + fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + if (rsq < cut_coulsq[itype][jtype]) + ecoul = factor_coul * qqrd2e * qtmp*q[j]*sqrt(r2inv); + else ecoul = 0.0; + if (rsq < cut_ljsq[itype][jtype]) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - + offset[itype][jtype]; + evdwl *= factor_lj; + } else evdwl = 0.0; + } + + if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutGPU::cpu_compute(int *nbors, int start, int eflag, + int vflag) +{ + int i,j,jnum,itype,jtype; + double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; + double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + int stride = nlocal-start; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (i = start; i < nlocal; i++) { + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + int *nbor = nbors + i - start; + jnum = *nbor; + nbor += stride; + int *nbor_end = nbor + stride * jnum; + + for (; nbor (b) ? (a) : (b)) + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 + +// External functions from cuda library for atom decomposition + +bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); +void ljcl_gpu_clear(); +int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q); +void ljcl_gpu_compute(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q); +double ljcl_gpu_bytes(); + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulLongGPU::PairLJCutCoulLongGPU(LAMMPS *lmp) : PairLJCutCoulLong(lmp), gpu_mode(GPU_PAIR) +{ + respa_enable = 0; + cpu_time = 0.0; +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJCutCoulLongGPU::~PairLJCutCoulLongGPU() +{ + ljcl_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + gpulist = ljcl_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, + atom->q); + } else { + inum = list->inum; + ljcl_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x, + atom->type, list->ilist, list->numneigh, list->firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, + success, atom->q); + } + if (!success) + error->one("Out of memory on GPGPU"); + + if (host_startq_flag) + error->all("Pair style lj/cut/coul/cut requires atom attribute q"); + if (force->pair_match("gpu",0) == NULL) + error->all("Cannot use pair hybrid with multiple GPU pair styles"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + cut_coulsq = cut_coul * cut_coul; + + // insure use of KSpace long-range solver, set g_ewald + + if (force->kspace == NULL) + error->all("Pair style is incompatible with KSpace style"); + g_ewald = force->kspace->g_ewald; + + // setup force tables + + if (ncoultablebits) init_tables(); + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + bool init_ok = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, + force->special_coul, force->qqrd2e, g_ewald); + if (!init_ok) + error->one("Insufficient memory on accelerator (or no fix gpu).\n"); + + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairLJCutCoulLongGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + ljcl_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +{ + int i,j,ii,jj,inum,jnum,itype,jtype,itable; + double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; + double fraction,table; + double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; + double grij,expm2,prefactor,t,erfc; + int *ilist,*jlist,*numneigh,**firstneigh; + double rsq; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + + if (j < nall) factor_coul = factor_lj = 1.0; + else { + factor_coul = special_coul[j/nall]; + factor_lj = special_lj[j/nall]; + j %= nall; + } + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r2inv = 1.0/rsq; + + if (rsq < cut_coulsq) { + if (!ncoultablebits || rsq <= tabinnersq) { + r = sqrt(rsq); + grij = g_ewald * r; + expm2 = exp(-grij*grij); + t = 1.0 / (1.0 + EWALD_P*grij); + erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*q[j]/r; + forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; + } else { + union_int_float_t rsq_lookup; + rsq_lookup.f = rsq; + itable = rsq_lookup.i & ncoulmask; + itable >>= ncoulshiftbits; + fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; + table = ftable[itable] + fraction*dftable[itable]; + forcecoul = qtmp*q[j] * table; + if (factor_coul < 1.0) { + table = ctable[itable] + fraction*dctable[itable]; + prefactor = qtmp*q[j] * table; + forcecoul -= (1.0-factor_coul)*prefactor; + } + } + } else forcecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + r6inv = r2inv*r2inv*r2inv; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + } else forcelj = 0.0; + + fpair = (forcecoul + factor_lj*forcelj) * r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + if (!ncoultablebits || rsq <= tabinnersq) + ecoul = prefactor*erfc; + else { + table = etable[itable] + fraction*detable[itable]; + ecoul = qtmp*q[j] * table; + } + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } else ecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - + offset[itype][jtype]; + evdwl *= factor_lj; + } else evdwl = 0.0; + } + + if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, + int vflag) +{ + int i,j,jnum,itype,jtype,itable; + double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; + double fraction,table; + double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; + double grij,expm2,prefactor,t,erfc; + double rsq; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + int stride = nlocal-start; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (i = start; i < nlocal; i++) { + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + int *nbor = nbors + i - start; + jnum = *nbor; + nbor += stride; + int *nbor_end = nbor + stride * jnum; + + for (; nbor>= ncoulshiftbits; + fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; + table = ftable[itable] + fraction*dftable[itable]; + forcecoul = qtmp*q[j] * table; + if (factor_coul < 1.0) { + table = ctable[itable] + fraction*dctable[itable]; + prefactor = qtmp*q[j] * table; + forcecoul -= (1.0-factor_coul)*prefactor; + } + } + } else forcecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + r6inv = r2inv*r2inv*r2inv; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + } else forcelj = 0.0; + + fpair = (forcecoul + factor_lj*forcelj) * r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + if (!ncoultablebits || rsq <= tabinnersq) + ecoul = prefactor*erfc; + else { + table = etable[itable] + fraction*detable[itable]; + ecoul = qtmp*q[j] * table; + } + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; + } else ecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - + offset[itype][jtype]; + evdwl *= factor_lj; + } else evdwl = 0.0; + } + + if (j +#include "update.h" +#include "domain.h" +#include "string.h" #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) -#ifndef WINDLL +// External functions from cuda library for atom decomposition -// External functions from cuda library for force decomposition - -bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma, - double **epsilon, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, - double *special_lj, double *boxlo, double *boxhi, double cell_len, double skin, - const int max_nbors, const int gpu_id); -void lj_gpu_clear(); -double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall, - const int ago, const bool eflag, const bool vflag, - const double *boxlo, const double *boxhi); -void lj_gpu_name(const int gpu_id, const int max_nbors, char * name); -void lj_gpu_time(); -double lj_gpu_bytes(); - -#else -#include - -typedef bool (*_lj_gpu_init)(int &ij_size, const int ntypes, double **cutsq,double **sigma, - double **epsilon, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, - double *special_lj, double *boxlo, double *boxhi, double cell_len, double skin, - const int max_nbors, const int gpu_id); -typedef void (*_lj_gpu_clear)(); -typedef double (*_lj_gpu_cell)(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall, - const int ago, const bool eflag, const bool vflag, - const double *boxlo, const double *boxhi); -typedef void (*_lj_gpu_name)(const int gpu_id, const int max_nbors, char * name); -typedef void (*_lj_gpu_time)(); -typedef double (*_lj_gpu_bytes)(); - -_lj_gpu_init lj_gpu_init; -_lj_gpu_clear lj_gpu_clear; -_lj_gpu_cell lj_gpu_cell; -_lj_gpu_name lj_gpu_name; -_lj_gpu_time lj_gpu_time; -_lj_gpu_bytes lj_gpu_bytes; - -#endif +bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void ljl_gpu_clear(); +int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *boxlo, double *boxhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); +void ljl_gpu_compute(const int timestep, const int ago, const int inum, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success); +double ljl_gpu_bytes(); using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ -PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0) +PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), gpu_mode(GPU_PAIR) { - ij_new=NULL; respa_enable = 0; - -#ifdef WINDLL - HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll")); - if (hinstLib == NULL) { - printf("\nUnable to load gpu.dll\n"); - exit(1); - } - - lj_gpu_init=(_lj_gpu_init)GetProcAddress(hinstLib,"lj_gpu_init"); - lj_gpu_clear=(_lj_gpu_clear)GetProcAddress(hinstLib,"lj_gpu_clear"); - lj_gpu_cell=(_lj_gpu_cell)GetProcAddress(hinstLib,"lj_gpu_cell"); - lj_gpu_name=(_lj_gpu_name)GetProcAddress(hinstLib,"lj_gpu_name"); - lj_gpu_time=(_lj_gpu_time)GetProcAddress(hinstLib,"lj_gpu_time"); - lj_gpu_bytes=(_lj_gpu_bytes)GetProcAddress(hinstLib,"lj_gpu_bytes"); -#endif - + cpu_time = 0.0; } /* ---------------------------------------------------------------------- @@ -113,17 +75,7 @@ PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0) PairLJCutGPU::~PairLJCutGPU() { - printf("\n\n-------------------------------------"); - printf("--------------------------------\n"); - printf(" GPU Time Stamps: "); - printf("\n-------------------------------------"); - printf("--------------------------------\n"); - lj_gpu_time(); - printf("-------------------------------------"); - printf("--------------------------------\n\n"); - lj_gpu_clear(); - if (ij_new!=NULL) - delete [] ij_new; + ljl_gpu_clear(); } /* ---------------------------------------------------------------------- */ @@ -132,45 +84,37 @@ void PairLJCutGPU::compute(int eflag, int vflag) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + gpulist = ljl_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); + } else { + inum = list->inum; + ljl_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x, + atom->type, list->ilist, list->numneigh, list->firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, + success); + } + if (!success) + error->one("Out of memory on GPGPU"); - // compute forces on GPU - eng_vdwl = lj_gpu_cell(atom->f, virial, atom->x, atom->type, atom->nlocal, atom->nlocal + atom->nghost, - neighbor->ago, eflag, vflag, domain->boxlo, domain->boxhi); - - if (vflag_fdotr) virial_compute(); -} - -/* ---------------------------------------------------------------------- - global settings -------------------------------------------------------------------------- */ - -void PairLJCutGPU::settings(int narg, char **arg) -{ - // strip off GPU keyword/value and send remaining args to parent - - if (narg < 2) error->all("Illegal pair_style command"); - - // set multi_gpu_mode to one/node for multiple gpus on 1 node - // -- param is starting gpu id - // set multi_gpu_mode to one/gpu to select the same gpu id on every node - // -- param is id of gpu - // set multi_gpu_mode to multi/gpu to get ma - // -- param is number of gpus per node - - if (strcmp(arg[0],"one/node") == 0) - multi_gpu_mode = ONE_NODE; - else if (strcmp(arg[0],"one/gpu") == 0) - multi_gpu_mode = ONE_GPU; - else if (strcmp(arg[0],"multi/gpu") == 0) - multi_gpu_mode = MULTI_GPU; - else error->all("Illegal pair_style command"); - - multi_gpu_param = atoi(arg[1]); - - if (multi_gpu_mode == MULTI_GPU && multi_gpu_param < 1) - error->all("Illegal pair_style command"); - - PairLJCut::settings(narg-2,&arg[2]); + if (host_startpair_match("gpu",0) == NULL) error->all("Cannot use pair hybrid with multiple GPU pair styles"); - // set the GPU ID - - int my_gpu=comm->me+multi_gpu_param; - int ngpus=universe->nprocs; - if (multi_gpu_mode==ONE_GPU) { - my_gpu=multi_gpu_param; - ngpus=1; - } else if (multi_gpu_mode==MULTI_GPU) { - ngpus=multi_gpu_param; - my_gpu=comm->me%ngpus; - if (ngpus>universe->nprocs) - ngpus=universe->nprocs; - } - - cut_respa=NULL; - // Repeat cutsq calculation because done after call to init_style - double cut; - for (int i = 1; i <= atom->ntypes; i++) - for (int j = i; j <= atom->ntypes; j++) { - cut = init_one(i,j); - cutsq[i][j] = cutsq[j][i] = cut*cut; - } - - // use the max cutoff length as the cell length double maxcut = -1.0; - for (int i = 1; i <= atom->ntypes; i++) - for (int j = i; j <= atom->ntypes; j++) - if (cutsq[i][j] > maxcut) maxcut = cutsq[i][j]; - - // for this problem, adding skin results in better perf - // this may be a parameter in the future - double cell_len = sqrt(maxcut) + neighbor->skin; - - if (!lj_gpu_init(ij_size, atom->ntypes+1, cutsq, sigma, epsilon, lj1, lj2, lj3, - lj4, offset, force->special_lj, domain->boxlo, domain->boxhi, - cell_len, neighbor->skin, neighbor->oneatom, my_gpu)) - error->one("At least one process could not allocate a CUDA-enabled gpu"); - - if (ij_new!=NULL) - delete [] ij_new; - ij_new=new int[ij_size]; - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU lj/cut pair style"); - - if (comm->me == 0 && screen) { - printf("\n-------------------------------------"); - printf("-------------------------------------\n"); - printf("- Using GPGPU acceleration for LJ-Cut:\n"); - printf("-------------------------------------"); - printf("-------------------------------------\n"); - - for (int i=0; ioneatom,gpu_string); - printf("GPU %d: %s\n",gpui,gpu_string); + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; } - printf("-------------------------------------"); - printf("-------------------------------------\n\n"); + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + if (!init_ok) + error->one("Insufficient memory on accelerator (or no fix gpu).\n"); + + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; } } @@ -254,6 +169,149 @@ void PairLJCutGPU::init_style() double PairLJCutGPU::memory_usage() { - double bytes=Pair::memory_usage()+ij_size*sizeof(int); - return bytes+lj_gpu_bytes(); + double bytes = Pair::memory_usage(); + return bytes + ljl_gpu_bytes(); } + +/* ---------------------------------------------------------------------- */ + +void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { + int i,j,ii,jj,inum,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r2inv,r6inv,forcelj,factor_lj; + int *ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_lj = force->special_lj; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + + if (j < nall) factor_lj = 1.0; + else { + factor_lj = special_lj[j/nall]; + j %= nall; + } + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r2inv = 1.0/rsq; + r6inv = r2inv*r2inv*r2inv; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + fpair = factor_lj*forcelj*r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - + offset[itype][jtype]; + evdwl *= factor_lj; + } + + if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { + int i,j,itype,jtype; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + int stride = nlocal-start; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r2inv,r6inv,forcelj,factor_lj; + double *special_lj = force->special_lj; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + + // loop over neighbors of my atoms + + for (i = start; i < nlocal; i++) { + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + int *nbor = nbors + i - start; + int jnum = *nbor; + nbor += stride; + int *nbor_end = nbor + stride * jnum; + + for (; nbor