Changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5278 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
pscrozi 2010-11-23 00:41:14 +00:00
parent 5a82c99485
commit 4e52d8d927
18 changed files with 3104 additions and 536 deletions

View File

@ -1,6 +1,7 @@
# Install/unInstall package files in LAMMPS
# edit Makefile.package to include/exclude GPU library
# do not copy gayberne files if non-GPU version does not exist
# do not copy charmm files if non-GPU version does not exist
if (test $1 = 1) then
@ -12,14 +13,36 @@ if (test $1 = 1) then
sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(gpu_SYSPATH) |' ../Makefile.package
sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package
fi
if (test -e ../pair_gayberne.cpp) then
cp pair_gayberne_gpu.cpp ..
cp pair_gayberne_gpu.h ..
fi
if (test -e ../pair_lj_cut_coul_long.cpp) then
cp pair_lj_cut_coul_long_gpu.cpp ..
cp pair_lj_cut_coul_long_gpu.h ..
fi
if (test -e ../pair_cg_cmm.cpp) then
cp pair_cg_cmm_gpu.cpp ..
cp pair_cg_cmm_gpu.h ..
fi
if (test -e ../pair_cg_cmm_coul_long.cpp) then
cp pair_cg_cmm_coul_long_gpu.cpp ..
cp pair_cg_cmm_coul_long_gpu.h ..
fi
cp pair_lj_cut_gpu.cpp ..
cp pair_lj96_cut_gpu.cpp ..
cp pair_lj_cut_coul_cut_gpu.cpp ..
cp pair_lj_cut_gpu.h ..
cp pair_lj96_cut_gpu.h ..
cp pair_lj_cut_coul_cut_gpu.h ..
cp fix_gpu.cpp ..
cp fix_gpu.h ..
elif (test $1 = 0) then
@ -27,11 +50,23 @@ elif (test $1 = 0) then
sed -i -e 's/[^ \t]*gpu //' ../Makefile.package
sed -i -e 's/[^ \t]*gpu_[^ \t]*) //' ../Makefile.package
fi
rm ../pair_gayberne_gpu.cpp
rm ../pair_lj_cut_gpu.cpp
rm ../pair_lj96_cut_gpu.cpp
rm ../pair_lj_cut_coul_cut_gpu.cpp
rm ../pair_lj_cut_coul_long_gpu.cpp
rm ../pair_cg_cmm_gpu.cpp
rm ../pair_cg_cmm_coul_long_gpu.cpp
rm ../fix_gpu.cpp
rm ../pair_gayberne_gpu.h
rm ../pair_lj_cut_gpu.h
rm ../pair_lj96_cut_gpu.h
rm ../pair_lj_cut_coul_cut_gpu.h
rm ../pair_lj_cut_coul_long_gpu.h
rm ../pair_cg_cmm_gpu.h
rm ../pair_cg_cmm_coul_long_gpu.h
rm ../fix_gpu.h
fi

View File

@ -3,10 +3,40 @@
# do not copy gayberne files if non-GPU version does not exist
for file in *.cpp *.h; do
if (test $file == pair_gayberne_gpu.cpp -a ! -e ../pair_gayberne.cpp) then
if (test $file = pair_gayberne_gpu.cpp -a ! -e ../pair_gayberne.cpp) then
continue
fi
if (test $file == pair_gayberne_gpu.h -a ! -e ../pair_gayberne.cpp) then
if (test $file = pair_gayberne_gpu.h -a ! -e ../pair_gayberne.cpp) then
continue
fi
if (test $file = pair_lj_cut_coul_long_gpu.cpp -a ! -e ../pair_lj_cut_coul_long.cpp) then
continue
fi
if (test $file = pair_lj_cut_coul_long_gpu.h -a ! -e ../pair_lj_cut_coul_long.cpp) then
continue
fi
if (test $file = pair_cg_cmm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_long_gpu.cpp -a ! -e ../pair_cg_cmm_coul_long.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_long_gpu.h -a ! -e ../pair_cg_cmm_coul_long.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm.cpp -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm.h -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm_gpu.cpp -a ! -e ../pair_cg_cmm.cpp) then
continue
fi
if (test $file = pair_cg_cmm_coul_msm_gpu.h -a ! -e ../pair_cg_cmm.cpp) then
continue
fi

148
src/GPU/fix_gpu.cpp Normal file
View File

@ -0,0 +1,148 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "string.h"
#include "stdlib.h"
#include "fix_gpu.h"
#include "atom.h"
#include "force.h"
#include "pair.h"
#include "respa.h"
#include "input.h"
#include "error.h"
#include "timer.h"
#include "modify.h"
#include "domain.h"
using namespace LAMMPS_NS;
enum{GPU_FORCE, GPU_NEIGH};
extern bool lmp_init_device(const int first_gpu, const int last_gpu,
const int gpu_mode, const double particle_split);
extern void lmp_clear_device();
extern double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul);
/* ---------------------------------------------------------------------- */
FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
Fix(lmp, narg, arg)
{
if (narg < 7) error->all("Illegal fix gpu command");
if (strcmp(arg[1],"all") != 0)
error->all("Illegal fix gpu command");
int gpu_mode, first_gpu, last_gpu;
double particle_split;
if (strcmp(arg[3],"force") == 0)
gpu_mode = GPU_FORCE;
else if (strcmp(arg[3],"force/neigh") == 0) {
gpu_mode = GPU_NEIGH;
if (domain->triclinic)
error->all("Cannot use force/neigh with triclinic box.");
} else
error->all("Illegal fix gpu command.");
first_gpu = atoi(arg[4]);
last_gpu = atoi(arg[5]);
particle_split = force->numeric(arg[6]);
if (particle_split==0 || particle_split>1)
error->all("Illegal fix gpu command.");
if (!lmp_init_device(first_gpu,last_gpu,gpu_mode,particle_split))
error->one("Could not find or initialize a specified accelerator device.");
}
/* ---------------------------------------------------------------------- */
FixGPU::~FixGPU()
{
lmp_clear_device();
}
/* ---------------------------------------------------------------------- */
int FixGPU::setmask()
{
int mask = 0;
mask |= POST_FORCE;
mask |= MIN_POST_FORCE;
return mask;
}
/* ---------------------------------------------------------------------- */
void FixGPU::init()
{
// Can only have 1 gpu fix that must be the first fix for a run
if ((void*)modify->fix[0] != (void*)this)
error->all("GPU is not the first fix for this run.");
}
/* ---------------------------------------------------------------------- */
void FixGPU::setup(int vflag)
{
post_force(vflag);
}
/* ---------------------------------------------------------------------- */
void FixGPU::min_setup(int vflag)
{
post_force(vflag);
}
/* ---------------------------------------------------------------------- */
void FixGPU::post_force(int vflag)
{
timer->stamp();
double lvirial[6];
for (int i = 0; i < 6; i++) lvirial[i] = 0.0;
double my_eng = lmp_gpu_forces(atom->f, atom->torque, force->pair->eatom,
force->pair->vatom, lvirial,
force->pair->eng_coul);
force->pair->eng_vdwl += my_eng;
force->pair->virial[0] += lvirial[0];
force->pair->virial[1] += lvirial[1];
force->pair->virial[2] += lvirial[2];
force->pair->virial[3] += lvirial[3];
force->pair->virial[4] += lvirial[4];
force->pair->virial[5] += lvirial[5];
if (force->pair->vflag_fdotr) force->pair->virial_compute();
timer->stamp(TIME_PAIR);
}
/* ---------------------------------------------------------------------- */
void FixGPU::min_post_force(int vflag)
{
post_force(vflag);
}
/* ---------------------------------------------------------------------- */
double FixGPU::memory_usage()
{
double bytes = 0.0;
// Memory usage currently returned by pair routine
return bytes;
}

45
src/GPU/fix_gpu.h Normal file
View File

@ -0,0 +1,45 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef FIX_CLASS
FixStyle(gpu,FixGPU)
#else
#ifndef LMP_FIX_GPU_H
#define LMP_FIX_GPU_H
#include "fix.h"
namespace LAMMPS_NS {
class FixGPU : public Fix {
public:
FixGPU(class LAMMPS *, int, char **);
~FixGPU();
int setmask();
void init();
void setup(int);
void min_setup(int);
void post_force(int);
void min_post_force(int);
double memory_usage();
private:
};
}
#endif
#endif

View File

@ -0,0 +1,499 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_cg_cmm_coul_long_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#include "kspace.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429
// External functions from cuda library for atom decomposition
bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald);
void cmml_gpu_clear();
int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q);
void cmml_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q);
double cmml_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairCGCMMCoulLongGPU::PairCGCMMCoulLongGPU(LAMMPS *lmp) : PairCGCMMCoulLong(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairCGCMMCoulLongGPU::~PairCGCMMCoulLongGPU()
{
cmml_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = cmml_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->q);
} else {
inum = list->inum;
cmml_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success, atom->q);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::init_style()
{
cut_respa = NULL;
if (!atom->q_flag)
error->all("Pair style cg/cmm/coul/long requires atom attribute q");
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
// insure use of KSpace long-range solver, set g_ewald
if (force->kspace == NULL)
error->all("Pair style is incompatible with KSpace style");
g_ewald = force->kspace->g_ewald;
// setup force tables
if (ncoultablebits) init_tables();
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3,
lj4, offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, cut_ljsq,
cut_coulsq_global, force->special_coul,
force->qqrd2e, g_ewald);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU cg/cmm pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairCGCMMCoulLongGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + cmml_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
int *ilist,*jlist,*numneigh,**firstneigh;
double rsq;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
const double delx = xtmp - x[j][0];
const double dely = ytmp - x[j][1];
const double delz = ztmp - x[j][2];
const double rsq = delx*delx + dely*dely + delz*delz;
const int jtype = type[j];
double evdwl = 0.0;
double ecoul = 0.0;
double fpair = 0.0;
if (rsq < cutsq[itype][jtype]) {
const double r2inv = 1.0/rsq;
const int cgt=cg_type[itype][jtype];
double forcelj = 0.0;
double forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
forcelj=factor_lj;
if (eflag) evdwl=factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv=r2inv*r2inv;
forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
forcelj *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
forcelj *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
}
if (rsq < cut_coulsq_global) {
if (!ncoultablebits || rsq <= tabinnersq) {
const double r = sqrt(rsq);
const double grij = g_ewald * r;
const double expm2 = exp(-grij*grij);
const double t = 1.0 / (1.0 + EWALD_P*grij);
const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const double prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (eflag) ecoul = prefactor*erfc;
if (factor_coul < 1.0) {
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
int itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
const double fraction = (rsq_lookup.f - rtable[itable]) *
drtable[itable];
const double table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (eflag) {
const double table2 = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table2;
}
if (factor_coul < 1.0) {
const double table2 = ctable[itable] + fraction*dctable[itable];
const double prefactor = qtmp*q[j] * table2;
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
}
}
fpair = (forcecoul + forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairCGCMMCoulLongGPU::cpu_compute(int *nbors, int start, int eflag,
int vflag)
{
int i,j,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
double rsq;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
double evdwl = 0.0;
double ecoul = 0.0;
double fpair = 0.0;
if (rsq < cutsq[itype][jtype]) {
const double r2inv = 1.0/rsq;
const int cgt=cg_type[itype][jtype];
double forcelj = 0.0;
double forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
forcelj=factor_lj;
if (eflag) evdwl=factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv=r2inv*r2inv;
forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
forcelj *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
forcelj *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
}
if (rsq < cut_coulsq_global) {
if (!ncoultablebits || rsq <= tabinnersq) {
const double r = sqrt(rsq);
const double grij = g_ewald * r;
const double expm2 = exp(-grij*grij);
const double t = 1.0 / (1.0 + EWALD_P*grij);
const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const double prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (eflag) ecoul = prefactor*erfc;
if (factor_coul < 1.0) {
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
int itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
const double fraction = (rsq_lookup.f - rtable[itable]) *
drtable[itable];
const double table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (eflag) {
const double table2 = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table2;
}
if (factor_coul < 1.0) {
const double table2 = ctable[itable] + fraction*dctable[itable];
const double prefactor = qtmp*q[j] * table2;
forcecoul -= (1.0-factor_coul)*prefactor;
if (eflag) ecoul -= (1.0-factor_coul)*prefactor;
}
}
}
fpair = (forcecoul + forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
}

View File

@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(cg/cmm/coul/long/gpu,PairCGCMMCoulLongGPU)
#else
#ifndef LMP_PAIR_CG_CMM_COUL_LONG_GPU_H
#define LMP_PAIR_CG_CMM_COUL_LONG_GPU_H
#include "pair_cg_cmm_coul_long.h"
namespace LAMMPS_NS {
class PairCGCMMCoulLongGPU : public PairCGCMMCoulLong {
public:
PairCGCMMCoulLongGPU(LAMMPS *lmp);
~PairCGCMMCoulLongGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

362
src/GPU/pair_cg_cmm_gpu.cpp Normal file
View File

@ -0,0 +1,362 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_cg_cmm_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen);
void cmm_gpu_clear();
int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
void cmm_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success);
double cmm_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairCGCMMGPU::PairCGCMMGPU(LAMMPS *lmp) : PairCGCMM(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairCGCMMGPU::~PairCGCMMGPU()
{
cmm_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = cmm_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
} else {
inum = list->inum;
cmm_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairCGCMMGPU::init_style()
{
cut_respa = NULL;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU CGCMM pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairCGCMMGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + cmm_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) {
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
const int cgt=cg_type[itype][jtype];
r2inv = 1.0/rsq;
fpair = factor_lj;
if (eflag) evdwl = factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv = r2inv*r2inv;
fpair *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
fpair *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
fpair *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
fpair *= r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairCGCMMGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
int i,j,itype,jtype;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
double *special_lj = force->special_lj;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
const int cgt=cg_type[itype][jtype];
r2inv = 1.0/rsq;
fpair = factor_lj;
if (eflag) evdwl = factor_lj;
if (cgt == CG_LJ12_4) {
const double r4inv = r2inv*r2inv;
fpair *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else if (cgt == CG_LJ9_6) {
const double r3inv = r2inv*sqrt(r2inv);
const double r6inv = r3inv*r3inv;
fpair *= r6inv*(lj1[itype][jtype]*r3inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r3inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
} else {
const double r6inv = r2inv*r2inv*r2inv;
fpair *= r6inv*(lj1[itype][jtype]*r6inv
- lj2[itype][jtype]);
if (eflag) {
evdwl *= r6inv*(lj3[itype][jtype]*r6inv
- lj4[itype][jtype]) - offset[itype][jtype];
}
}
fpair *= r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
}

47
src/GPU/pair_cg_cmm_gpu.h Normal file
View File

@ -0,0 +1,47 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(cg/cmm/gpu,PairCGCMMGPU)
#else
#ifndef LMP_PAIR_CG_CMM_GPU_H
#define LMP_PAIR_CG_CMM_GPU_H
#include "pair_cg_cmm.h"
namespace LAMMPS_NS {
class PairCGCMMGPU : public PairCGCMM {
public:
PairCGCMMGPU(LAMMPS *lmp);
~PairCGCMMGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

View File

@ -31,123 +31,47 @@
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include <string>
#ifdef GB_GPU_OMP
#include "omp.h"
#endif
#include "domain.h"
#include "update.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#ifndef WINDLL
// External functions from cuda library for atom decomposition
bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall, const int max_nbors,
const int thread, const int gpu_id);
void gb_gpu_clear(const int thread);
int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
int *ilist, const int *numj, const int *type,
const int thread, bool &success);
void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
const int thread);
void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type,
const bool rebuild, const int thread);
void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
const int thread);
double gb_gpu_forces(double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial, const int thread);
void gb_gpu_name(const int gpu_id, const int max_nbors, char * name);
void gb_gpu_time(const int thread);
int gb_gpu_num_devices();
bool gb_gpu_init(const int ntypes, const double gamma, const double upsilon,
const double mu, double **shape, double **well, double **cutsq,
double **sigma, double **epsilon, double *host_lshape,
int **form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
int &gpu_mode, FILE *screen);
void gb_gpu_clear();
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double **host_quat);
int * gb_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double **host_quat);
double gb_gpu_bytes();
#else
#include <windows.h>
typedef bool (*_gb_gpu_init)(int &ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall, const int max_nbors,
const int thread, const int gpu_id);
typedef void (*_gb_gpu_clear)(const int thread);
typedef int * (*_gb_gpu_reset_nbors)(const int nall, const int nlocal,
const int inum, int *ilist, const int *numj, const int *type,
const int thread, bool &success);
typedef void (*_gb_gpu_nbors)(const int *ij, const int num_ij, const bool eflag,
const int thread);
typedef void (*_gb_gpu_atom)(double **host_x, double **host_quat,
const int *host_type, const bool rebuild, const int thread);
typedef void (*_gb_gpu_gayberne)(const bool eflag, const bool vflag,
const bool rebuild, const int thread);
typedef double (*_gb_gpu_forces)(double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial, const int thread);
typedef void (*_gb_gpu_name)(const int gpu_id, const int max_nbors,
char * name);
typedef void (*_gb_gpu_time)(const int thread);
typedef int (*_gb_gpu_num_devices)();
typedef double (*_gb_gpu_bytes)();
_gb_gpu_init gb_gpu_init;
_gb_gpu_clear gb_gpu_clear;
_gb_gpu_reset_nbors gb_gpu_reset_nbors;
_gb_gpu_nbors gb_gpu_nbors;
_gb_gpu_atom gb_gpu_atom;
_gb_gpu_gayberne gb_gpu_gayberne;
_gb_gpu_forces gb_gpu_forces;
_gb_gpu_name gb_gpu_name;
_gb_gpu_time gb_gpu_time;
_gb_gpu_num_devices gb_gpu_num_devices;
_gb_gpu_bytes gb_gpu_bytes;
#endif
using namespace LAMMPS_NS;
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
/* ---------------------------------------------------------------------- */
PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),
omp_chunk(0), nthreads(1),
multi_gpu_mode(ONE_NODE),
multi_gpu_param(0),
output_time(false)
PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp),
gpu_mode(GPU_PAIR)
{
ij_new[0]=NULL;
#ifdef WINDLL
HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
if (hinstLib == NULL) {
printf("\nUnable to load gpu.dll\n");
exit(1);
}
gb_gpu_init=(_gb_gpu_init)GetProcAddress(hinstLib,"gb_gpu_init");
gb_gpu_clear=(_gb_gpu_clear)GetProcAddress(hinstLib,"gb_gpu_clear");
gb_gpu_reset_nbors=(_gb_gpu_reset_nbors)GetProcAddress(hinstLib,"gb_gpu_reset_nbors");
gb_gpu_nbors=(_gb_gpu_nbors)GetProcAddress(hinstLib,"gb_gpu_nbors");
gb_gpu_atom=(_gb_gpu_atom)GetProcAddress(hinstLib,"gb_gpu_atom");
gb_gpu_gayberne=(_gb_gpu_gayberne)GetProcAddress(hinstLib,"gb_gpu_gayberne");
gb_gpu_forces=(_gb_gpu_forces)GetProcAddress(hinstLib,"gb_gpu_forces");
gb_gpu_name=(_gb_gpu_name)GetProcAddress(hinstLib,"gb_gpu_name");
gb_gpu_time=(_gb_gpu_time)GetProcAddress(hinstLib,"gb_gpu_time");
gb_gpu_num_devices=(_gb_gpu_num_devices)GetProcAddress(hinstLib,"gb_gpu_num_devices");
gb_gpu_bytes=(_gb_gpu_bytes)GetProcAddress(hinstLib,"gb_gpu_bytes");
#endif
}
/* ----------------------------------------------------------------------
@ -156,26 +80,8 @@ PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),
PairGayBerneGPU::~PairGayBerneGPU()
{
if (output_time) {
printf("\n\n-------------------------------------");
printf("--------------------------------\n");
printf(" GPU Time Stamps (on proc 0): ");
printf("\n-------------------------------------");
printf("--------------------------------\n");
gb_gpu_time(my_thread);
printf("-------------------------------------");
printf("--------------------------------\n\n");
}
#pragma omp parallel
{
#ifdef GB_GPU_OMP
int my_thread=omp_get_thread_num();
#endif
gb_gpu_clear(my_thread);
if (ij_new[my_thread]!=NULL)
delete [] ij_new[my_thread];
}
gb_gpu_clear();
cpu_time = 0.0;
}
/* ---------------------------------------------------------------------- */
@ -183,141 +89,38 @@ PairGayBerneGPU::~PairGayBerneGPU()
void PairGayBerneGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = eflag_atom = vflag_atom = 0;
if (vflag_atom)
error->all("Per-atom virial not available with GPU Gay-Berne");
else evflag = vflag_fdotr = 0;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int inum = list->inum;
bool rebuild=false;
if (neighbor->ncalls > last_neighbor) {
last_neighbor=neighbor->ncalls;
rebuild=true;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = gb_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo, domain->subhi,
eflag, vflag, eflag_atom, vflag_atom, host_start,
cpu_time, success, atom->quat);
} else {
inum = list->inum;
olist = gb_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh,
list->firstneigh, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->quat);
}
#pragma omp parallel
{
bool success=true;
#ifdef GB_GPU_OMP
int my_thread=omp_get_thread_num();
if (rebuild) {
omp_chunk=static_cast<int>(ceil(static_cast<double>(inum)/nthreads));
if (my_thread==nthreads-1)
thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk;
else
thread_inum[my_thread]=omp_chunk;
olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal,
thread_inum[my_thread],
list->ilist+omp_chunk*my_thread,
list->numneigh, atom->type, my_thread,
success);
}
#else
if (rebuild)
olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist,
list->numneigh, atom->type, my_thread,
success);
#endif
if (!success)
error->one("Out of memory on GPGPU");
// copy atom data to GPU
gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread);
int i,j,ii,jj,jnum;
double factor_lj;
int *jlist;
if (rebuild==true) {
int num_ij = 0;
// loop over neighbors of my atoms
int *ijp=ij_new[my_thread];
#ifdef GB_GPU_OMP
int mgo=my_thread*omp_chunk;
int mgot=mgo+thread_inum[my_thread];
#else
int mgo=0, mgot=inum;
#endif
for (ii = mgo; ii<mgot; ii++) {
i = olist[my_thread][ii];
jlist = list->firstneigh[i];
jnum = list->numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
*ijp=j;
ijp++;
num_ij++;
if (num_ij==ij_size) {
gb_gpu_nbors(ij_new[my_thread],num_ij,eflag,my_thread);
ijp=ij_new[my_thread];
num_ij=0;
}
}
}
if (num_ij>0)
gb_gpu_nbors(ij_new[my_thread],num_ij,eflag,my_thread);
if (host_start < inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist,host_start,eflag,vflag);
else
cpu_compute(host_start,eflag,vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
gb_gpu_gayberne(eflag,vflag,rebuild,my_thread);
double lvirial[6];
for (int i=0; i<6; i++) lvirial[i]=0.0;
double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag,
eflag_atom, vflag_atom, eatom, vatom, lvirial,
my_thread);
#pragma omp critical
{
eng_vdwl+=my_eng;
virial[0]+=lvirial[0];
virial[1]+=lvirial[1];
virial[2]+=lvirial[2];
virial[3]+=lvirial[3];
virial[4]+=lvirial[4];
virial[5]+=lvirial[5];
}
} //End parallel
if (vflag_fdotr) virial_compute();
}
/* ----------------------------------------------------------------------
global settings
------------------------------------------------------------------------- */
void PairGayBerneGPU::settings(int narg, char **arg)
{
// strip off GPU keyword/value and send remaining args to parent
if (narg < 2) error->all("Illegal pair_style command");
// set multi_gpu_mode to one/node for multiple gpus on 1 node
// -- param is starting gpu id
// set multi_gpu_mode to one/gpu to select the same gpu id on every node
// -- param is id of gpu
// set multi_gpu_mode to multi/gpu to get ma
// -- param is number of gpus per node
if (strcmp(arg[0],"one/node") == 0)
multi_gpu_mode = ONE_NODE;
else if (strcmp(arg[0],"one/gpu") == 0)
multi_gpu_mode = ONE_GPU;
else if (strcmp(arg[0],"multi/gpu") == 0)
multi_gpu_mode = MULTI_GPU;
else error->all("Illegal pair_style command");
multi_gpu_param = atoi(arg[1]);
if (multi_gpu_mode == MULTI_GPU && multi_gpu_param < 1)
error->all("Illegal pair_style command");
PairGayBerne::settings(narg-2,&arg[2]);
}
/* ----------------------------------------------------------------------
@ -326,8 +129,6 @@ void PairGayBerneGPU::settings(int narg, char **arg)
void PairGayBerneGPU::init_style()
{
if (comm->me == 0)
output_time=true;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type)
@ -335,24 +136,7 @@ void PairGayBerneGPU::init_style()
if (atom->radius_flag)
error->all("Pair gayberne cannot be used with atom attribute diameter");
// set the GPU ID
int my_gpu=comm->me+multi_gpu_param;
int ngpus=universe->nprocs;
if (multi_gpu_mode==ONE_GPU) {
my_gpu=multi_gpu_param;
ngpus=1;
} else if (multi_gpu_mode==MULTI_GPU) {
ngpus=multi_gpu_param;
my_gpu=comm->me%ngpus;
if (ngpus>universe->nprocs)
ngpus=universe->nprocs;
}
int irequest = neighbor->request(this);
// per-type shape precalculations
for (int i = 1; i <= atom->ntypes; i++) {
if (setwell[i]) {
double *one = atom->shape[i];
@ -364,72 +148,38 @@ void PairGayBerneGPU::init_style()
}
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++)
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
cut = init_one(i,j);
cutsq[i][j] = cutsq[j][i] = cut*cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
// If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads
#ifdef GB_GPU_OMP
if (multi_gpu_mode!=ONE_GPU)
nthreads=ngpus=gb_gpu_num_devices();
else
nthreads=ngpus=1;
if (nthreads>MAX_GPU_THREADS)
nthreads=MAX_GPU_THREADS;
omp_set_num_threads(nthreads);
#endif
#pragma omp parallel firstprivate(my_gpu)
{
#ifdef GB_GPU_OMP
int my_thread = omp_get_thread_num();
if (multi_gpu_mode!=ONE_GPU)
my_gpu=my_thread;
if (multi_gpu_mode==ONE_NODE)
my_gpu+=multi_gpu_param;
#endif
bool init_ok=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu,
double cell_size = sqrt(maxcut) + neighbor->skin;
bool init_ok = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu,
shape, well, cutsq, sigma, epsilon, lshape, form,
lj1, lj2, lj3, lj4, offset, force->special_lj,
atom->nlocal, atom->nlocal+atom->nghost, 300,
my_thread, my_gpu);
if (!init_ok)
error->one("At least 1 proc could not allocate a CUDA gpu or memory");
if (ij_new[my_thread]!=NULL)
delete [] ij_new[my_thread];
ij_new[my_thread]=new int[ij_size];
}
last_neighbor = -1;
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU GayBerne pair style");
error->all("Cannot use newton pair with GPU Gay-Berne pair style");
if (comm->me == 0 && screen) {
printf("\n-------------------------------------");
printf("-------------------------------------\n");
printf("- Using GPGPU acceleration for Gay-Berne:\n");
printf("-------------------------------------");
printf("-------------------------------------\n");
for (int i=0; i<ngpus; i++) {
int gpui=my_gpu;
if (multi_gpu_mode==ONE_NODE)
gpui=i+multi_gpu_param;
else if (multi_gpu_mode==MULTI_GPU)
gpui=i;
char gpu_string[500];
gb_gpu_name(gpui,neighbor->oneatom,gpu_string);
printf("GPU %d: %s\n",gpui,gpu_string);
}
printf("-------------------------------------");
printf("-------------------------------------\n\n");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
@ -437,6 +187,275 @@ void PairGayBerneGPU::init_style()
double PairGayBerneGPU::memory_usage()
{
double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int);
return bytes+gb_gpu_bytes();
double bytes = Pair::memory_usage();
return bytes + gb_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype;
double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
double fforce[3],ttor[3],rtor[3],r12[3];
double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3];
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
double **quat = atom->quat;
double **tor = atom->torque;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = olist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
itype = type[i];
if (form[itype][itype] == ELLIPSE_ELLIPSE) {
MathExtra::quat_to_mat_trans(quat[i],a1);
MathExtra::diag_times3(well[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,b1);
MathExtra::diag_times3(shape[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,g1);
}
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
// r12 = center to center vector
r12[0] = x[j][0]-x[i][0];
r12[1] = x[j][1]-x[i][1];
r12[2] = x[j][2]-x[i][2];
rsq = MathExtra::dot3(r12,r12);
jtype = type[j];
// compute if less than cutoff
if (rsq < cutsq[itype][jtype]) {
switch (form[itype][jtype]) {
case SPHERE_SPHERE:
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
forcelj *= -r2inv;
if (eflag) one_eng =
r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
offset[itype][jtype];
fforce[0] = r12[0]*forcelj;
fforce[1] = r12[1]*forcelj;
fforce[2] = r12[2]*forcelj;
ttor[0] = ttor[1] = ttor[2] = 0.0;
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
case SPHERE_ELLIPSE:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor);
ttor[0] = ttor[1] = ttor[2] = 0.0;
break;
case ELLIPSE_SPHERE:
one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor);
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
default:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq,
fforce,ttor,rtor);
break;
}
fforce[0] *= factor_lj;
fforce[1] *= factor_lj;
fforce[2] *= factor_lj;
ttor[0] *= factor_lj;
ttor[1] *= factor_lj;
ttor[2] *= factor_lj;
f[i][0] += fforce[0];
f[i][1] += fforce[1];
f[i][2] += fforce[2];
tor[i][0] += ttor[0];
tor[i][1] += ttor[1];
tor[i][2] += ttor[2];
if (eflag) evdwl = factor_lj*one_eng;
if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],fforce[2],
-r12[0],-r12[1],-r12[2]);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairGayBerneGPU::cpu_compute(int *nbors, int start, int eflag, int vflag)
{
int i,j,itype,jtype;
double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
double fforce[3],ttor[3],rtor[3],r12[3];
double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3];
double **x = atom->x;
double **f = atom->f;
double **quat = atom->quat;
double **tor = atom->torque;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_lj = force->special_lj;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
itype = type[i];
if (form[itype][itype] == ELLIPSE_ELLIPSE) {
MathExtra::quat_to_mat_trans(quat[i],a1);
MathExtra::diag_times3(well[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,b1);
MathExtra::diag_times3(shape[itype],a1,temp);
MathExtra::transpose_times3(a1,temp,g1);
}
int *nbor = nbors+i-start;
int jnum =* nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for ( ; nbor < nbor_end; nbor += stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
// r12 = center to center vector
r12[0] = x[j][0]-x[i][0];
r12[1] = x[j][1]-x[i][1];
r12[2] = x[j][2]-x[i][2];
rsq = MathExtra::dot3(r12,r12);
jtype = type[j];
// compute if less than cutoff
if (rsq < cutsq[itype][jtype]) {
switch (form[itype][jtype]) {
case SPHERE_SPHERE:
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
forcelj *= -r2inv;
if (eflag) one_eng =
r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
offset[itype][jtype];
fforce[0] = r12[0]*forcelj;
fforce[1] = r12[1]*forcelj;
fforce[2] = r12[2]*forcelj;
ttor[0] = ttor[1] = ttor[2] = 0.0;
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
case SPHERE_ELLIPSE:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor);
ttor[0] = ttor[1] = ttor[2] = 0.0;
break;
case ELLIPSE_SPHERE:
one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor);
rtor[0] = rtor[1] = rtor[2] = 0.0;
break;
default:
MathExtra::quat_to_mat_trans(quat[j],a2);
MathExtra::diag_times3(well[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,b2);
MathExtra::diag_times3(shape[jtype],a2,temp);
MathExtra::transpose_times3(a2,temp,g2);
one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq,
fforce,ttor,rtor);
break;
}
fforce[0] *= factor_lj;
fforce[1] *= factor_lj;
fforce[2] *= factor_lj;
ttor[0] *= factor_lj;
ttor[1] *= factor_lj;
ttor[2] *= factor_lj;
f[i][0] += fforce[0];
f[i][1] += fforce[1];
f[i][2] += fforce[2];
tor[i][0] += ttor[0];
tor[i][1] += ttor[1];
tor[i][2] += ttor[2];
if (eflag) evdwl = factor_lj*one_eng;
if (j<start) {
if (evflag) ev_tally_xyz_full(i,evdwl,0.0,fforce[0],fforce[1],
fforce[2],-r12[0],-r12[1],-r12[2]);
} else {
if (j < nlocal) {
rtor[0] *= factor_lj;
rtor[1] *= factor_lj;
rtor[2] *= factor_lj;
f[j][0] -= fforce[0];
f[j][1] -= fforce[1];
f[j][2] -= fforce[2];
tor[j][0] += rtor[0];
tor[j][1] += rtor[1];
tor[j][2] += rtor[2];
}
if (evflag) ev_tally_xyz(i,j,nlocal,0,
evdwl,0.0,fforce[0],fforce[1],fforce[2],
-r12[0],-r12[1],-r12[2]);
}
}
}
}
}

View File

@ -29,21 +29,19 @@ class PairGayBerneGPU : public PairGayBerne {
public:
PairGayBerneGPU(LAMMPS *lmp);
~PairGayBerneGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void settings(int, char **);
void init_style();
double memory_usage();
enum { ONE_NODE, ONE_GPU, MULTI_GPU };
private:
int ij_size;
int *ij_new[MAX_GPU_THREADS], *olist[MAX_GPU_THREADS];
int my_thread, nthreads, thread_inum[MAX_GPU_THREADS], omp_chunk;
int last_neighbor, multi_gpu_mode, multi_gpu_param;
bool output_time;
enum { GPU_PAIR, GPU_NEIGH };
private:
int *olist;
int gpu_mode;
double cpu_time;
int *gpulist;
};
}

View File

@ -0,0 +1,318 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj96_cut_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen);
void lj96_gpu_clear();
int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
void lj96_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success);
double lj96_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJ96CutGPU::PairLJ96CutGPU(LAMMPS *lmp) : PairLJ96Cut(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJ96CutGPU::~PairLJ96CutGPU()
{
lj96_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJ96CutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = lj96_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
} else {
inum = list->inum;
lj96_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJ96CutGPU::init_style()
{
cut_respa = NULL;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ96 pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJ96CutGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + lj96_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) {
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
r3inv = sqrt(r6inv);
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJ96CutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
int i,j,itype,jtype;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
double *special_lj = force->special_lj;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
r3inv = sqrt(r6inv);
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
}

View File

@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj96/cut/gpu,PairLJ96CutGPU)
#else
#ifndef LMP_PAIR_LJ_96_GPU_H
#define LMP_PAIR_LJ_96_GPU_H
#include "pair_lj96_cut.h"
namespace LAMMPS_NS {
class PairLJ96CutGPU : public PairLJ96Cut {
public:
PairLJ96CutGPU(LAMMPS *lmp);
~PairLJ96CutGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

View File

@ -0,0 +1,364 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_cut_coul_cut_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e);
void ljc_gpu_clear();
int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q);
void ljc_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q);
double ljc_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutCoulCutGPU::PairLJCutCoulCutGPU(LAMMPS *lmp) : PairLJCutCoulCut(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJCutCoulCutGPU::~PairLJCutCoulCutGPU()
{
ljc_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = ljc_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->q);
} else {
inum = list->inum;
ljc_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success, atom->q);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::init_style()
{
if (!atom->q_flag)
error->all("Pair style lj/cut/coul/cut requires atom attribute q");
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
force->special_coul, force->qqrd2e);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJCutCoulCutGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + ljc_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
int newton_pair = force->newton_pair;
double qqrd2e = force->qqrd2e;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq[itype][jtype])
forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq[itype][jtype])
ecoul = factor_coul * qqrd2e * qtmp*q[j]*sqrt(r2inv);
else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulCutGPU::cpu_compute(int *nbors, int start, int eflag,
int vflag)
{
int i,j,jnum,itype,jtype;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq[itype][jtype])
forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq[itype][jtype])
ecoul = factor_coul * qqrd2e * qtmp*q[j]*sqrt(r2inv);
else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
}

View File

@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj/cut/coul/cut/gpu,PairLJCutCoulCutGPU)
#else
#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_GPU_H
#define LMP_PAIR_LJ_CUT_COUL_CUT_GPU_H
#include "pair_lj_cut_coul_cut.h"
namespace LAMMPS_NS {
class PairLJCutCoulCutGPU : public PairLJCutCoulCut {
public:
PairLJCutCoulCutGPU(LAMMPS *lmp);
~PairLJCutCoulCutGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

View File

@ -0,0 +1,452 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_cut_coul_long_gpu.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include "update.h"
#include "domain.h"
#include "string.h"
#include "kspace.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429
// External functions from cuda library for atom decomposition
bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald);
void ljcl_gpu_clear();
int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q);
void ljcl_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q);
double ljcl_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutCoulLongGPU::PairLJCutCoulLongGPU(LAMMPS *lmp) : PairLJCutCoulLong(lmp), gpu_mode(GPU_PAIR)
{
respa_enable = 0;
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJCutCoulLongGPU::~PairLJCutCoulLongGPU()
{
ljcl_gpu_clear();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = ljcl_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success,
atom->q);
} else {
inum = list->inum;
ljcl_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success, atom->q);
}
if (!success)
error->one("Out of memory on GPGPU");
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::init_style()
{
cut_respa = NULL;
if (!atom->q_flag)
error->all("Pair style lj/cut/coul/cut requires atom attribute q");
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// Repeat cutsq calculation because done after call to init_style
double maxcut = -1.0;
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
}
double cell_size = sqrt(maxcut) + neighbor->skin;
cut_coulsq = cut_coul * cut_coul;
// insure use of KSpace long-range solver, set g_ewald
if (force->kspace == NULL)
error->all("Pair style is incompatible with KSpace style");
g_ewald = force->kspace->g_ewald;
// setup force tables
if (ncoultablebits) init_tables();
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
force->special_coul, force->qqrd2e, g_ewald);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
/* ---------------------------------------------------------------------- */
double PairLJCutCoulLongGPU::memory_usage()
{
double bytes = Pair::memory_usage();
return bytes + ljcl_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
int *ilist,*jlist,*numneigh,**firstneigh;
double rsq;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq) {
r = sqrt(rsq);
grij = g_ewald * r;
expm2 = exp(-grij*grij);
t = 1.0 / (1.0 + EWALD_P*grij);
erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (factor_coul < 1.0) {
table = ctable[itable] + fraction*dctable[itable];
prefactor = qtmp*q[j] * table;
forcecoul -= (1.0-factor_coul)*prefactor;
}
}
} else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq)
ecoul = prefactor*erfc;
else {
table = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table;
}
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
} else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutCoulLongGPU::cpu_compute(int *nbors, int start, int eflag,
int vflag)
{
int i,j,jnum,itype,jtype,itable;
double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
double fraction,table;
double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
double grij,expm2,prefactor,t,erfc;
double rsq;
evdwl = ecoul = 0.0;
double **x = atom->x;
double **f = atom->f;
double *q = atom->q;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double *special_coul = force->special_coul;
double *special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
qtmp = q[i];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_coul = factor_lj = 1.0;
else {
factor_coul = special_coul[j/nall];
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq) {
r = sqrt(rsq);
grij = g_ewald * r;
expm2 = exp(-grij*grij);
t = 1.0 / (1.0 + EWALD_P*grij);
erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*q[j]/r;
forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
} else {
union_int_float_t rsq_lookup;
rsq_lookup.f = rsq;
itable = rsq_lookup.i & ncoulmask;
itable >>= ncoulshiftbits;
fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
table = ftable[itable] + fraction*dftable[itable];
forcecoul = qtmp*q[j] * table;
if (factor_coul < 1.0) {
table = ctable[itable] + fraction*dctable[itable];
prefactor = qtmp*q[j] * table;
forcecoul -= (1.0-factor_coul)*prefactor;
}
}
} else forcecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
} else forcelj = 0.0;
fpair = (forcecoul + factor_lj*forcelj) * r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
if (rsq < cut_coulsq) {
if (!ncoultablebits || rsq <= tabinnersq)
ecoul = prefactor*erfc;
else {
table = etable[itable] + fraction*detable[itable];
ecoul = qtmp*q[j] * table;
}
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
} else ecoul = 0.0;
if (rsq < cut_ljsq[itype][jtype]) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
} else evdwl = 0.0;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,ecoul,fpair,delx,dely,delz);
}
}
}
}
}

View File

@ -0,0 +1,48 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(lj/cut/coul/long/gpu,PairLJCutCoulLongGPU)
#else
#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_GPU_H
#define LMP_PAIR_LJ_CUT_COUL_LONG_GPU_H
#include "pair_lj_cut_coul_long.h"
namespace LAMMPS_NS {
class PairLJCutCoulLongGPU : public PairLJCutCoulLong {
public:
PairLJCutCoulLongGPU(LAMMPS *lmp);
~PairLJCutCoulLongGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void init_style();
double memory_usage();
enum { GPU_PAIR, GPU_NEIGH };
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif

View File

@ -2,28 +2,24 @@
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "pair_lj_cut_gpu.h"
#include "math_extra.h"
#include "atom.h"
#include "domain.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
@ -32,79 +28,45 @@
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#include <string>
#include "update.h"
#include "domain.h"
#include "string.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#ifndef WINDLL
// External functions from cuda library for atom decomposition
// External functions from cuda library for force decomposition
bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, double *boxlo, double *boxhi, double cell_len, double skin,
const int max_nbors, const int gpu_id);
void lj_gpu_clear();
double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi);
void lj_gpu_name(const int gpu_id, const int max_nbors, char * name);
void lj_gpu_time();
double lj_gpu_bytes();
#else
#include <windows.h>
typedef bool (*_lj_gpu_init)(int &ij_size, const int ntypes, double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, double *boxlo, double *boxhi, double cell_len, double skin,
const int max_nbors, const int gpu_id);
typedef void (*_lj_gpu_clear)();
typedef double (*_lj_gpu_cell)(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi);
typedef void (*_lj_gpu_name)(const int gpu_id, const int max_nbors, char * name);
typedef void (*_lj_gpu_time)();
typedef double (*_lj_gpu_bytes)();
_lj_gpu_init lj_gpu_init;
_lj_gpu_clear lj_gpu_clear;
_lj_gpu_cell lj_gpu_cell;
_lj_gpu_name lj_gpu_name;
_lj_gpu_time lj_gpu_time;
_lj_gpu_bytes lj_gpu_bytes;
#endif
bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen);
void ljl_gpu_clear();
int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
void ljl_gpu_compute(const int timestep, const int ago, const int inum,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success);
double ljl_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0)
PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), gpu_mode(GPU_PAIR)
{
ij_new=NULL;
respa_enable = 0;
#ifdef WINDLL
HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
if (hinstLib == NULL) {
printf("\nUnable to load gpu.dll\n");
exit(1);
}
lj_gpu_init=(_lj_gpu_init)GetProcAddress(hinstLib,"lj_gpu_init");
lj_gpu_clear=(_lj_gpu_clear)GetProcAddress(hinstLib,"lj_gpu_clear");
lj_gpu_cell=(_lj_gpu_cell)GetProcAddress(hinstLib,"lj_gpu_cell");
lj_gpu_name=(_lj_gpu_name)GetProcAddress(hinstLib,"lj_gpu_name");
lj_gpu_time=(_lj_gpu_time)GetProcAddress(hinstLib,"lj_gpu_time");
lj_gpu_bytes=(_lj_gpu_bytes)GetProcAddress(hinstLib,"lj_gpu_bytes");
#endif
cpu_time = 0.0;
}
/* ----------------------------------------------------------------------
@ -113,17 +75,7 @@ PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0)
PairLJCutGPU::~PairLJCutGPU()
{
printf("\n\n-------------------------------------");
printf("--------------------------------\n");
printf(" GPU Time Stamps: ");
printf("\n-------------------------------------");
printf("--------------------------------\n");
lj_gpu_time();
printf("-------------------------------------");
printf("--------------------------------\n\n");
lj_gpu_clear();
if (ij_new!=NULL)
delete [] ij_new;
ljl_gpu_clear();
}
/* ---------------------------------------------------------------------- */
@ -132,45 +84,37 @@ void PairLJCutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
int nall = atom->nlocal + atom->nghost;
int inum, host_start;
bool success = true;
if (gpu_mode == GPU_NEIGH) {
inum = atom->nlocal;
gpulist = ljl_gpu_compute_n(update->ntimestep, neighbor->ago, inum, nall,
atom->x, atom->type, domain->sublo,
domain->subhi, atom->tag, atom->nspecial,
atom->special, eflag, vflag, eflag_atom,
vflag_atom, host_start, cpu_time, success);
} else {
inum = list->inum;
ljl_gpu_compute(update->ntimestep, neighbor->ago, inum, nall, atom->x,
atom->type, list->ilist, list->numneigh, list->firstneigh,
eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
success);
}
if (!success)
error->one("Out of memory on GPGPU");
// compute forces on GPU
eng_vdwl = lj_gpu_cell(atom->f, virial, atom->x, atom->type, atom->nlocal, atom->nlocal + atom->nghost,
neighbor->ago, eflag, vflag, domain->boxlo, domain->boxhi);
if (vflag_fdotr) virial_compute();
}
/* ----------------------------------------------------------------------
global settings
------------------------------------------------------------------------- */
void PairLJCutGPU::settings(int narg, char **arg)
{
// strip off GPU keyword/value and send remaining args to parent
if (narg < 2) error->all("Illegal pair_style command");
// set multi_gpu_mode to one/node for multiple gpus on 1 node
// -- param is starting gpu id
// set multi_gpu_mode to one/gpu to select the same gpu id on every node
// -- param is id of gpu
// set multi_gpu_mode to multi/gpu to get ma
// -- param is number of gpus per node
if (strcmp(arg[0],"one/node") == 0)
multi_gpu_mode = ONE_NODE;
else if (strcmp(arg[0],"one/gpu") == 0)
multi_gpu_mode = ONE_GPU;
else if (strcmp(arg[0],"multi/gpu") == 0)
multi_gpu_mode = MULTI_GPU;
else error->all("Illegal pair_style command");
multi_gpu_param = atoi(arg[1]);
if (multi_gpu_mode == MULTI_GPU && multi_gpu_param < 1)
error->all("Illegal pair_style command");
PairLJCut::settings(narg-2,&arg[2]);
if (host_start<inum) {
cpu_time = MPI_Wtime();
if (gpu_mode == GPU_NEIGH)
cpu_compute(gpulist, host_start, eflag, vflag);
else
cpu_compute(host_start, eflag, vflag);
cpu_time = MPI_Wtime() - cpu_time;
}
}
/* ----------------------------------------------------------------------
@ -179,74 +123,45 @@ void PairLJCutGPU::settings(int narg, char **arg)
void PairLJCutGPU::init_style()
{
cut_respa = NULL;
if (force->pair_match("gpu",0) == NULL)
error->all("Cannot use pair hybrid with multiple GPU pair styles");
// set the GPU ID
int my_gpu=comm->me+multi_gpu_param;
int ngpus=universe->nprocs;
if (multi_gpu_mode==ONE_GPU) {
my_gpu=multi_gpu_param;
ngpus=1;
} else if (multi_gpu_mode==MULTI_GPU) {
ngpus=multi_gpu_param;
my_gpu=comm->me%ngpus;
if (ngpus>universe->nprocs)
ngpus=universe->nprocs;
}
cut_respa=NULL;
// Repeat cutsq calculation because done after call to init_style
double cut;
for (int i = 1; i <= atom->ntypes; i++)
for (int j = i; j <= atom->ntypes; j++) {
cut = init_one(i,j);
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
// use the max cutoff length as the cell length
double maxcut = -1.0;
for (int i = 1; i <= atom->ntypes; i++)
for (int j = i; j <= atom->ntypes; j++)
if (cutsq[i][j] > maxcut) maxcut = cutsq[i][j];
// for this problem, adding skin results in better perf
// this may be a parameter in the future
double cell_len = sqrt(maxcut) + neighbor->skin;
if (!lj_gpu_init(ij_size, atom->ntypes+1, cutsq, sigma, epsilon, lj1, lj2, lj3,
lj4, offset, force->special_lj, domain->boxlo, domain->boxhi,
cell_len, neighbor->skin, neighbor->oneatom, my_gpu))
error->one("At least one process could not allocate a CUDA-enabled gpu");
if (ij_new!=NULL)
delete [] ij_new;
ij_new=new int[ij_size];
if (force->newton_pair)
error->all("Cannot use newton pair with GPU lj/cut pair style");
if (comm->me == 0 && screen) {
printf("\n-------------------------------------");
printf("-------------------------------------\n");
printf("- Using GPGPU acceleration for LJ-Cut:\n");
printf("-------------------------------------");
printf("-------------------------------------\n");
for (int i=0; i<ngpus; i++) {
int gpui=my_gpu;
if (multi_gpu_mode==ONE_NODE)
gpui=i+multi_gpu_param;
else if (multi_gpu_mode==MULTI_GPU)
gpui=i;
char gpu_string[500];
lj_gpu_name(gpui,neighbor->oneatom,gpu_string);
printf("GPU %d: %s\n",gpui,gpu_string);
double cut;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cut *= cut;
if (cut > maxcut)
maxcut = cut;
cutsq[i][j] = cutsq[j][i] = cut;
} else
cutsq[i][j] = cutsq[j][i] = 0.0;
}
printf("-------------------------------------");
printf("-------------------------------------\n\n");
}
double cell_size = sqrt(maxcut) + neighbor->skin;
int maxspecial=0;
if (atom->molecular)
maxspecial=atom->maxspecial;
bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
offset, force->special_lj, atom->nlocal,
atom->nlocal+atom->nghost, 300, maxspecial,
cell_size, gpu_mode, screen);
if (!init_ok)
error->one("Insufficient memory on accelerator (or no fix gpu).\n");
if (force->newton_pair)
error->all("Cannot use newton pair with GPU LJ pair style");
if (gpu_mode != GPU_NEIGH) {
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
}
}
@ -254,6 +169,149 @@ void PairLJCutGPU::init_style()
double PairLJCutGPU::memory_usage()
{
double bytes=Pair::memory_usage()+ij_size*sizeof(int);
return bytes+lj_gpu_bytes();
double bytes = Pair::memory_usage();
return bytes + ljl_gpu_bytes();
}
/* ---------------------------------------------------------------------- */
void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) {
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
double *special_lj = force->special_lj;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
// loop over neighbors of my atoms
for (ii = start; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) {
int i,j,itype,jtype;
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int stride = nlocal-start;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,r2inv,r6inv,forcelj,factor_lj;
double *special_lj = force->special_lj;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
// loop over neighbors of my atoms
for (i = start; i < nlocal; i++) {
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
int *nbor = nbors + i - start;
int jnum = *nbor;
nbor += stride;
int *nbor_end = nbor + stride * jnum;
for (; nbor<nbor_end; nbor+=stride) {
j = *nbor;
if (j < nall) factor_lj = 1.0;
else {
factor_lj = special_lj[j/nall];
j %= nall;
}
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
r2inv = 1.0/rsq;
r6inv = r2inv*r2inv*r2inv;
forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (eflag) {
evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (j<start) {
if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
} else {
if (j<nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
if (evflag) ev_tally(i,j,nlocal,0,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
}

View File

@ -17,8 +17,8 @@ PairStyle(lj/cut/gpu,PairLJCutGPU)
#else
#ifndef LMP_PAIR_LJ_CUT_GPU_H
#define LMP_PAIR_LJ_CUT_GPU_H
#ifndef LMP_PAIR_LJ_LIGHT_GPU_H
#define LMP_PAIR_LJ_LIGHT_GPU_H
#include "pair_lj_cut.h"
@ -28,20 +28,21 @@ class PairLJCutGPU : public PairLJCut {
public:
PairLJCutGPU(LAMMPS *lmp);
~PairLJCutGPU();
void cpu_compute(int, int, int);
void cpu_compute(int *, int, int, int);
void compute(int, int);
void settings(int, char **);
void init_style();
double memory_usage();
enum { ONE_NODE, ONE_GPU, MULTI_GPU };
enum { GPU_PAIR, GPU_NEIGH };
private:
int ij_size;
int *ij_new;
int last_neighbor, multi_gpu_mode, multi_gpu_param;
private:
int gpu_mode;
double cpu_time;
int *gpulist;
};
}
#endif
#endif