forked from lijiext/lammps
596 lines
22 KiB
Plaintext
596 lines
22 KiB
Plaintext
/* ----------------------------------------------------------------------
|
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
|
http://lammps.sandia.gov, Sandia National Laboratories
|
|
Steve Plimpton, sjplimp@sandia.gov
|
|
|
|
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the GNU General Public License.
|
|
|
|
See the README file in the top-level LAMMPS directory.
|
|
------------------------------------------------------------------------- */
|
|
|
|
/* ----------------------------------------------------------------------
|
|
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
|
|
Peng Wang (Nvidia), penwang@nvidia.com
|
|
Paul Crozier (SNL), pscrozi@sandia.gov
|
|
------------------------------------------------------------------------- */
|
|
|
|
#include <iostream>
|
|
#include <cassert>
|
|
#include "nvc_macros.h"
|
|
#include "nvc_timer.h"
|
|
#include "nvc_device.h"
|
|
#include "gb_gpu_memory.cu"
|
|
#include "gb_gpu_kernel.h"
|
|
|
|
using namespace std;
|
|
|
|
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
|
|
#define GBMT GB_GPU_Memory<numtyp,acctyp>
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
|
|
// -- Only pack neighbors matching the specified inclusive range of forms
|
|
// -- Only pack neighbors within cutoff
|
|
// ---------------------------------------------------------------------------
|
|
template<class numtyp>
|
|
__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
|
|
const int start, const int inum,
|
|
const int *dev_ij, const int form_low,
|
|
const int form_high, const int nall) {
|
|
|
|
// ii indexes the two interacting particles in gi
|
|
int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;
|
|
|
|
if (ii<inum) {
|
|
int *nbor=dev_nbor+ii;
|
|
int i=*nbor;
|
|
nbor+=nbor_pitch;
|
|
int numj=*nbor;
|
|
nbor+=nbor_pitch;
|
|
const int *list=dev_ij+*nbor;
|
|
const int *list_end=list+numj;
|
|
nbor+=nbor_pitch;
|
|
int *nbor_newj=nbor;
|
|
nbor+=nbor_pitch;
|
|
|
|
vec4 ix=x_[i];
|
|
int itype=ix.w;
|
|
|
|
int newj=0;
|
|
for ( ; list<list_end; list++) {
|
|
int j=*list;
|
|
if (j>=nall)
|
|
j%=nall;
|
|
vec4 jx=x_[j];
|
|
int jtype=jx.w;
|
|
|
|
if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
|
|
// Compute r12;
|
|
numtyp rsq=jx.x-ix.x;
|
|
rsq*=rsq;
|
|
numtyp t=jx.y-ix.y;
|
|
rsq+=t*t;
|
|
t=jx.z-ix.z;
|
|
rsq+=t*t;
|
|
|
|
if (rsq< _cutsq_<numtyp>(itype,jtype)) {
|
|
*nbor=j;
|
|
nbor+=nbor_pitch;
|
|
newj++;
|
|
}
|
|
}
|
|
}
|
|
*nbor_newj=newj;
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
|
|
// -- Only pack neighbors matching the specified inclusive range of forms
|
|
// -- Only pack neighbors within cutoff
|
|
// -- Fast version of routine that uses shared memory for LJ constants
|
|
// ---------------------------------------------------------------------------
|
|
template<class numtyp>
|
|
__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
|
|
const int start, const int inum,
|
|
const int *dev_ij, const int form_low,
|
|
const int form_high, const int nall) {
|
|
|
|
int ii=threadIdx.x;
|
|
__shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
|
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
|
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
|
int itype=ii/MAX_SHARED_TYPES;
|
|
int jtype=ii%MAX_SHARED_TYPES;
|
|
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
|
|
form[ii]=_form_(itype,jtype);
|
|
}
|
|
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
|
|
__syncthreads();
|
|
|
|
if (ii<inum) {
|
|
int *nbor=dev_nbor+ii;
|
|
int i=*nbor;
|
|
nbor+=nbor_pitch;
|
|
int numj=*nbor;
|
|
nbor+=nbor_pitch;
|
|
const int *list=dev_ij+*nbor;
|
|
const int *list_end=list+numj;
|
|
nbor+=nbor_pitch;
|
|
int *nbor_newj=nbor;
|
|
nbor+=nbor_pitch;
|
|
|
|
vec4 ix=x_[i];
|
|
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
|
|
|
|
int newj=0;
|
|
for ( ; list<list_end; list++) {
|
|
int j=*list;
|
|
if (j>=nall)
|
|
j%=nall;
|
|
vec4 jx=x_[j];
|
|
int jtype=jx.w;
|
|
int mtype=itype+jtype;
|
|
|
|
if (form[mtype]>=form_low && form[mtype]<=form_high) {
|
|
// Compute r12;
|
|
numtyp rsq=jx.x-ix.x;
|
|
rsq*=rsq;
|
|
numtyp t=jx.y-ix.y;
|
|
rsq+=t*t;
|
|
t=jx.z-ix.z;
|
|
rsq+=t*t;
|
|
|
|
if (rsq<cutsq[mtype]) {
|
|
*nbor=j;
|
|
nbor+=nbor_pitch;
|
|
newj++;
|
|
}
|
|
}
|
|
}
|
|
*nbor_newj=newj;
|
|
}
|
|
}
|
|
|
|
template<class numtyp, class acctyp>
|
|
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
|
|
const int inum, const int form_low, const int form_high) {
|
|
if (gbm.shared_types) {
|
|
kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
|
|
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
|
|
gbm.atom.inum(), start, inum,
|
|
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
|
|
} else
|
|
kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
|
|
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
|
|
gbm.atom.inum(), start, inum,
|
|
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Convert something to a string
|
|
// ---------------------------------------------------------------------------
|
|
#include <sstream>
|
|
template <class t>
|
|
inline string gb_gpu_toa(const t& in) {
|
|
ostringstream o;
|
|
o.precision(2);
|
|
o << in;
|
|
return o.str();
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Return string with GPU info
|
|
// ---------------------------------------------------------------------------
|
|
EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
|
|
string sname=GBMF[0].gpu.name(id)+", "+
|
|
gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
|
|
gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
|
|
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
|
|
strcpy(name,sname.c_str());
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Allocate memory on host and device and copy constants to device
|
|
// ---------------------------------------------------------------------------
|
|
EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
|
|
const double upsilon, const double mu, double **shape,
|
|
double **well, double **cutsq, double **sigma,
|
|
double **epsilon, double *host_lshape, int **form,
|
|
double **host_lj1, double **host_lj2, double **host_lj3,
|
|
double **host_lj4, double **offset, double *special_lj,
|
|
const int nlocal, const int nall,
|
|
const int max_nbors, const int thread, const int gpu_id) {
|
|
assert(thread<MAX_GPU_THREADS);
|
|
|
|
GBMF[thread].gpu.init();
|
|
|
|
if (GBMF[thread].gpu.num_devices()==0)
|
|
return false;
|
|
|
|
ij_size=IJ_SIZE;
|
|
return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
|
|
well, cutsq, sigma, epsilon, host_lshape, form,
|
|
host_lj1, host_lj2, host_lj3, host_lj4, offset,
|
|
special_lj, nlocal, nall, max_nbors, false,
|
|
gpu_id);
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Clear memory on host and device
|
|
// ---------------------------------------------------------------------------
|
|
EXTERN void gb_gpu_clear(const int thread) {
|
|
GBMF[thread].clear();
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// copy atom positions, quaternions, and optionally types to device
|
|
// ---------------------------------------------------------------------------
|
|
template <class numtyp, class acctyp>
|
|
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
|
|
double **host_quat, const int *host_type,
|
|
const bool rebuild, cudaStream_t &stream) {
|
|
atom.time_atom.start();
|
|
atom.reset_write_buffer();
|
|
|
|
// Rows 1-3 of dev_x are position; rows 4-7 are quaternion
|
|
atom.add_x_data(host_x,host_type);
|
|
atom.add_q_data(host_quat[0]);
|
|
|
|
atom.copy_x_data(stream);
|
|
atom.copy_q_data(stream);
|
|
atom.time_atom.stop();
|
|
}
|
|
|
|
EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
|
|
const int *host_type, const bool rebuild,
|
|
const int thread) {
|
|
_gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
|
|
GBMF[thread].pair_stream);
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Signal that we need to transfer a new neighbor list
|
|
// ---------------------------------------------------------------------------
|
|
template <class gbmtyp>
|
|
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
|
|
const int inum, int *ilist, const int *numj,
|
|
const int *type, bool &success) {
|
|
success=true;
|
|
|
|
gbm.nbor.time_nbor.start();
|
|
|
|
int mn=0;
|
|
for (int i=0; i<inum; i++)
|
|
mn=std::max(mn,numj[i]);
|
|
|
|
if (nall>gbm.max_atoms)
|
|
gbm.resize_atom(nall,success);
|
|
if (nlocal>gbm.max_local || mn>gbm._max_nbors)
|
|
gbm.resize_local(nlocal,mn,success);
|
|
if (!success)
|
|
return false;
|
|
|
|
gbm.atom.nall(nall);
|
|
gbm.atom.inum(inum);
|
|
|
|
if (gbm.multiple_forms) {
|
|
int ij_size=gbm.nbor.host_ij.numel();
|
|
if (inum*2<ij_size) {
|
|
int p=0, acc=0;
|
|
for (int i=0; i<inum; i++) {
|
|
int itype=type[ilist[i]];
|
|
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
|
|
gbm.host_olist[p]=ilist[i];
|
|
gbm.nbor.host_ij[p]=numj[ilist[i]];
|
|
gbm.nbor.host_ij[p+inum]=acc;
|
|
acc+=numj[ilist[i]];
|
|
p++;
|
|
}
|
|
}
|
|
gbm.last_ellipse=p;
|
|
for (int i=0; i<inum; i++) {
|
|
int itype=type[ilist[i]];
|
|
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
|
|
gbm.host_olist[p]=ilist[i];
|
|
gbm.nbor.host_ij[p]=numj[ilist[i]];
|
|
gbm.nbor.host_ij[p+inum]=acc;
|
|
acc+=numj[ilist[i]];
|
|
p++;
|
|
}
|
|
}
|
|
gbm.nbor.ij_total=0;
|
|
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
|
|
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum,
|
|
2*inum,gbm.pair_stream);
|
|
} else {
|
|
int p=0, acc=0;
|
|
int offset=0;
|
|
int half=ij_size/2;
|
|
int hi=0;
|
|
for (int i=0; i<inum; i++) {
|
|
int itype=type[ilist[i]];
|
|
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
|
|
gbm.host_olist[p]=ilist[i];
|
|
gbm.nbor.host_ij[hi]=numj[ilist[i]];
|
|
gbm.nbor.host_ij[hi+half]=acc;
|
|
acc+=numj[ilist[i]];
|
|
p++;
|
|
hi++;
|
|
if (hi==half) {
|
|
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
|
|
half,gbm.pair_stream);
|
|
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
|
|
inum*2+offset,
|
|
half,gbm.pair_stream);
|
|
hi=0;
|
|
offset+=half;
|
|
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
|
|
}
|
|
}
|
|
}
|
|
gbm.last_ellipse=p;
|
|
for (int i=0; i<inum; i++) {
|
|
int itype=type[ilist[i]];
|
|
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
|
|
gbm.host_olist[p]=ilist[i];
|
|
gbm.nbor.host_ij[hi]=numj[ilist[i]];
|
|
gbm.nbor.host_ij[hi+half]=acc;
|
|
acc+=numj[ilist[i]];
|
|
p++;
|
|
hi++;
|
|
if (hi==half) {
|
|
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
|
|
half,gbm.pair_stream);
|
|
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
|
|
inum*2+offset,
|
|
half,gbm.pair_stream);
|
|
hi=0;
|
|
offset+=half;
|
|
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
|
|
}
|
|
}
|
|
}
|
|
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
|
|
if (hi>0) {
|
|
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
|
|
hi,gbm.pair_stream);
|
|
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
|
|
inum*2+offset,
|
|
hi,gbm.pair_stream);
|
|
}
|
|
gbm.nbor.ij_total=0;
|
|
}
|
|
} else {
|
|
gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
|
|
gbm.last_ellipse=inum;
|
|
}
|
|
|
|
gbm.nbor.time_nbor.stop();
|
|
|
|
if (gbm.multiple_forms)
|
|
return gbm.host_olist.begin();
|
|
return ilist;
|
|
}
|
|
|
|
EXTERN int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
|
|
int *ilist, const int *numj, const int *type,
|
|
const int thread, bool &success) {
|
|
return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
|
|
success);
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Copy a set of ij_size ij interactions to device and compute energies,
|
|
// forces, and torques for those interactions
|
|
// ---------------------------------------------------------------------------
|
|
template <class gbmtyp>
|
|
void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
|
|
const bool eflag) {
|
|
gbm.nbor.time_nbor.add_to_total();
|
|
// CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
|
|
|
|
memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
|
|
gbm.nbor.time_nbor.start();
|
|
gbm.nbor.add(num_ij,gbm.pair_stream);
|
|
gbm.nbor.time_nbor.stop();
|
|
}
|
|
|
|
EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
|
|
const int thread) {
|
|
_gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
|
|
}
|
|
|
|
|
|
template<class numtyp, class acctyp>
|
|
void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
|
|
gbm.atom.time_answer.start();
|
|
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
|
|
gbm.atom.time_answer.stop();
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Calculate energies, forces, and torques for all ij interactions
|
|
// ---------------------------------------------------------------------------
|
|
template <class numtyp, class acctyp>
|
|
void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
|
|
const bool rebuild) {
|
|
// Compute the block size and grid size to keep all cores busy
|
|
const int BX=BLOCK_1D;
|
|
int ans_pitch=6;
|
|
if (eflag)
|
|
ans_pitch++;
|
|
if (vflag)
|
|
ans_pitch+=6;
|
|
|
|
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
|
|
|
|
if (gbm.multiple_forms) {
|
|
gbm.time_kernel.start();
|
|
if (gbm.last_ellipse>0) {
|
|
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
|
|
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
|
|
static_cast<double>(BX)));
|
|
pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE);
|
|
gbm.time_kernel.stop();
|
|
|
|
gbm.time_gayberne.start();
|
|
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
|
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
|
|
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
|
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
|
|
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
|
|
eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
|
|
gbm.time_gayberne.stop();
|
|
|
|
if (gbm.last_ellipse==gbm.atom.inum()) {
|
|
gbm.time_kernel2.start();
|
|
gbm.time_kernel2.stop();
|
|
gbm.time_gayberne2.start();
|
|
gbm.time_gayberne2.stop();
|
|
gbm.time_pair.start();
|
|
gbm.time_pair.stop();
|
|
return;
|
|
}
|
|
|
|
// ------------ SPHERE_ELLIPSE ---------------
|
|
|
|
gbm.time_kernel2.start();
|
|
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
|
|
gbm.last_ellipse)/BX));
|
|
pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
|
|
ELLIPSE_SPHERE);
|
|
gbm.time_kernel2.stop();
|
|
|
|
gbm.time_gayberne2.start();
|
|
kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
|
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
|
|
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
|
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
|
|
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
|
|
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
|
|
gbm.time_gayberne2.stop();
|
|
} else {
|
|
gbm.atom.ans.zero();
|
|
gbm.time_kernel.stop();
|
|
gbm.time_gayberne.start();
|
|
gbm.time_gayberne.stop();
|
|
gbm.time_kernel2.start();
|
|
gbm.time_kernel2.stop();
|
|
gbm.time_gayberne2.start();
|
|
gbm.time_gayberne2.stop();
|
|
}
|
|
|
|
// ------------ LJ ---------------
|
|
gbm.time_pair.start();
|
|
if (gbm.last_ellipse<gbm.atom.inum()) {
|
|
if (gbm.shared_types)
|
|
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
|
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
|
|
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
|
|
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
|
|
vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
|
|
else
|
|
kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
|
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
|
|
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
|
|
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
|
|
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
|
|
}
|
|
gbm.time_pair.stop();
|
|
} else {
|
|
gbm.time_kernel.start();
|
|
pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
|
|
gbm.time_kernel.stop();
|
|
|
|
gbm.time_gayberne.start();
|
|
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
|
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
|
|
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
|
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
|
|
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(),
|
|
eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
|
|
gbm.time_gayberne.stop();
|
|
}
|
|
}
|
|
|
|
EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
|
|
const int thread) {
|
|
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
|
|
_gb_gpu_enqueue<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag);
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Get energies, forces, and torques to host
|
|
// ---------------------------------------------------------------------------
|
|
template<class numtyp, class acctyp>
|
|
double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
|
|
const bool eflag, const bool vflag, const bool eflag_atom,
|
|
const bool vflag_atom, double *eatom, double **vatom,
|
|
double *virial) {
|
|
double evdw;
|
|
|
|
gbm.atom.time_atom.add_to_total();
|
|
gbm.nbor.time_nbor.add_to_total();
|
|
gbm.time_kernel.add_to_total();
|
|
gbm.time_gayberne.add_to_total();
|
|
if (gbm.multiple_forms) {
|
|
gbm.time_kernel2.add_to_total();
|
|
gbm.time_gayberne2.add_to_total();
|
|
gbm.time_pair.add_to_total();
|
|
}
|
|
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
|
|
if (gbm.last_ellipse>gbm.atom.inum()) {
|
|
if (eflag || vflag)
|
|
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
|
|
f,tor,gbm.atom.inum());
|
|
else
|
|
gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
|
|
} else {
|
|
if (eflag || vflag)
|
|
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
|
|
f,tor,gbm.last_ellipse);
|
|
else
|
|
gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
|
|
}
|
|
gbm.atom.time_answer.add_to_total();
|
|
return evdw;
|
|
}
|
|
|
|
EXTERN double gb_gpu_forces(double **f, double **tor, const int *ilist,
|
|
const bool eflag, const bool vflag, const bool eflag_atom,
|
|
const bool vflag_atom, double *eatom, double **vatom,
|
|
double *virial, const int thread) {
|
|
return _gb_gpu_forces<PRECISION,ACC_PRECISION>
|
|
(GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
|
|
vflag_atom,eatom,vatom,virial);
|
|
}
|
|
|
|
EXTERN void gb_gpu_time(const int i) {
|
|
cout.precision(4);
|
|
cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds()
|
|
<< " s.\n"
|
|
<< "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds()
|
|
<< " s.\n"
|
|
<< "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
|
|
GBMF[i].time_kernel2.total_seconds() << " s.\n"
|
|
<< "Force calc: " << GBMF[i].time_gayberne.total_seconds()+
|
|
GBMF[i].time_gayberne2.total_seconds()<< " s.\n";
|
|
if (GBMF[i].multiple_forms)
|
|
cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n";
|
|
cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds()
|
|
<< " s.\n";
|
|
}
|
|
|
|
EXTERN int gb_gpu_num_devices() {
|
|
return GBMF[0].gpu.num_devices();
|
|
}
|
|
|
|
EXTERN double gb_gpu_bytes() {
|
|
return GBMF[0].host_memory_usage();
|
|
}
|
|
|