git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10668 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2013-08-23 14:46:18 +00:00
parent 402d1a8605
commit 22751a2aae
19 changed files with 2426 additions and 0 deletions

@@ -0,0 +1,5 @@
# Settings that the LAMMPS build will import when this package library is used
gpu_SYSINC =
gpu_SYSLIB = -framework OpenCL
gpu_SYSPATH =

@@ -0,0 +1,5 @@
# Settings that the LAMMPS build will import when this package library is used
gpu_SYSINC =
gpu_SYSLIB = -lOpenCL
gpu_SYSPATH =

lib/gpu/lal_base_three.cpp Normal file
@@ -0,0 +1,358 @@
/***************************************************************************
base_three.cpp
-------------------
W. Michael Brown (ORNL)
Base class for 3-body potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Tue April 2, 2013
email : brownw@ornl.gov
***************************************************************************/
#include "lal_base_three.h"
using namespace LAMMPS_AL;
#define BaseThreeT BaseThree<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
#ifdef THREE_CONCURRENT
ans2=new Answer<numtyp,acctyp>();
#endif
}
template <class numtyp, class acctyp>
BaseThreeT::~BaseThree() {
delete ans;
delete nbor;
#ifdef THREE_CONCURRENT
delete ans2;
#endif
}
template <class numtyp, class acctyp>
int BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const {
int b=device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
#ifdef THREE_CONCURRENT
b+=ans2->bytes_per_atom();
#endif
return b;
}
template <class numtyp, class acctyp>
int BaseThreeT::init_three(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_two, const char *k_three_center,
const char *k_three_end) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
#ifdef THREE_CONCURRENT
_end_command_queue=ucl_device->num_queues();
ucl_device->push_command_queue();
if (!ans2->init(ans->max_inum(),false,false,*(device->gpu)))
return -3;
ans2->cq(_end_command_queue);
#endif
_block_pair=device->pair_block_size();
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->x,4);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
_max_an_bytes+=ans2->gpu_bytes();
#endif
return 0;
}
template <class numtyp, class acctyp>
void BaseThreeT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseThreeT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_three_center.clear();
k_three_end.clear();
k_three_end_vatom.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
#ifdef THREE_CONCURRENT
ans2->clear();
assert(ucl_device->num_queues()==_end_command_queue+1);
ucl_device->pop_command_queue();
#endif
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
int *ilist, int *numj, int **firstneigh,
bool &success) {
success=true;
int mn=nbor->max_nbor_loop(nlist,numj,ilist);
resize_atom(inum,nall,success);
resize_local(nall,mn,success);
if (!success)
return NULL;
nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
bytes+=ans2->gpu_bytes();
#endif
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseThreeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(nall,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
bytes+=ans2->gpu_bytes();
#endif
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
acc_timers();
if (nlist==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,nlocal,cpu_time);
ans->inum(inum);
#ifdef THREE_CONCURRENT
ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
#ifdef THREE_CONCURRENT
ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans2);
#endif
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** BaseThreeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
#ifdef THREE_CONCURRENT
ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
#ifdef THREE_CONCURRENT
ans2->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans2);
#endif
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseThreeT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseThree<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *ktwo, const char *kthree_center,
const char *kthree_end) {
if (_compiled)
return;
std::string vatom_name=std::string(kthree_end)+"_vatom";
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
k_three_center.set_function(*pair_program,kthree_center);
k_three_end.set_function(*pair_program,kthree_end);
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
k_pair.set_function(*pair_program,ktwo);
pos_tex.get_texture(*pair_program,"pos_tex");
#ifdef THREE_CONCURRENT
k_three_end.cq(ucl_device->cq(_end_command_queue));
k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
#endif
_compiled=true;
}
template class BaseThree<PRECISION,ACC_PRECISION>;
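For orientation, the two compute() entry points above correspond to host-side and device-side neighboring. The following is a minimal sketch (not part of this commit) of how a caller would select between them; the helper name drive_step and all of its arguments are placeholders, and the signatures mirror the declarations in lal_base_three.h:

#include "lal_base_three.h"

// Hypothetical dispatch helper: forwards to the overload that matches the
// active neighboring mode. Nothing is allocated or checked here; the
// function only illustrates the calling convention.
template <class numtyp, class acctyp>
void drive_step(LAMMPS_AL::BaseThree<numtyp,acctyp> &style,
                const bool device_neigh, const int ago, const int nlocal,
                const int nall, const int nlist, double **x, int *type,
                int *ilist, int *numj, int **firstneigh, double *sublo,
                double *subhi, int *tag, int **nspecial, int **special,
                const bool eflag, const bool vflag, const bool eatom,
                const bool vatom, int &host_start, int **ilist_out,
                int **jnum_out, const double cpu_time, bool &success) {
  if (device_neigh)
    // Neighbor list is (re)built on the GPU when ago==0.
    style.compute(ago,nlocal,nall,x,type,sublo,subhi,tag,nspecial,special,
                  eflag,vflag,eatom,vatom,host_start,ilist_out,jnum_out,
                  cpu_time,success);
  else
    // Host-built list is copied to the device via reset_nbors() when ago==0.
    style.compute(ago,nlocal,nall,nlist,x,type,ilist,numj,firstneigh,
                  eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}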

lib/gpu/lal_base_three.h Normal file
@@ -0,0 +1,221 @@
/***************************************************************************
base_three.h
-------------------
W. Michael Brown (ORNL)
Base class for 3-body potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Tue April 2, 2013
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_BASE_THREE_H
#define LAL_BASE_THREE_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseThree {
public:
BaseThree();
virtual ~BaseThree();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_two name for the kernel for 2-body force calculation
* \param k_three name for the kernel for 3-body force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card
* - -10 if invalid thread_per_atom setting **/
int init_three(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success))
pos_tex.bind_float(atom->x,4);
ans->resize(inum,success);
#ifdef THREE_CONCURRENT
ans2->resize(inum,success);
#endif
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
#ifdef THREE_CONCURRENT
ans2->acc_timers();
#endif
}
}
/// Zero timers
inline void zero_timers() {
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
#ifdef THREE_CONCURRENT
ans2->zero_timers();
#endif
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, const int nlist, int *ilist,
int *numj, int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
Balance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
#ifdef THREE_CONCURRENT
Answer<numtyp,acctyp> *ans2;
#endif
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
Neighbor *nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
inline int block_pair() { return _block_pair; }
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
protected:
bool _compiled;
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k_two, const char *k_three_center,
const char *k_three_end);
virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0;
};
}
#endif
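To make the contract concrete, here is a minimal sketch (not part of this commit) of what a concrete subclass must supply; the class name ExampleThree and the k_example* kernel names are invented for illustration:

#include "lal_base_three.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class ExampleThree : public BaseThree<numtyp,acctyp> {
 public:
  // Typical setup: forward to init_three() with this style's kernel source
  // and the names of its 2-body and 3-body kernels.
  int init(const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, const void *program) {
    return this->init_three(nlocal,nall,max_nbors,maxspecial,cell_size,
                            gpu_split,screen,program,"k_example",
                            "k_example_three_center","k_example_three_end");
  }

 protected:
  // The only pure virtual: enqueue k_pair, k_three_center, and k_three_end
  // for one timestep (body elided in this sketch).
  void loop(const bool eflag, const bool vflag, const int evatom) {}
};

}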

lib/gpu/lal_beck.cpp Normal file
@@ -0,0 +1,152 @@
/***************************************************************************
beck.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the beck pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "beck_cl.h"
#elif defined(USE_CUDART)
const char *beck=0;
#else
#include "beck_cubin.h"
#endif
#include "lal_beck.h"
#include <cassert>
using namespace LAMMPS_AL;
#define BeckT Beck<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
BeckT::Beck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
BeckT::~Beck() {
clear();
}
template <class numtyp, class acctyp>
int BeckT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BeckT::init(const int ntypes,
double **host_cutsq, double **host_aa,
double **host_alpha, double **host_beta,
double **host_AA, double **host_BB,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,beck,"k_beck");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
beck1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,beck1,host_write,host_aa,host_alpha,
host_beta);
beck2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,beck2,host_write,host_AA,host_BB,
host_cutsq);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=beck1.row_bytes()+beck2.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BeckT::clear() {
if (!_allocated)
return;
_allocated=false;
beck1.clear();
beck2.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double BeckT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Beck<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BeckT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &beck1, &beck2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &beck1, &beck2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Beck<PRECISION,ACC_PRECISION>;
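As a concrete illustration of the grid sizing in loop(): with hypothetical values inum=10000, block size BX=128, and _threads_per_atom=4, each block covers 128/4=32 atoms, so GX=ceil(10000/32)=313 blocks are launched.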

lib/gpu/lal_beck.h Normal file
@@ -0,0 +1,80 @@
/***************************************************************************
beck.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the beck pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BECK_H
#define LAL_BECK_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Beck : public BaseAtomic<numtyp, acctyp> {
public:
Beck();
~Beck();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_aa, double **host_alpha,
double **host_beta, double **host_AA,
double **host_BB, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// beck1.x = aa, beck1.y = alpha, beck1.z = beta
UCL_D_Vec<numtyp4> beck1;
/// beck2.x = AA, beck2.y = BB, beck2.z = cutsq
UCL_D_Vec<numtyp4> beck2;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

lib/gpu/lal_beck_ext.cpp Normal file
@@ -0,0 +1,120 @@
/***************************************************************************
beck_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to beck acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_beck.h"
using namespace std;
using namespace LAMMPS_AL;
static Beck<PRECISION,ACC_PRECISION> BLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
double **alpha, double **beta, double **AA, double **BB,
double *special_lj, const int inum, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
BLMF.clear();
gpu_mode=BLMF.device->gpu_mode();
double gpu_split=BLMF.device->particle_split();
int first_gpu=BLMF.device->first_device();
int last_gpu=BLMF.device->last_device();
int world_me=BLMF.device->world_me();
int gpu_rank=BLMF.device->gpu_rank();
int procs_per_gpu=BLMF.device->procs_per_gpu();
BLMF.device->init_message(screen,"beck",first_gpu,last_gpu);
bool message=false;
if (BLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta,
AA, BB, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
BLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB,
special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
BLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
BLMF.estimate_gpu_overhead();
return init_ok;
}
void beck_gpu_clear() {
BLMF.clear();
}
int ** beck_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void beck_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
BLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double beck_gpu_bytes() {
return BLMF.host_memory_usage();
}
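A minimal sketch (not part of this commit) of the host-side call sequence against this interface; the wrapper function and its arguments are placeholders, and it assumes the GPU device was already set up by the GPU package:

#include <cstdio>

// Prototypes repeated from above so the sketch is self-contained.
int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
                  double **alpha, double **beta, double **AA, double **BB,
                  double *special_lj, const int inum, const int nall,
                  const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen);
void beck_gpu_clear();
double beck_gpu_bytes();

// Hypothetical usage, normally embedded in a pair style's init_style().
void example_beck_setup(int ntypes, double **cutsq, double **aa,
                        double **alpha, double **beta, double **AA,
                        double **BB, double *special_lj, int inum, int nall,
                        double cell_size, FILE *screen) {
  int gpu_mode;
  int ok=beck_gpu_init(ntypes,cutsq,aa,alpha,beta,AA,BB,special_lj,
                       inum,nall,300,0,cell_size,gpu_mode,screen);
  if (ok!=0) return;   // negative return codes are documented in lal_beck.h
  // ... per-timestep beck_gpu_compute()/beck_gpu_compute_n() calls ...
  fprintf(screen,"GPU library host memory: %g bytes\n",beck_gpu_bytes());
  beck_gpu_clear();
}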

lib/gpu/lal_lj_coul_msm.cpp Normal file
@@ -0,0 +1,200 @@
/***************************************************************************
lj_coul_msm.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the lj/cut/coul/msm pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#if defined(USE_OPENCL)
#include "lj_coul_msm_cl.h"
#elif defined(USE_CUDART)
const char *lj_coul_msm=0;
#else
#include "lj_coul_msm_cubin.h"
#endif
#include "lal_lj_coul_msm.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJCoulMSMT LJCoulMSM<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJCoulMSMT::LJCoulMSM() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJCoulMSMT::~LJCoulMSM() {
clear();
}
template <class numtyp, class acctyp>
int LJCoulMSMT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJCoulMSMT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_gcons,
double **host_dgcons, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const int order,
const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_coul_msm,"k_lj_coul_msm");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
// pack gcons and dgcons
int nrows, ncols;
nrows = 7;
ncols = 7;
UCL_H_Vec<numtyp> dview_gcons(nrows*ncols,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int ix=0; ix<nrows; ix++)
for (int iy=0; iy<ncols; iy++)
dview_gcons[ix*ncols+iy]=host_gcons[ix][iy];
gcons.alloc(nrows*ncols,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(gcons,dview_gcons,false);
gcons_tex.get_texture(*(this->pair_program),"gcons_tex");
gcons_tex.bind_float(gcons,1);
nrows = 7;
ncols = 6;
UCL_H_Vec<numtyp> dview_dgcons(nrows*ncols,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int ix=0; ix<nrows; ix++)
for (int iy=0; iy<ncols; iy++)
dview_dgcons[ix*ncols+iy]=host_dgcons[ix][iy];
dgcons.alloc(nrows*ncols,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(dgcons,dview_dgcons,false);
dgcons_tex.get_texture(*(this->pair_program),"dgcons_tex");
dgcons_tex.bind_float(dgcons,1);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_order=order;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+
gcons.row_bytes()+dgcons.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJCoulMSMT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
gcons.clear();
dgcons.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJCoulMSMT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJCoulMSM<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_order,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_order, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJCoulMSM<PRECISION,ACC_PRECISION>;
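Note the packing convention used for the MSM coefficient tables above: the 7x7 gcons and 7x6 dgcons matrices are copied row-major into flat device vectors, so element (ix,iy) lands at index ix*ncols+iy; in gcons, for instance, entry (2,3) sits at 2*7+3=17.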

lib/gpu/lal_lj_coul_msm.h Normal file
@@ -0,0 +1,88 @@
/***************************************************************************
lj_coul_msm.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the lj/cut/coul/msm pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_LJ_COUL_MSM_H
#define LAL_LJ_COUL_MSM_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJCoulMSM : public BaseCharge<numtyp, acctyp> {
public:
LJCoulMSM();
~LJCoulMSM();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_gcons, double **host_dgcons,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const int order, const double qqrd2e);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
UCL_D_Vec<numtyp> gcons, dgcons;
UCL_Texture gcons_tex, dgcons_tex;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e;
int _order;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

lib/gpu/lal_lj_coul_msm_ext.cpp Normal file
@@ -0,0 +1,131 @@
/***************************************************************************
lj_coul_msm_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to lj/cut/coul/msm acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj_coul_msm.h"
using namespace std;
using namespace LAMMPS_AL;
static LJCoulMSM<PRECISION,ACC_PRECISION> LJCMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_gcons, double **host_dgcons,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const int order, const double qqrd2e) {
LJCMLMF.clear();
gpu_mode=LJCMLMF.device->gpu_mode();
double gpu_split=LJCMLMF.device->particle_split();
int first_gpu=LJCMLMF.device->first_device();
int last_gpu=LJCMLMF.device->last_device();
int world_me=LJCMLMF.device->world_me();
int gpu_rank=LJCMLMF.device->gpu_rank();
int procs_per_gpu=LJCMLMF.device->procs_per_gpu();
LJCMLMF.device->init_message(screen,"lj/cut/coul/msm",first_gpu,last_gpu);
bool message=false;
if (LJCMLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
host_gcons, host_dgcons, offset,
special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, order, qqrd2e);
LJCMLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
host_gcons, host_dgcons, offset,
special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, order, qqrd2e);
LJCMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJCMLMF.estimate_gpu_overhead();
return init_ok;
}
void ljcm_gpu_clear() {
LJCMLMF.clear();
}
int** ljcm_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljcm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double ljcm_gpu_bytes() {
return LJCMLMF.host_memory_usage();
}
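A minimal sketch (not part of this commit) of the device-neighboring path, highlighting the charge-style extras (host_q, boxlo, prd) relative to the atomic styles; the wrapper and its arguments are placeholders:

// Prototype repeated from above so the sketch is self-contained.
int** ljcm_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *sublo, double *subhi, int *tag,
                         int **nspecial, int **special, const bool eflag,
                         const bool vflag, const bool eatom, const bool vatom,
                         int &host_start, int **ilist, int **jnum,
                         const double cpu_time, bool &success, double *host_q,
                         double *boxlo, double *prd);

// Hypothetical reneighbor-step call (ago==0 forces a device list build).
int **example_ljcm_step(int inum_full, int nall, double **x, int *type,
                        double *sublo, double *subhi, int *tag,
                        int **nspecial, int **special, int &host_start,
                        int **ilist, int **jnum, double cpu_time,
                        bool &success, double *q, double *boxlo,
                        double *prd) {
  // Returns per-atom neighbor pointers; atoms past host_start stay on the CPU.
  return ljcm_gpu_compute_n(0,inum_full,nall,x,type,sublo,subhi,tag,
                            nspecial,special,true,true,false,false,
                            host_start,ilist,jnum,cpu_time,success,
                            q,boxlo,prd);
}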

lib/gpu/lal_mie.cpp Normal file
@@ -0,0 +1,152 @@
/***************************************************************************
mie.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the mie pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "mie_cl.h"
#elif defined(USE_CUDART)
const char *mie=0;
#else
#include "mie_cubin.h"
#endif
#include "lal_mie.h"
#include <cassert>
using namespace LAMMPS_AL;
#define MieT Mie<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
MieT::Mie() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
MieT::~Mie() {
clear();
}
template <class numtyp, class acctyp>
int MieT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int MieT::init(const int ntypes, double **host_cutsq,
double **host_mie1, double **host_mie2,
double **host_mie3, double **host_mie4,
double **host_gamA, double **host_gamR,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,mie,"k_mie");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
mie1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,mie1,host_write,host_mie1,host_mie2,
host_gamA,host_gamR);
mie3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,mie3,host_write,host_mie3,host_mie4,
host_offset,host_cutsq);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=mie1.row_bytes()+mie3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void MieT::clear() {
if (!_allocated)
return;
_allocated=false;
mie1.clear();
mie3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double MieT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Mie<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void MieT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &mie1, &mie3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &mie1, &mie3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Mie<PRECISION,ACC_PRECISION>;

lib/gpu/lal_mie.h Normal file
@@ -0,0 +1,80 @@
/***************************************************************************
mie.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the mie/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_MIE_H
#define LAL_MIE_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Mie : public BaseAtomic<numtyp, acctyp> {
public:
Mie();
~Mie();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_mie1, double **host_mie2, double **host_mie3,
double **host_mie4, double **host_gamA, double **host_gamR,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// mie1.x = mie1, mie1.y = mie2, mie1.z = gamA, mie1.w = gamR
UCL_D_Vec<numtyp4> mie1;
/// mie3.x = mie3, mie3.y = mie4, mie3.z = offset, mie3.w = cutsq
UCL_D_Vec<numtyp4> mie3;
/// Special Mie values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
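The packed tables above hold one numtyp4 entry per (i,j) type pair in a flat array of length lj_types*lj_types (see the alloc calls in lal_mie.cpp); assuming the usual row-major convention of type_pack4, the coefficients for pair (itype,jtype) sit at index itype*lj_types+jtype, with mie1 supplying {mie1,mie2,gamA,gamR} and mie3 supplying {mie3,mie4,offset,cutsq} in .x/.y/.z/.w.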

lib/gpu/lal_mie_ext.cpp Normal file
@@ -0,0 +1,124 @@
/***************************************************************************
mie_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to mie acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_mie.h"
using namespace std;
using namespace LAMMPS_AL;
static Mie<PRECISION,ACC_PRECISION> MLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
double **host_mie2, double **host_mie3, double **host_mie4,
double **host_gamA, double **host_gamR,
double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
MLMF.clear();
gpu_mode=MLMF.device->gpu_mode();
double gpu_split=MLMF.device->particle_split();
int first_gpu=MLMF.device->first_device();
int last_gpu=MLMF.device->last_device();
int world_me=MLMF.device->world_me();
int gpu_rank=MLMF.device->gpu_rank();
int procs_per_gpu=MLMF.device->procs_per_gpu();
MLMF.device->init_message(screen,"mie",first_gpu,last_gpu);
bool message=false;
if (MLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2,
host_mie3, host_mie4, host_gamA, host_gamR,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
MLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2,
host_mie3, host_mie4, host_gamA, host_gamR,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
MLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
MLMF.estimate_gpu_overhead();
return init_ok;
}
void mie_gpu_clear() {
MLMF.clear();
}
int ** mie_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return MLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void mie_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
MLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double mie_gpu_bytes() {
return MLMF.host_memory_usage();
}

lib/gpu/lal_soft.cpp Normal file
@@ -0,0 +1,145 @@
/***************************************************************************
soft.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the soft pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "soft_cl.h"
#elif defined(USE_CUDART)
const char *soft=0;
#else
#include "soft_cubin.h"
#endif
#include "lal_soft.h"
#include <cassert>
using namespace LAMMPS_AL;
#define SoftT Soft<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
SoftT::Soft() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
SoftT::~Soft() {
clear();
}
template <class numtyp, class acctyp>
int SoftT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int SoftT::init(const int ntypes, double **host_cutsq,
double **host_prefactor, double **host_cut,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,soft,"k_soft");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_prefactor,
host_cut,host_cutsq);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void SoftT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double SoftT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Soft<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void SoftT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Soft<PRECISION,ACC_PRECISION>;

lib/gpu/lal_soft.h Normal file
@@ -0,0 +1,77 @@
/***************************************************************************
soft.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the soft pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_SOFT_H
#define LAL_SOFT_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Soft : public BaseAtomic<numtyp, acctyp> {
public:
Soft();
~Soft();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_prefactor, double **host_cut,
double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// coeff.x = prefactor, coeff.y = cut, coeff.z = cutsq
UCL_D_Vec<numtyp4> coeff;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
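The coeff layout documented above (x = prefactor, y = cut, z = cutsq) implies
a per-type-pair packing step during init(). A minimal sketch of that packing,
assuming 1-based LAMMPS types, a hypothetical pack_soft_coeff helper, and a
padded per-side type count ltypes used for row indexing:

#include <vector>

// Flatten host per-type-pair constants into 4 components per (i,j) pair,
// matching the numtyp4 layout the kernels read (the w component is unused).
static std::vector<double> pack_soft_coeff(int ntypes, int ltypes,
                                           double **prefactor, double **cut,
                                           double **cutsq) {
  std::vector<double> flat(4 * ltypes * ltypes, 0.0);
  for (int i = 1; i < ntypes; i++)
    for (int j = 1; j < ntypes; j++) {
      const int k = 4 * (i * ltypes + j);
      flat[k + 0] = prefactor[i][j];  // coeff.x
      flat[k + 1] = cut[i][j];        // coeff.y
      flat[k + 2] = cutsq[i][j];      // coeff.z
    }
  return flat;
}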

120
lib/gpu/lal_soft_ext.cpp Normal file
View File

@ -0,0 +1,120 @@
/***************************************************************************
soft_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to soft acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_soft.h"
using namespace std;
using namespace LAMMPS_AL;
static Soft<PRECISION,ACC_PRECISION> SLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
double **host_cut, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
SLMF.clear();
gpu_mode=SLMF.device->gpu_mode();
double gpu_split=SLMF.device->particle_split();
int first_gpu=SLMF.device->first_device();
int last_gpu=SLMF.device->last_device();
int world_me=SLMF.device->world_me();
int gpu_rank=SLMF.device->gpu_rank();
int procs_per_gpu=SLMF.device->procs_per_gpu();
SLMF.device->init_message(screen,"soft",first_gpu,last_gpu);
bool message=false;
if (SLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
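  // Process 0 initializes (and compiles the kernels) first; the processes
  // sharing each GPU then take turns in the loop below.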
if (world_me==0)
init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut,
special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
SLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut,
special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
SLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
SLMF.estimate_gpu_overhead();
return init_ok;
}
void soft_gpu_clear() {
SLMF.clear();
}
int ** soft_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return SLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void soft_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
SLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double soft_gpu_bytes() {
return SLMF.host_memory_usage();
}
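Taken together, the wrappers above define the host-side calling sequence:
init once, compute every timestep, clear at teardown. A condensed sketch,
assuming the prototypes below and a hypothetical soft_gpu_example() call site
with most per-step arguments elided:

#include <cstdio>

// Prototypes as defined in lal_soft_ext.cpp above.
int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
                  double **host_cut, double *special_lj, const int inum,
                  const int nall, const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen);
void soft_gpu_clear();

void soft_gpu_example(int ntypes, double **cutsq, double **prefactor,
                      double **cut, double *special_lj, int inum, int nall,
                      int maxspecial, double cell_size) {
  int gpu_mode;
  int err = soft_gpu_init(ntypes, cutsq, prefactor, cut, special_lj, inum,
                          nall, 300, maxspecial, cell_size, gpu_mode, stdout);
  if (err != 0) return;  // see the Returns list in lal_soft.h
  // ... per timestep: soft_gpu_compute_n() when the device builds neighbor
  //     lists, soft_gpu_compute() when host-built lists are passed in ...
  soft_gpu_clear();
}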

167
lib/gpu/lal_sw.cpp Normal file
View File

@ -0,0 +1,167 @@
/***************************************************************************
sw.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the sw pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Tue March 26, 2013
email : brownw@ornl.gov
***************************************************************************/
#if defined(USE_OPENCL)
#include "sw_cl.h"
#elif defined(USE_CUDART)
const char *sw=0;
#else
#include "sw_cubin.h"
#endif
#include "lal_sw.h"
#include <cassert>
using namespace LAMMPS_AL;
#define SWT SW<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
SWT::SW() : BaseThree<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
SWT::~SW() {
clear();
}
template <class numtyp, class acctyp>
int SWT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int SWT::init(const int nlocal, const int nall, const int max_nbors,
const double cell_size, const double gpu_split, FILE *_screen,
const double epsilon, const double sigma,
const double lambda, const double gamma,
const double costheta, const double biga,
const double bigb, const double powerp,
const double powerq, const double cut, const double cutsq) {
sw_epsilon=static_cast<numtyp>(epsilon);
sw_sigma=static_cast<numtyp>(sigma);
sw_lambda=static_cast<numtyp>(lambda);
sw_gamma=static_cast<numtyp>(gamma);
sw_costheta=static_cast<numtyp>(costheta);
sw_biga=static_cast<numtyp>(biga);
sw_bigb=static_cast<numtyp>(bigb);
sw_powerp=static_cast<numtyp>(powerp);
sw_powerq=static_cast<numtyp>(powerq);
sw_cut=static_cast<numtyp>(cut);
sw_cutsq=static_cast<numtyp>(cutsq);
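  // Clamp the squared cutoff to lie strictly inside cut*cut so a distance
  // exactly at the cutoff fails the r2 < cutsq test in the kernels.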
if (sw_cutsq>=sw_cut*sw_cut)
sw_cutsq=sw_cut*sw_cut-1e-4;
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,sw,"k_sw","k_sw_three_center",
"k_sw_three_end");
if (success!=0)
return success;
  // If atom type constants fit in shared memory, use fast kernels
shared_types=true;
_allocated=true;
this->_max_bytes=0;
return 0;
}
template <class numtyp, class acctyp>
void SWT::clear() {
if (!_allocated)
return;
_allocated=false;
this->clear_atomic();
}
template <class numtyp, class acctyp>
double SWT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(SW<numtyp,acctyp>);
}
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virial contributions
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
// Compute the block size and grid size to keep all cores busy
int BX=this->block_pair();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &sw_cut, &sw_epsilon, &sw_sigma,
&sw_biga, &sw_bigb, &sw_powerp, &sw_powerq, &sw_cutsq);
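  // The three-body kernels assign KTHREADS*JTHREADS threads per atom, so
  // the block and grid sizes are recomputed before the center/end launches.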
BX=this->block_size();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/(KTHREADS*JTHREADS))));
this->k_three_center.set_size(GX,BX);
this->k_three_center.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom,
&sw_cut, &sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
&sw_costheta, &sw_cutsq);
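  // With THREE_CONCURRENT the end kernel accumulates into a second answer
  // object on its own command queue, letting it overlap the center kernel.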
Answer<numtyp,acctyp> *end_ans;
#ifdef THREE_CONCURRENT
end_ans=this->ans2;
#else
end_ans=this->ans;
#endif
if (evatom!=0) {
this->k_three_end_vatom.set_size(GX,BX);
this->k_three_end_vatom.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &end_ans->force,
&end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &sw_cut,
&sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
&sw_costheta, &sw_cutsq);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &end_ans->force,
&end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &sw_cut,
&sw_epsilon, &sw_sigma, &sw_lambda, &sw_gamma,
&sw_costheta, &sw_cutsq);
}
this->time_pair.stop();
}
template class SW<PRECISION,ACC_PRECISION>;
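For reference, the constants copied in SWT::init() parameterize the standard
Stillinger-Weber form. A minimal host-side sketch of the two- and three-body
terms for spot-checking kernel output; sw_phi2 and sw_phi3 are hypothetical
helpers, and cut corresponds to a*sigma:

#include <cmath>

// Two-body term: A*eps*(B*(sigma/r)^p - (sigma/r)^q)*exp(sigma/(r - cut)).
static double sw_phi2(double r, double epsilon, double sigma, double biga,
                      double bigb, double powerp, double powerq, double cut) {
  if (r >= cut) return 0.0;
  const double sr = sigma / r;
  return biga * epsilon *
         (bigb * std::pow(sr, powerp) - std::pow(sr, powerq)) *
         std::exp(sigma / (r - cut)); // decays smoothly to zero at the cutoff
}

// Three-body term: lambda*eps*(cos(theta_jik) - costheta0)^2 with an
// exponential screening factor for each of the two legs.
static double sw_phi3(double rij, double rik, double costheta_jik,
                      double epsilon, double sigma, double lambda,
                      double gamma, double costheta0, double cut) {
  if (rij >= cut || rik >= cut) return 0.0;
  const double d = costheta_jik - costheta0;
  return lambda * epsilon * d * d *
         std::exp(gamma * sigma / (rij - cut)) *
         std::exp(gamma * sigma / (rik - cut));
}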

73
lib/gpu/lal_sw.h Normal file
View File

@ -0,0 +1,73 @@
/***************************************************************************
sw.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the sw pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Tue March 26, 2013
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_SW_H
#define LAL_SW_H
#include "lal_base_three.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class SW : public BaseThree<numtyp, acctyp> {
public:
SW();
~SW();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
   * - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
   * - -5 if double precision is not supported on the card **/
int init(const int nlocal, const int nall, const int max_nbors,
const double cell_size, const double gpu_split, FILE *screen,
const double epsilon, const double sigma,
const double lambda, const double gamma,
const double costheta, const double biga,
const double bigb, const double powerp,
const double powerq, const double cut, const double cutsq);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag, const int evatom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta;
numtyp sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq;
};
}
#endif

128
lib/gpu/lal_sw_ext.cpp Normal file
View File

@ -0,0 +1,128 @@
/***************************************************************************
sw_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to sw acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Tue March 26, 2013
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_sw.h"
using namespace std;
using namespace LAMMPS_AL;
static SW<PRECISION,ACC_PRECISION> SWMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int sw_gpu_init(const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen,
const double sw_epsilon, const double sw_sigma,
const double sw_lambda, const double sw_gamma,
const double sw_costheta, const double sw_biga,
const double sw_bigb, const double sw_powerp,
const double sw_powerq, const double sw_cut,
const double sw_cutsq) {
SWMF.clear();
gpu_mode=SWMF.device->gpu_mode();
double gpu_split=SWMF.device->particle_split();
int first_gpu=SWMF.device->first_device();
int last_gpu=SWMF.device->last_device();
int world_me=SWMF.device->world_me();
int gpu_rank=SWMF.device->gpu_rank();
int procs_per_gpu=SWMF.device->procs_per_gpu();
  // host/device split is not yet supported for three-body styles, so
  // require all particles on the device for now
if (gpu_split != 1.0)
return -8;
SWMF.device->init_message(screen,"sw/gpu",first_gpu,last_gpu);
bool message=false;
if (SWMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq);
SWMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen,
sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta,
sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut,
sw_cutsq);
SWMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
SWMF.estimate_gpu_overhead();
return init_ok;
}
void sw_gpu_clear() {
SWMF.clear();
}
int ** sw_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return SWMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void sw_gpu_compute(const int ago, const int nlocal, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success) {
SWMF.compute(ago,nlocal,nall,nlist,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double sw_gpu_bytes() {
return SWMF.host_memory_usage();
}