From 9418bacff7fa83c2f637d15fe3ab2bd61a66a587 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Tue, 14 Jun 2011 13:33:48 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6406 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Makefile.firefly | 41 +++ lib/gpu/Makefile.firefly_opencl | 31 +++ lib/gpu/base_ellipsoid.cpp | 470 ++++++++++++++++++++++++++++++++ lib/gpu/base_ellipsoid.h | 255 +++++++++++++++++ lib/gpu/cmmc_msm_gpu.cpp | 131 +++++++++ lib/gpu/cmmc_msm_gpu_memory.cpp | 169 ++++++++++++ lib/gpu/cmmc_msm_gpu_memory.h | 83 ++++++ lib/gpu/ellipsoid_extra.h | 424 ++++++++++++++++++++++++++++ lib/gpu/gayberne.cpp | 307 +++++++++++++++++++++ lib/gpu/gayberne.h | 95 +++++++ lib/gpu/gayberne_ext.cpp | 141 ++++++++++ lib/gpu/lj_class2_long.cpp | 168 ++++++++++++ lib/gpu/lj_class2_long.h | 84 ++++++ lib/gpu/lj_class2_long_ext.cpp | 129 +++++++++ lib/gpu/re_squared.cpp | 308 +++++++++++++++++++++ lib/gpu/re_squared.h | 91 +++++++ lib/gpu/re_squared_ext.cpp | 138 ++++++++++ 17 files changed, 3065 insertions(+) create mode 100644 lib/gpu/Makefile.firefly create mode 100644 lib/gpu/Makefile.firefly_opencl create mode 100644 lib/gpu/base_ellipsoid.cpp create mode 100644 lib/gpu/base_ellipsoid.h create mode 100644 lib/gpu/cmmc_msm_gpu.cpp create mode 100644 lib/gpu/cmmc_msm_gpu_memory.cpp create mode 100644 lib/gpu/cmmc_msm_gpu_memory.h create mode 100644 lib/gpu/ellipsoid_extra.h create mode 100644 lib/gpu/gayberne.cpp create mode 100644 lib/gpu/gayberne.h create mode 100644 lib/gpu/gayberne_ext.cpp create mode 100644 lib/gpu/lj_class2_long.cpp create mode 100644 lib/gpu/lj_class2_long.h create mode 100644 lib/gpu/lj_class2_long_ext.cpp create mode 100644 lib/gpu/re_squared.cpp create mode 100644 lib/gpu/re_squared.h create mode 100644 lib/gpu/re_squared_ext.cpp diff --git a/lib/gpu/Makefile.firefly b/lib/gpu/Makefile.firefly new file mode 100644 index 0000000000..6f481a0d98 --- /dev/null +++ b/lib/gpu/Makefile.firefly @@ -0,0 +1,41 @@ +# /* 
---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +CUDA_HOME = /usr/local/cuda +NVCC = nvcc + +CUDA_ARCH = -arch=sm_11 +CUDA_PRECISION = -D_SINGLE_SINGLE +CUDA_INCLUDE = -I/usr/local/cuda/include +CUDA_LIB = -L/usr/local/cuda/lib64 +CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math +#CUDA_OPTS = -DUNIX -g -G + +CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK -fopenmp +CUDR_OPTS = -g -Wall -O2 -DUCL_NO_EXIT # -xHost -no-prec-div -ansi-alias +#CUDR_OPTS = -g -Wall -DUCL_SYNC_DEBUG + +BIN_DIR = /home/wb8/bin +OBJ_DIR = /home/wb8/obj/lammps +LIB_DIR = /home/wb8/obj/lammps +AR = ar +BSH = /bin/sh + +include Nvidia.makefile + diff --git a/lib/gpu/Makefile.firefly_opencl b/lib/gpu/Makefile.firefly_opencl new file mode 100644 index 0000000000..1f4bf3c94c --- /dev/null +++ b/lib/gpu/Makefile.firefly_opencl @@ -0,0 +1,31 @@ +# /* ---------------------------------------------------------------------- +# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator +# http://lammps.sandia.gov, Sandia National Laboratories +# Steve Plimpton, sjplimp@sandia.gov +# +# Copyright (2003) Sandia 
Corporation. Under the terms of Contract +# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains +# certain rights in this software. This software is distributed under +# the GNU General Public License. +# +# See the README file in the top-level LAMMPS directory. +# ------------------------------------------------------------------------- */ +# +# /* ---------------------------------------------------------------------- +# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +# Peng Wang (Nvidia), penwang@nvidia.com +# Paul Crozier (SNL), pscrozi@sandia.gov +# ------------------------------------------------------------------------- */ + +OCL_CPP = mpic++ -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK -I/usr/local/cuda/include/ +OCL_LINK = -lOpenCL +OCL_PREC = -D_SINGLE_SINGLE + +BIN_DIR = /home/wb8/bin +OBJ_DIR = /home/wb8/obj/lammps +LIB_DIR = /home/wb8/obj/lammps +AR = ar +BSH = /bin/sh + +include Opencl.makefile + diff --git a/lib/gpu/base_ellipsoid.cpp b/lib/gpu/base_ellipsoid.cpp new file mode 100644 index 0000000000..b2223db59d --- /dev/null +++ b/lib/gpu/base_ellipsoid.cpp @@ -0,0 +1,470 @@ +/*************************************************************************** + base_ellipsoid.cpp + ------------------- + W. 
Michael Brown + + Base class for acceleration of ellipsoid potentials + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Thu May 5 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#include "base_ellipsoid.h" +using namespace LAMMPS_AL; + +#ifdef USE_OPENCL +#include "ellipsoid_nbor_cl.h" +#else +#include "ellipsoid_nbor_ptx.h" +#endif + +#define BaseEllipsoidT BaseEllipsoid +extern PairGPUDevice pair_gpu_device; + +template +BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { + device=&pair_gpu_device; + ans=new PairGPUAns(); + nbor=new PairGPUNbor(); +} + +template +BaseEllipsoidT::~BaseEllipsoid() { + delete ans; + delete nbor; +} + +template +int BaseEllipsoidT::bytes_per_atom(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseEllipsoidT::init_base(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, + FILE *_screen, const int ntypes, int **h_form, + const char *ellipsoid_program, + const char *lj_program, const bool ellip_sphere) { + nbor_time_avail=false; + screen=_screen; + _ellipsoid_sphere=ellip_sphere; + + bool gpu_nbor=false; + if (device->gpu_mode()==PairGPUDevice::GPU_NEIGH) + gpu_nbor=true; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_charge(); + + int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, + maxspecial,_gpu_host,max_nbors,cell_size,true); + if (success!=0) + return success; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + 
compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere); + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_lj.init(*ucl_device); + time_nbor1.init(*ucl_device); + time_ellipsoid.init(*ucl_device); + time_nbor2.init(*ucl_device); + time_ellipsoid2.init(*ucl_device); + time_nbor3.init(*ucl_device); + time_ellipsoid3.init(*ucl_device); + time_lj.zero(); + time_nbor1.zero(); + time_ellipsoid.zero(); + time_nbor2.zero(); + time_ellipsoid2.zero(); + time_nbor3.zero(); + time_ellipsoid3.zero(); + + // See if we want fast GB-sphere or sphere-sphere calculations + _host_form=h_form; + _multiple_forms=false; + for (int i=1; i0) { + std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n"; + exit(1); + } + + if (_multiple_forms) + ans->dev_ans.zero(); + + // Memory for ilist ordered by particle type + if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS) + return 0; + else return -3; + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + + return 0; +} + +template +void BaseEllipsoidT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead); +} + +template +void BaseEllipsoidT::clear_base() { + // Output any timing information + output_times(); + host_olist.clear(); + + if (_compiled) { + k_nbor_fast.clear(); + k_nbor.clear(); + k_ellipsoid.clear(); + k_ellipsoid_sphere.clear(); + k_sphere_ellipsoid.clear(); + k_lj_fast.clear(); + k_lj.clear(); + delete nbor_program; + delete ellipsoid_program; + delete lj_program; + _compiled=false; + } + + time_nbor1.clear(); + time_ellipsoid.clear(); + time_nbor2.clear(); + time_ellipsoid2.clear(); + time_nbor3.clear(); + time_ellipsoid3.clear(); + time_lj.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); + device->clear(); +} + +template +void BaseEllipsoidT::output_times() { + // Output any timing information + acc_timers(); + double single[9], 
times[9]; + + single[0]=atom->transfer_time()+ans->transfer_time(); + single[1]=nbor->time_nbor.total_seconds(); + single[2]=time_nbor1.total_seconds()+time_nbor2.total_seconds()+ + time_nbor3.total_seconds()+nbor->time_nbor.total_seconds(); + single[3]=time_ellipsoid.total_seconds()+time_ellipsoid2.total_seconds()+ + time_ellipsoid3.total_seconds(); + if (_multiple_forms) + single[4]=time_lj.total_seconds(); + else + single[4]=0; + single[5]=atom->cast_time()+ans->cast_time(); + single[6]=_gpu_overhead; + single[7]=_driver_overhead; + single[8]=ans->cpu_idle_time(); + + MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica()); + double avg_split=hd_balancer.all_avg_split(); + + _max_bytes+=atom->max_gpu_bytes(); + double mpi_max_bytes; + MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, + device->replica()); + double max_mb=mpi_max_bytes/(1024*1024); + + if (device->replica_me()==0) + if (screen && times[5]>0.0) { + int replica_size=device->replica_size(); + + fprintf(screen,"\n\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + fprintf(screen," GPU Time Info (average): "); + fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + + if (device->procs_per_gpu()==1) { + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); + fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); + fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); + if (nbor->gpu_nbor()) + fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); + else + fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size); + fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); + fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); + } + fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size); + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Threads / atom: 
%d.\n",_threads_per_atom); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); + fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); + fprintf(screen,"-------------------------------------"); + fprintf(screen,"--------------------------------\n\n"); + + + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + } + _max_bytes=0.0; +} + +// --------------------------------------------------------------------------- +// Pack neighbors to limit thread divergence for lj-lj and ellipse +// --------------------------------------------------------------------------- +template +void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, + const int inum, const int form_low, + const int form_high, const bool shared_types, + int ntypes) { + int stride=nbor->nbor_pitch(); + if (shared_types) { + k_nbor_fast.set_size(GX,BX); + k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(), + &nbor->dev_nbor.begin(), &stride, &start, &inum, + &nbor->dev_packed.begin(), &form_low, &form_high); + } else { + k_nbor.set_size(GX,BX); + k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes, + &nbor->dev_nbor.begin(), &stride, &start, &inum, + &nbor->dev_packed.begin(), &form_low, &form_high); + } +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +void BaseEllipsoidT::reset_nbors(const int nall, const int inum, + const int osize, int *ilist, + int *numj, int *type, int **firstneigh, + bool &success) { + success=true; + + nbor_time_avail=true; + + int mn=nbor->max_nbor_loop(osize,numj,ilist); + resize_atom(nall,success); + resize_local(inum,0,mn,osize,success); + if (!success) + return; + + if (_multiple_forms) { + int p=0; + for (int i=0; 
iget_host(inum,host_olist.begin(),numj,firstneigh,block_size()); + nbor->copy_unpacked(inum,mn); + return; + } + _last_ellipse=inum; + _max_last_ellipse=inum; + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + nbor->copy_unpacked(inum,mn); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, int *tag, + int **nspecial, int **special, + bool &success) { + nbor_time_avail=true; + + success=true; + resize_atom(nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),0,success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag, + nspecial, special, success, mn); + nbor->copy_unpacked(inum,mn); + _last_ellipse=inum; + _max_last_ellipse=inum; + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. 
+// --------------------------------------------------------------------------- +template +int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double **host_quat) { + acc_timers(); + if (inum_full==0) { + host_start=0; + zero_timers(); + return NULL; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + _last_ellipse=std::min(inum,_max_last_ellipse); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, inum_full, ilist, numj, host_type, firstneigh, + success); + if (!success) + return NULL; + } + int *list; + if (_multiple_forms) + list=host_olist.begin(); + else + list=ilist; + + atom->cast_x_data(host_x,host_type); + atom->cast_quat_data(host_quat[0]); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + atom->add_quat_data(); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom,list); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + return list; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template +int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double **host_quat) { + acc_timers(); + if (inum_full==0) { + host_start=0; + zero_timers(); + return NULL; + } + + hd_balancer.balance(cpu_time); + int 
inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + _last_ellipse=std::min(inum,_max_last_ellipse); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return NULL; + atom->cast_quat_data(host_quat[0]); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_quat_data(host_quat[0]); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + + atom->add_quat_data(); + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + return nbor->host_jlist.begin()-host_start; +} + +template +double BaseEllipsoidT::host_memory_usage_base() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseEllipsoid); +} + +template +void BaseEllipsoidT::compile_kernels(UCL_Device &dev, + const char *ellipsoid_string, + const char *lj_string, const bool e_s) { + if (_compiled) + return; + + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE); + + nbor_program=new UCL_Program(dev); + nbor_program->load_string(ellipsoid_nbor,flags.c_str()); + k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast"); + k_nbor.set_function(*nbor_program,"kernel_nbor"); + + ellipsoid_program=new UCL_Program(dev); + ellipsoid_program->load_string(ellipsoid_string,flags.c_str()); + k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid"); + + lj_program=new UCL_Program(dev); + lj_program->load_string(lj_string,flags.c_str()); + k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid"); + k_lj_fast.set_function(*lj_program,"kernel_lj_fast"); + k_lj.set_function(*lj_program,"kernel_lj"); + if (e_s) + 
k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere"); + + _compiled=true; +} + +template class BaseEllipsoid; + diff --git a/lib/gpu/base_ellipsoid.h b/lib/gpu/base_ellipsoid.h new file mode 100644 index 0000000000..ba0d5d2d1f --- /dev/null +++ b/lib/gpu/base_ellipsoid.h @@ -0,0 +1,255 @@ +/*************************************************************************** + base_ellipsoid.h + ------------------- + W. Michael Brown + + Base class for acceleration of ellipsoid potentials + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Thu May 5 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef BASE_ELLIPSOID_H +#define BASE_ELLIPSOID_H + +#include "pair_gpu_device.h" +#include "pair_gpu_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseEllipsoid { + public: + BaseEllipsoid(); + virtual ~BaseEllipsoid(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_base(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, const int ntypes, + int **h_form, const char *ellipsoid_program, + const char *lj_program, const bool 
ellipsoid_sphere=false); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int nall, bool &success) { + atom->resize(nall, success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \param olist_size size of list of particles from CPU neighboring + * \note host_inum is 0 if the host is performing neighboring + * \note if GPU is neighboring nlocal+host_inum=total number local particles + * \note if CPU is neighboring olist_size=total number of local particles + * \note if GPU is neighboring olist_size=0 **/ + inline void resize_local(const int nlocal, const int host_inum, + const int max_nbors, const int olist_size, + bool &success) { + ans->resize(nlocal, success); + if (_multiple_forms) ans->dev_ans.zero(); + + if (olist_size>static_cast(host_olist.numel())) { + host_olist.clear(); + int new_size=static_cast(static_cast(olist_size)*1.10); + success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); + } + + nbor->resize(nlocal,host_inum,max_nbors,success); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_bytes) + _max_bytes=bytes; + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_base(); + + /// Output any timing information + void output_times(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_base() const; + + /// Accumulate timers + inline void acc_timers() { + if 
(device->time_device()) { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_nbor.add_to_total(); + nbor_time_avail=false; + } + time_nbor1.add_to_total(); + time_ellipsoid.add_to_total(); + if (_multiple_forms) { + time_nbor2.add_to_total(); + time_ellipsoid2.add_to_total(); + if (_ellipsoid_sphere) { + time_nbor3.add_to_total(); + time_ellipsoid3.add_to_total(); + } + time_lj.add_to_total(); + } + atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + nbor_time_avail=false; + time_nbor1.zero(); + time_ellipsoid.zero(); + if (_multiple_forms) { + time_nbor2.zero(); + time_ellipsoid2.zero(); + if (_ellipsoid_sphere) { + time_nbor3.zero(); + time_ellipsoid3.zero(); + } + time_lj.zero(); + } + atom->zero_timers(); + ans->zero_timers(); + } + + /// Pack neighbors to limit thread divergence for lj-lj and ellipse + void pack_nbors(const int GX, const int BX, const int start, const int inum, + const int form_low, const int form_high, + const bool shared_types, int ntypes); + + /// Copy neighbor list from host + void reset_nbors(const int nall, const int inum, const int osize, int *ilist, + int *numj, int *type, int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, bool &success); + + /// Pair loop with host neighboring + int* compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **quat); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int 
**nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double **host_quat); + + /// Build neighbor list on accelerator + void build_nbor_list(const int inum, const int host_inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, bool &success); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + PairGPUDevice *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_nbor1, time_ellipsoid, time_nbor2, time_ellipsoid2, time_lj; + UCL_Timer time_nbor3, time_ellipsoid3; + + /// Host device load balancer + PairGPUBalance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + PairGPUAtom *atom; + + // --------------------------- TYPE DATA -------------------------- + + /// cut_form.x = cutsq, cut_form.y = form + UCL_D_Vec cut_form; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + PairGPUNbor *nbor; + /// ilist with particles sorted by type + UCL_H_Vec host_olist; + /// True if we need to accumulate time for neighboring + bool nbor_time_avail; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *nbor_program, *ellipsoid_program, *lj_program; + UCL_Kernel k_nbor_fast, k_nbor; + UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid; + UCL_Kernel k_lj_fast, k_lj; + inline int block_size() { return _block_size; } + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _compiled, _ellipsoid_sphere; + int _block_size, 
_threads_per_atom; + double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; + + // True if we want to use fast GB-sphere or sphere-sphere calculations + bool _multiple_forms; + int **_host_form; + int _last_ellipse, _max_last_ellipse; + + void compile_kernels(UCL_Device &dev, const char *ellipsoid_string, + const char *lj_string, const bool e_s); + + virtual void loop(const bool _eflag, const bool _vflag) = 0; +}; + +} + +#endif + diff --git a/lib/gpu/cmmc_msm_gpu.cpp b/lib/gpu/cmmc_msm_gpu.cpp new file mode 100644 index 0000000000..cfa6c50453 --- /dev/null +++ b/lib/gpu/cmmc_msm_gpu.cpp @@ -0,0 +1,131 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include +#include +#include + +#include "cmmc_msm_gpu_memory.h" + +using namespace std; + +static CMMM_GPU_Memory CMMMMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int cmmm_gpu_init(const int ntypes, double **cutsq, int **cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const int smooth) { + CMMMMF.clear(); + gpu_mode=CMMMMF.device->gpu_mode(); + double gpu_split=CMMMMF.device->particle_split(); + int first_gpu=CMMMMF.device->first_device(); + int last_gpu=CMMMMF.device->last_device(); + int world_me=CMMMMF.device->world_me(); + int gpu_rank=CMMMMF.device->gpu_rank(); + int procs_per_gpu=CMMMMF.device->procs_per_gpu(); + + CMMMMF.device->init_message(screen,"cg/cmm/coul/msm",first_gpu,last_gpu); + + bool message=false; + if (CMMMMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=CMMMMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e,smooth); + + 
CMMMMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CMMMMF.estimate_gpu_overhead(); + return init_ok; +} + +void cmmm_gpu_clear() { + CMMMMF.clear(); +} + +int** cmmm_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CMMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void cmmm_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + CMMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double cmmm_gpu_bytes() { + return CMMMMF.host_memory_usage(); +} + + diff --git a/lib/gpu/cmmc_msm_gpu_memory.cpp b/lib/gpu/cmmc_msm_gpu_memory.cpp new file mode 100644 index 0000000000..22d69a33e2 --- /dev/null +++ b/lib/gpu/cmmc_msm_gpu_memory.cpp @@ -0,0 +1,169 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifdef USE_OPENCL +#include "cmmc_msm_gpu_cl.h" +#else +#include "cmmc_msm_gpu_ptx.h" +#endif + +#include "cmmc_msm_gpu_memory.h" +#include +#define CMMM_GPU_MemoryT CMMM_GPU_Memory + +extern PairGPUDevice pair_gpu_device; + +template +CMMM_GPU_MemoryT::CMMM_GPU_Memory() : ChargeGPUMemory(), + _allocated(false) { +} + +template +CMMM_GPU_MemoryT::~CMMM_GPU_Memory() { + clear(); +} + +template +int CMMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int CMMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, + const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const int smooth) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cmmc_msm_gpu_kernel); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + 
lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq, + host_cut_ljsq,host_lj1,host_lj2); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3, + host_lj4,host_offset); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _smooth=smooth; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void CMMM_GPU_MemoryT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double CMMM_GPU_MemoryT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CMMM_GPU_Memory); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + 
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, + &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_smooth, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_smooth, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class CMMM_GPU_Memory; diff --git a/lib/gpu/cmmc_msm_gpu_memory.h b/lib/gpu/cmmc_msm_gpu_memory.h new file mode 100644 index 0000000000..9b5205f702 --- /dev/null +++ b/lib/gpu/cmmc_msm_gpu_memory.h @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef CMMM_GPU_MEMORY_H +#define CMMM_GPU_MEMORY_H + +#include "charge_gpu_memory.h" + +template +class CMMM_GPU_Memory : public ChargeGPUMemory { + public: + CMMM_GPU_Memory(); + ~CMMM_GPU_Memory(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, int ** cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const int smooth); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, + UCL_D_Vec lj1; + /// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset + UCL_D_Vec lj3; + /// Special LJ values [0-3] and 
Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e; + + private: + bool _allocated; + int _smooth; + void loop(const bool _eflag, const bool _vflag); +}; + +#endif + diff --git a/lib/gpu/ellipsoid_extra.h b/lib/gpu/ellipsoid_extra.h new file mode 100644 index 0000000000..9d54efdeb9 --- /dev/null +++ b/lib/gpu/ellipsoid_extra.h @@ -0,0 +1,424 @@ +// ************************************************************************** +// ellipsoid_extra.h +// ------------------- +// W. Michael Brown +// +// Device code for Ellipsoid math routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifndef ELLIPSOID_EXTRA_H +#define ELLIPSOID_EXTRA_H + +enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#ifdef NV_KERNEL + +#include "nv_kernel_def.h" + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline +#define BLOCK_PAIR 64 
+#define MAX_SHARED_TYPES 8 + +#endif + +/* ---------------------------------------------------------------------- + dot product of 2 vectors +------------------------------------------------------------------------- */ + +__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) +{ + return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2]; +} + +/* ---------------------------------------------------------------------- + cross product of 2 vectors +------------------------------------------------------------------------- */ + +__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans) +{ + ans[0] = v1[1]*v2[2]-v1[2]*v2[1]; + ans[1] = v1[2]*v2[0]-v1[0]*v2[2]; + ans[2] = v1[0]*v2[1]-v1[1]*v2[0]; +} + +/* ---------------------------------------------------------------------- + determinant of a matrix +------------------------------------------------------------------------- */ + +__inline numtyp gpu_det3(const numtyp m[9]) +{ + numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - + m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + + m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; + return ans; +} + +/* ---------------------------------------------------------------------- + diagonal matrix times a full matrix +------------------------------------------------------------------------- */ + +__inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], + numtyp ans[9]) +{ + ans[0] = shape.x*m[0]; + ans[1] = shape.x*m[1]; + ans[2] = shape.x*m[2]; + ans[3] = shape.y*m[3]; + ans[4] = shape.y*m[4]; + ans[5] = shape.y*m[5]; + ans[6] = shape.z*m[6]; + ans[7] = shape.z*m[7]; + ans[8] = shape.z*m[8]; +} + +/* ---------------------------------------------------------------------- + add two matrices +------------------------------------------------------------------------- */ + +__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9]) +{ + ans[0] = m[0]+m2[0]; + ans[1] = m[1]+m2[1]; + ans[2] = m[2]+m2[2]; + ans[3] = m[3]+m2[3]; + ans[4] = m[4]+m2[4]; + ans[5] = m[5]+m2[5]; + ans[6] = 
m[6]+m2[6]; + ans[7] = m[7]+m2[7]; + ans[8] = m[8]+m2[8]; +} + +/* ---------------------------------------------------------------------- + multiply the transpose of mat1 times mat2 +------------------------------------------------------------------------- */ + +__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9], + numtyp ans[9]) +{ + ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6]; + ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7]; + ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8]; + ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6]; + ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7]; + ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8]; + ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6]; + ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7]; + ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8]; +} + +/* ---------------------------------------------------------------------- + row vector times matrix +------------------------------------------------------------------------- */ + +__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans) +{ + ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6]; + ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7]; + ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2]; +} + +/* ---------------------------------------------------------------------- + solve Ax = b or M ans = v + use gaussian elimination & partial pivoting on matrix + error_flag set to 2 if bad matrix inversion attempted +------------------------------------------------------------------------- */ + +__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, + __global int *error_flag) +{ + // create augmented matrix for pivoting + + numtyp aug[12], t; + + aug[3] = v[0]; + aug[0] = m[0]; + aug[1] = m[1]; + aug[2] = m[2]; + aug[7] = v[1]; + aug[4] = m[3]; + aug[5] = m[4]; + aug[6] = m[5]; + aug[11] = v[2]; + aug[8] = m[6]; + aug[9] = m[7]; + aug[10] = m[8]; + + if (fabs(aug[4]) > fabs(aug[0])) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; + swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; + swapt=aug[2]; 
aug[2]=aug[6]; aug[6]=swapt; + swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; + } + if (fabs(aug[8]) > fabs(aug[0])) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; + swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; + swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; + swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; + } + + if (aug[0] != (numtyp)0.0) { + if (0!=0) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt; + swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt; + swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt; + swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt; + } + } else if (aug[4] != (numtyp)0.0) { + if (1!=0) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; + swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; + swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; + swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; + } + } else if (aug[8] != (numtyp)0.0) { + if (2!=0) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; + swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; + swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; + swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; + } + } else + *error_flag=2; + + t = aug[4]/aug[0]; + aug[5]-=t*aug[1]; + aug[6]-=t*aug[2]; + aug[7]-=t*aug[3]; + t = aug[8]/aug[0]; + aug[9]-=t*aug[1]; + aug[10]-=t*aug[2]; + aug[11]-=t*aug[3]; + + if (fabs(aug[9]) > fabs(aug[5])) { + numtyp swapt; + swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; + swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; + swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; + swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; + } + + if (aug[5] != (numtyp)0.0) { + if (1!=1) { + numtyp swapt; + swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt; + swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt; + swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt; + swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt; + } + } else if (aug[9] != (numtyp)0.0) { + if (2!=1) { + numtyp swapt; + swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; + swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; + swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; + swapt=aug[7]; aug[7]=aug[11]; 
aug[11]=swapt; + } + } + + t = aug[9]/aug[5]; + aug[10]-=t*aug[6]; + aug[11]-=t*aug[7]; + + if (aug[10] == (numtyp)0.0) + *error_flag=2; + + ans[2] = aug[11]/aug[10]; + t = (numtyp)0.0; + t += aug[6]*ans[2]; + ans[1] = (aug[7]-t) / aug[5]; + t = (numtyp)0.0; + t += aug[1]*ans[1]; + t += aug[2]*ans[2]; + ans[0] = (aug[3]-t) / aug[0]; +} + +/* ---------------------------------------------------------------------- + compute rotation matrix from quaternion conjugate + quat = [w i j k] +------------------------------------------------------------------------- */ + +__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, + numtyp mat[9]) +{ + numtyp4 q=qif[qi]; + + numtyp w2 = q.x*q.x; + numtyp i2 = q.y*q.y; + numtyp j2 = q.z*q.z; + numtyp k2 = q.w*q.w; + numtyp twoij = (numtyp)2.0*q.y*q.z; + numtyp twoik = (numtyp)2.0*q.y*q.w; + numtyp twojk = (numtyp)2.0*q.z*q.w; + numtyp twoiw = (numtyp)2.0*q.y*q.x; + numtyp twojw = (numtyp)2.0*q.z*q.x; + numtyp twokw = (numtyp)2.0*q.w*q.x; + + mat[0] = w2+i2-j2-k2; + mat[3] = twoij-twokw; + mat[6] = twojw+twoik; + + mat[1] = twoij+twokw; + mat[4] = w2-i2+j2-k2; + mat[7] = twojk-twoiw; + + mat[2] = twoik-twojw; + mat[5] = twojk+twoiw; + mat[8] = w2-i2-j2+k2; +} + +/* ---------------------------------------------------------------------- + transposed matrix times diagonal matrix +------------------------------------------------------------------------- */ + +__inline void gpu_transpose_times_diag3(const numtyp m[9], + const numtyp4 d, numtyp ans[9]) +{ + ans[0] = m[0]*d.x; + ans[1] = m[3]*d.y; + ans[2] = m[6]*d.z; + ans[3] = m[1]*d.x; + ans[4] = m[4]*d.y; + ans[5] = m[7]*d.z; + ans[6] = m[2]*d.x; + ans[7] = m[5]*d.y; + ans[8] = m[8]*d.z; +} + +/* ---------------------------------------------------------------------- + multiply mat1 times mat2 +------------------------------------------------------------------------- */ + +__inline void gpu_times3(const numtyp m[9], const numtyp m2[9], + numtyp ans[9]) +{ + ans[0] 
= m[0]*m2[0] + m[1]*m2[3] + m[2]*m2[6]; + ans[1] = m[0]*m2[1] + m[1]*m2[4] + m[2]*m2[7]; + ans[2] = m[0]*m2[2] + m[1]*m2[5] + m[2]*m2[8]; + ans[3] = m[3]*m2[0] + m[4]*m2[3] + m[5]*m2[6]; + ans[4] = m[3]*m2[1] + m[4]*m2[4] + m[5]*m2[7]; + ans[5] = m[3]*m2[2] + m[4]*m2[5] + m[5]*m2[8]; + ans[6] = m[6]*m2[0] + m[7]*m2[3] + m[8]*m2[6]; + ans[7] = m[6]*m2[1] + m[7]*m2[4] + m[8]*m2[7]; + ans[8] = m[6]*m2[2] + m[7]*m2[5] + m[8]*m2[8]; +} + +/* ---------------------------------------------------------------------- + Apply principal rotation generator about x to rotation matrix m +------------------------------------------------------------------------- */ + +__inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9]) +{ + ans[0] = 0; + ans[1] = -m[2]; + ans[2] = m[1]; + ans[3] = 0; + ans[4] = -m[5]; + ans[5] = m[4]; + ans[6] = 0; + ans[7] = -m[8]; + ans[8] = m[7]; +} + +/* ---------------------------------------------------------------------- + Apply principal rotation generator about y to rotation matrix m +------------------------------------------------------------------------- */ + +__inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9]) +{ + ans[0] = m[2]; + ans[1] = 0; + ans[2] = -m[0]; + ans[3] = m[5]; + ans[4] = 0; + ans[5] = -m[3]; + ans[6] = m[8]; + ans[7] = 0; + ans[8] = -m[6]; +} + +/* ---------------------------------------------------------------------- + Apply principal rotation generator about z to rotation matrix m +------------------------------------------------------------------------- */ + +__inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9]) +{ + ans[0] = -m[1]; + ans[1] = m[0]; + ans[2] = 0; + ans[3] = -m[4]; + ans[4] = m[3]; + ans[5] = 0; + ans[6] = -m[7]; + ans[7] = m[6]; + ans[8] = 0; +} + +/* ---------------------------------------------------------------------- + matrix times vector +------------------------------------------------------------------------- */ + +__inline void 
gpu_times_column3(const numtyp m[9], const numtyp v[3], + numtyp ans[3]) +{ + ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2]; + ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2]; + ans[2] = m[6]*v[0] + m[7]*v[1] + m[8]*v[2]; +} + + + + + + + + + + + + + +#endif diff --git a/lib/gpu/gayberne.cpp b/lib/gpu/gayberne.cpp new file mode 100644 index 0000000000..e26c5978a4 --- /dev/null +++ b/lib/gpu/gayberne.cpp @@ -0,0 +1,307 @@ +/*************************************************************************** + gayberne.cpp + ------------------- + W. Michael Brown + + Host code for Gay-Berne potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "gayberne_cl.h" +#else +#include "gayberne_ptx.h" +#endif + +#include "gayberne.h" +#include +using namespace LAMMPS_AL; + +#define GayBerneT GayBerne +extern PairGPUDevice pair_gpu_device; + +template +GayBerneT::GayBerne() : BaseEllipsoid(), + _allocated(false) { +} + +template +GayBerneT::~GayBerne() { + clear(); +} + +template +int GayBerneT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom(max_nbors); +} + +template +int GayBerneT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, + int **h_form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, + double **host_offset, const double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + 
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ntypes,h_form,gayberne,gayberne_lj); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + _shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) { + lj_types=max_shared_types; + _shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for copying type data + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write, + host_sigma,host_epsilon); + + this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write, + host_cutsq,h_form); + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq,h_form); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + dev_error.alloc(1,*(this->ucl_device)); + dev_error.zero(); + + // Allocate, cast and asynchronous memcpy of constant data + // Copy data for bonded interactions + gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY); + host_write[0]=static_cast(gamma); + host_write[1]=static_cast(upsilon); + host_write[2]=static_cast(mu); + host_write[3]=static_cast(host_special_lj[0]); + host_write[4]=static_cast(host_special_lj[1]); + host_write[5]=static_cast(host_special_lj[2]); + host_write[6]=static_cast(host_special_lj[3]); + ucl_copy(gamma_upsilon_mu,host_write,7,false); + + lshape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + UCL_H_Vec d_view; + d_view.view(host_lshape,lshape.numel(),*(this->ucl_device)); + 
ucl_copy(lshape,d_view,false); + + // Copy shape, well, sigma, epsilon, and cutsq onto GPU + // - cast if necessary + shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i view4; + view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + ucl_copy(shape,view4,false); + + well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; iucl_device)); + ucl_copy(well,view4,false); + + _allocated=true; + this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ + lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+ + lshape.row_bytes()+shape.row_bytes()+well.row_bytes(); + + return 0; +} + +template +void GayBerneT::clear() { + if (!_allocated) + return; + + UCL_H_Vec err_flag(1,*(this->ucl_device)); + ucl_copy(err_flag,dev_error,false); + if (err_flag[0] == 2) + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + err_flag.clear(); + + _allocated=false; + + dev_error.clear(); + lj1.clear(); + lj3.clear(); + sigma_epsilon.clear(); + this->cut_form.clear(); + + shape.clear(); + well.clear(); + lshape.clear(); + gamma_upsilon_mu.clear(); + + this->clear_base(); +} + +template +double GayBerneT::host_memory_usage() const { + return this->host_memory_usage_base()+sizeof(GayBerneT)+ + 4*sizeof(numtyp); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void GayBerneT::loop(const bool _eflag, const bool _vflag) { + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=0, NGX; + int stride=this->nbor->nbor_pitch(); + int ainum=this->ans->inum(); + + if (this->_multiple_forms) { + this->time_nbor1.start(); + if (this->_last_ellipse>0) { + // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- + 
GX=static_cast(ceil(static_cast(this->_last_ellipse)/ + (BX/this->_threads_per_atom))); + NGX=static_cast(ceil(static_cast(this->_last_ellipse)/BX)); + this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE, + ELLIPSE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor1.stop(); + + this->time_ellipsoid.start(); + this->k_ellipsoid.set_size(GX,BX); + this->k_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), + &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(), + &stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), + &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, + &this->_threads_per_atom); + this->time_ellipsoid.stop(); + + if (this->_last_ellipse==this->ans->inum()) { + this->time_nbor2.start(); + this->time_nbor2.stop(); + this->time_ellipsoid2.start(); + this->time_ellipsoid2.stop(); + this->time_lj.start(); + this->time_lj.stop(); + return; + } + + // ------------ SPHERE_ELLIPSE --------------- + + this->time_nbor2.start(); + GX=static_cast(ceil(static_cast(this->ans->inum()- + this->_last_ellipse)/ + (BX/this->_threads_per_atom))); + NGX=static_cast(ceil(static_cast(this->ans->inum()- + this->_last_ellipse)/BX)); + this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(), + SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor2.stop(); + + this->time_ellipsoid2.start(); + this->k_sphere_ellipsoid.set_size(GX,BX); + this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), + &this->well.begin(), &this->gamma_upsilon_mu.begin(), + &this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(), + &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, + &vflag, &this->_last_ellipse, &ainum, 
&this->_threads_per_atom); + this->time_ellipsoid2.stop(); + } else { + this->ans->dev_ans.zero(); + this->ans->dev_engv.zero(); + this->time_nbor1.stop(); + this->time_ellipsoid.start(); + this->time_ellipsoid.stop(); + this->time_nbor2.start(); + this->time_nbor2.stop(); + this->time_ellipsoid2.start(); + this->time_ellipsoid2.stop(); + } + + // ------------ LJ --------------- + this->time_lj.start(); + if (this->_last_ellipseans->inum()) { + if (this->_shared_types) { + this->k_lj_fast.set_size(GX,BX); + this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(), + &this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride, + &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), + &eflag, &vflag, &this->_last_ellipse, &ainum, + &this->_threads_per_atom); + } else { + this->k_lj.set_size(GX,BX); + this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(), + &this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(), + &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, + &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + } + } + this->time_lj.stop(); + } else { + GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + NGX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + this->time_nbor1.start(); + this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, + ELLIPSE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor1.stop(); + this->time_ellipsoid.start(); + this->k_ellipsoid.set_size(GX,BX); + this->k_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), + &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(), + &stride, &this->ans->dev_ans.begin(), &ainum, + &this->ans->dev_engv.begin(), 
&this->dev_error.begin(), + &eflag, &vflag, &ainum, &this->_threads_per_atom); + this->time_ellipsoid.stop(); + } +} + +template class GayBerne; + diff --git a/lib/gpu/gayberne.h b/lib/gpu/gayberne.h new file mode 100644 index 0000000000..60d9b947d5 --- /dev/null +++ b/lib/gpu/gayberne.h @@ -0,0 +1,95 @@ +/*************************************************************************** + gayberne.h + ------------------- + W. Michael Brown + + Host code for Gay-Berne potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef GAYBERNE_H +#define GAYBERNE_H + +#include "base_ellipsoid.h" +#include "mpi.h" + +namespace LAMMPS_AL { + +template +class GayBerne : public BaseEllipsoid { + public: + GayBerne(); + ~GayBerne(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param gpu_nbor true if neighboring performed on device + * \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \return false if there is not sufficient memory or device init prob + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **host_shape, + double **host_well, double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + const double *host_special_lj, const int 
nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Device Error Flag - Set if a bad matrix inversion occurs + UCL_D_Vec dev_error; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon + UCL_D_Vec sigma_epsilon; + // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ... + UCL_D_Vec gamma_upsilon_mu; + + /// If atom type constants fit in shared memory, use fast kernels + bool _shared_types; + int _lj_types; + + // --------------------------- ATOM DATA -------------------------- + + /// Aspherical Const Data for Atoms + UCL_D_Vec shape, well; + /// Aspherical Const Data for Atoms + UCL_D_Vec lshape; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/gayberne_ext.cpp b/lib/gpu/gayberne_ext.cpp new file mode 100644 index 0000000000..7eec718d40 --- /dev/null +++ b/lib/gpu/gayberne_ext.cpp @@ -0,0 +1,141 @@ +/*************************************************************************** + gayberne_ext.cpp + ------------------- + W. 
Michael Brown + + LAMMPS Wrappers for Gay-Berne Acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "gayberne.h" + +using namespace std; +using namespace LAMMPS_AL; + +static GayBerne GBMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int gb_gpu_init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **shape, + double **well, double **cutsq, double **sigma, + double **epsilon, double *host_lshape, int **form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { + GBMF.clear(); + gpu_mode=GBMF.device->gpu_mode(); + double gpu_split=GBMF.device->particle_split(); + int first_gpu=GBMF.device->first_device(); + int last_gpu=GBMF.device->last_device(); + int world_me=GBMF.device->world_me(); + int gpu_rank=GBMF.device->gpu_rank(); + int procs_per_gpu=GBMF.device->procs_per_gpu(); + + GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu); + + bool message=false; + if (GBMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, 
offset, special_lj, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, + screen); + + GBMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + GBMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Clear memory on host and device +// --------------------------------------------------------------------------- +void gb_gpu_clear() { + GBMF.clear(); +} + + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double **host_quat); + +int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + double **host_quat) { + return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, + host_start, ilist, jnum, cpu_time, success, host_quat); +} + +int * gb_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat) { + return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist, + numj, firstneigh, eflag, vflag, eatom, vatom, host_start, + cpu_time, success, host_quat); +} + +// 
--------------------------------------------------------------------------- +// Return memory usage +// --------------------------------------------------------------------------- +double gb_gpu_bytes() { + return GBMF.host_memory_usage(); +} + diff --git a/lib/gpu/lj_class2_long.cpp b/lib/gpu/lj_class2_long.cpp new file mode 100644 index 0000000000..b329e95d99 --- /dev/null +++ b/lib/gpu/lj_class2_long.cpp @@ -0,0 +1,168 @@ +/*************************************************************************** + lj_class2_long.cpp + ------------------- + W. Michael Brown + + Host code for COMPASS LJ long potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Mon May 16 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "lj_class2_long_cl.h" +#else +#include "lj_class2_long_ptx.h" +#endif + +#include "lj_class2_long.h" +#include +using namespace LAMMPS_AL; + +#define LJClass2LongT LJClass2Long + +extern PairGPUDevice pair_gpu_device; + +template +LJClass2LongT::LJClass2Long() : ChargeGPUMemory(), + _allocated(false) { +} + +template +LJClass2LongT::~LJClass2Long() { + clear(); +} + +template +int LJClass2LongT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJClass2LongT::init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + int 
success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_class2_long); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_cut_ljsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void LJClass2LongT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJClass2LongT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJClass2Long); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int 
eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJClass2Long; + diff --git a/lib/gpu/lj_class2_long.h b/lib/gpu/lj_class2_long.h new file mode 100644 index 0000000000..705219f92b --- /dev/null +++ b/lib/gpu/lj_class2_long.h @@ -0,0 +1,84 @@ +/*************************************************************************** + lj_class2_long.h + ------------------- + W. 
Michael Brown + + Host code for COMPASS LJ long potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Mon May 16 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LJ_CLASS2_LONG_H +#define LJ_CLASS2_LONG_H + +#include "charge_gpu_memory.h" + +namespace LAMMPS_AL { + +template +class LJClass2Long : public ChargeGPUMemory { + public: + LJClass2Long(); + ~LJClass2Long(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = 
cutsq_vdw + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif + diff --git a/lib/gpu/lj_class2_long_ext.cpp b/lib/gpu/lj_class2_long_ext.cpp new file mode 100644 index 0000000000..8a90e47530 --- /dev/null +++ b/lib/gpu/lj_class2_long_ext.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** + lj_class2_long_ext.cpp + ------------------- + W. Michael Brown + + LAMMPS Wrappers for COMPASS LJ long Acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Mon May 16 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lj_class2_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJClass2Long C2CLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double 
g_ewald) { + C2CLMF.clear(); + gpu_mode=C2CLMF.device->gpu_mode(); + double gpu_split=C2CLMF.device->particle_split(); + int first_gpu=C2CLMF.device->first_device(); + int last_gpu=C2CLMF.device->last_device(); + int world_me=C2CLMF.device->world_me(); + int gpu_rank=C2CLMF.device->gpu_rank(); + int procs_per_gpu=C2CLMF.device->procs_per_gpu(); + + C2CLMF.device->init_message(screen,"lj/class2/coul/long",first_gpu,last_gpu); + + bool message=false; + if (C2CLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + + C2CLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + C2CLMF.estimate_gpu_overhead(); + return init_ok; +} + +void c2cl_gpu_clear() { + C2CLMF.clear(); +} + +int** c2cl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return C2CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void c2cl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const 
bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + C2CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double c2cl_gpu_bytes() { + return C2CLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/re_squared.cpp b/lib/gpu/re_squared.cpp new file mode 100644 index 0000000000..b27b6944ec --- /dev/null +++ b/lib/gpu/re_squared.cpp @@ -0,0 +1,308 @@ +/*************************************************************************** + re_squared.cpp + ------------------- + W. Michael Brown + + Host code for RE-Squared potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Fri May 06 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "re_squared_cl.h" +#else +#include "re_squared_ptx.h" +#endif + +#include "re_squared.h" +#include +using namespace LAMMPS_AL; + +#define RESquaredT RESquared +extern PairGPUDevice pair_gpu_device; + +template +RESquaredT::RESquared() : BaseEllipsoid(), + _allocated(false) { +} + +template +RESquaredT::~RESquared() { + clear(); +} + +template +int RESquaredT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom(max_nbors); +} + +template +int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, int **h_form, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, const double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, 
const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ntypes,h_form,re_squared,re_squared_lj,true); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + _shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) { + lj_types=max_shared_types; + _shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for copying type data + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write, + host_sigma,host_epsilon); + + this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write, + host_cutsq,h_form); + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq,h_form); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + dev_error.alloc(1,*(this->ucl_device)); + dev_error.zero(); + + // Allocate, cast and asynchronous memcpy of constant data + // Copy data for bonded interactions + special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + host_write[0]=static_cast(host_special_lj[0]); + host_write[1]=static_cast(host_special_lj[1]); + host_write[2]=static_cast(host_special_lj[2]); + host_write[3]=static_cast(host_special_lj[3]); + ucl_copy(special_lj,host_write,4,false); + + // Copy shape, well, sigma, epsilon, and cutsq onto GPU + // - cast if necessary + shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i view4; + 
view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + ucl_copy(shape,view4,false); + + well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; iucl_device)); + ucl_copy(well,view4,false); + + _allocated=true; + this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ + lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+ + shape.row_bytes()+well.row_bytes(); + + return 0; +} + +template +void RESquaredT::clear() { + if (!_allocated) + return; + + UCL_H_Vec err_flag(1,*(this->ucl_device)); + ucl_copy(err_flag,dev_error,false); + if (err_flag[0] == 2) + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + err_flag.clear(); + + _allocated=false; + + dev_error.clear(); + lj1.clear(); + lj3.clear(); + sigma_epsilon.clear(); + this->cut_form.clear(); + + shape.clear(); + well.clear(); + special_lj.clear(); + + this->clear_base(); +} + +template +double RESquaredT::host_memory_usage() const { + return this->host_memory_usage_base()+sizeof(RESquaredT)+ + 4*sizeof(numtyp); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void RESquaredT::loop(const bool _eflag, const bool _vflag) { + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX, NGX; + int stride=this->nbor->nbor_pitch(); + int ainum=this->ans->inum(); + + if (this->_multiple_forms) { + if (this->_last_ellipse>0) { + // ------------ ELLIPSE_ELLIPSE --------------- + this->time_nbor1.start(); + GX=static_cast(ceil(static_cast(this->_last_ellipse)/ + (BX/this->_threads_per_atom))); + NGX=static_cast(ceil(static_cast(this->_last_ellipse)/BX)); + this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE, + ELLIPSE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor1.stop(); + + 
this->time_ellipsoid.start(); + this->k_ellipsoid.set_size(GX,BX); + this->k_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), + &this->special_lj.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride, + &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), + &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, + &this->_threads_per_atom); + this->time_ellipsoid.stop(); + + // ------------ ELLIPSE_SPHERE --------------- + this->time_nbor2.start(); + this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE, + ELLIPSE_SPHERE,_shared_types,_lj_types); + this->time_nbor2.stop(); + + this->time_ellipsoid2.start(); + this->k_ellipsoid_sphere.set_size(GX,BX); + this->k_ellipsoid_sphere.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), + &this->special_lj.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride, + &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), + &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, + &this->_threads_per_atom); + this->time_ellipsoid2.stop(); + + if (this->_last_ellipse==this->ans->inum()) { + this->time_nbor3.zero(); + this->time_ellipsoid3.zero(); + this->time_lj.zero(); + return; + } + + // ------------ SPHERE_ELLIPSE --------------- + + this->time_nbor3.start(); + GX=static_cast(ceil(static_cast(this->ans->inum()- + this->_last_ellipse)/ + (BX/this->_threads_per_atom))); + NGX=static_cast(ceil(static_cast(this->ans->inum()- + this->_last_ellipse)/BX)); + this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(), + SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor3.stop(); + + this->time_ellipsoid3.start(); + this->k_sphere_ellipsoid.set_size(GX,BX); + this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), 
&this->shape.begin(), + &this->well.begin(), &this->special_lj.begin(), + &this->sigma_epsilon.begin(), &this->_lj_types, + &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, + &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + this->time_ellipsoid3.stop(); + } else { + this->ans->dev_ans.zero(); + this->ans->dev_engv.zero(); + this->time_nbor1.zero(); + this->time_ellipsoid.zero(); + this->time_nbor2.zero(); + this->time_ellipsoid2.zero(); + this->time_nbor3.zero(); + this->time_ellipsoid3.zero(); + } + + // ------------ LJ --------------- + this->time_lj.start(); + if (this->_last_ellipseans->inum()) { + if (this->_shared_types) { + this->k_lj_fast.set_size(GX,BX); + this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(), + &this->lj3.begin(), &this->special_lj.begin(), &stride, + &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), + &eflag, &vflag, &this->_last_ellipse, &ainum, + &this->_threads_per_atom); + } else { + this->k_lj.set_size(GX,BX); + this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(), + &this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(), + &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, + &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + } + } + this->time_lj.stop(); + } else { + GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + NGX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + this->time_nbor1.start(); + this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, + ELLIPSE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor1.stop(); + this->time_ellipsoid.start(); + this->k_ellipsoid.set_size(GX,BX); + this->k_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), 
&this->shape.begin(), &this->well.begin(), + &this->special_lj.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride, + &this->ans->dev_ans.begin(), &ainum, &this->ans->dev_engv.begin(), + &this->dev_error.begin(), &eflag, &vflag, &ainum, + &this->_threads_per_atom); + this->time_ellipsoid.stop(); + } +} + +template class RESquared; + diff --git a/lib/gpu/re_squared.h b/lib/gpu/re_squared.h new file mode 100644 index 0000000000..d0c8a4dd4e --- /dev/null +++ b/lib/gpu/re_squared.h @@ -0,0 +1,91 @@ +/*************************************************************************** + re_squared.h + ------------------- + W. Michael Brown + + Host code for RE-Squared potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Fri May 06 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef RE_SQUARED_H +#define RE_SQUARED_H + +#include "base_ellipsoid.h" +#include "mpi.h" + +namespace LAMMPS_AL { + +template +class RESquared : public BaseEllipsoid { + public: + RESquared(); + ~RESquared(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param gpu_nbor true if neighboring performed on device + * \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \return false if there is not sufficient memory or device init prob + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, 
double **host_epsilon, + int **h_form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Device Error Flag - Set if a bad matrix inversion occurs + UCL_D_Vec dev_error; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon + UCL_D_Vec sigma_epsilon; + /// special lj 0-4 + UCL_D_Vec special_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool _shared_types; + int _lj_types; + + // --------------------------- ATOM DATA -------------------------- + + /// Aspherical Const Data for Atoms + UCL_D_Vec shape, well; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/re_squared_ext.cpp b/lib/gpu/re_squared_ext.cpp new file mode 100644 index 0000000000..77835066e2 --- /dev/null +++ b/lib/gpu/re_squared_ext.cpp @@ -0,0 +1,138 @@ +/*************************************************************************** + re_squared_ext.cpp + ------------------- + W. 
Michael Brown + + LAMMPS Wrappers for RE-Squared Acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "re_squared.h" + +using namespace std; +using namespace LAMMPS_AL; + +static RESquared REMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, + double **sigma, double **epsilon, + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + REMF.clear(); + gpu_mode=REMF.device->gpu_mode(); + double gpu_split=REMF.device->particle_split(); + int first_gpu=REMF.device->first_device(); + int last_gpu=REMF.device->last_device(); + int world_me=REMF.device->world_me(); + int gpu_rank=REMF.device->gpu_rank(); + int procs_per_gpu=REMF.device->procs_per_gpu(); + + REMF.device->init_message(screen,"resquared",first_gpu,last_gpu); + + bool message=false; + if (REMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, + form, host_lj1, host_lj2, host_lj3, host_lj4, offset, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, + gpu_split, screen); + + 
REMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + REMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Clear memory on host and device +// --------------------------------------------------------------------------- +void re_gpu_clear() { + REMF.clear(); +} + + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double **host_quat); + +int** re_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + double **host_quat) { + return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, + host_start, ilist, jnum, cpu_time, success, host_quat); +} + +int * re_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat) { + return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist, + numj, firstneigh, eflag, vflag, eatom, vatom, host_start, + cpu_time, success, host_quat); +} + +// --------------------------------------------------------------------------- +// Return memory 
usage +// --------------------------------------------------------------------------- +double re_gpu_bytes() { + return REMF.host_memory_usage(); +} +