git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7280 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2011-12-02 16:01:04 +00:00
parent 4fe24446de
commit 00dc2b891f
110 changed files with 0 additions and 28102 deletions

View File

@@ -1,41 +0,0 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /usr/local/cuda
NVCC = nvcc
CUDA_ARCH = -arch=sm_11
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
#CUDA_OPTS = -DUNIX -g -G
CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK -fopenmp
CUDR_OPTS = -g -Wall -O2 -DUCL_NO_EXIT # -xHost -no-prec-div -ansi-alias
#CUDR_OPTS = -g -Wall -DUCL_SYNC_DEBUG
BIN_DIR = /home/wb8/bin
OBJ_DIR = /home/wb8/obj/lammps
LIB_DIR = /home/wb8/obj/lammps
AR = ar
BSH = /bin/sh
include Nvidia.makefile

View File

@@ -1,31 +0,0 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK -I/usr/local/cuda/include/
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_SINGLE
BIN_DIR = /home/wb8/bin
OBJ_DIR = /home/wb8/obj/lammps
LIB_DIR = /home/wb8/obj/lammps
AR = ar
BSH = /bin/sh
include Opencl.makefile

View File

@@ -1,289 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "atomic_gpu_memory.h"
#define AtomicGPUMemoryT AtomicGPUMemory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
AtomicGPUMemoryT::~AtomicGPUMemory() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==false) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return 0;
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(AtomicGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
_compiled=true;
}
template class AtomicGPUMemory<PRECISION,ACC_PRECISION>;
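A minimal sketch of how a concrete pair style might specialize this base class; the class name LJGPUMemory and the kernel source string lj_kernel_str are hypothetical, and only init_atomic(), loop() and the kernel handles come from the interface above.
template <class numtyp, class acctyp>
class LJGPUMemory : public AtomicGPUMemory<numtyp,acctyp> {
 public:
  int init(const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, const char *lj_kernel_str) {
    // forward to the base-class setup with the style's kernel source
    return this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
                             gpu_split,screen,lj_kernel_str);
  }
 protected:
  virtual void loop(const bool eflag, const bool vflag) {
    // enqueue this->k_pair_fast or this->k_pair here with the style's
    // coefficient arrays and this->_nbor_data
  }
};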

View File

@@ -1,206 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef ATOMIC_GPU_MEMORY_H
#define ATOMIC_GPU_MEMORY_H
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
template <class numtyp, class acctyp>
class AtomicGPUMemory {
public:
AtomicGPUMemory();
virtual ~AtomicGPUMemory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success))
pos_tex.bind_float(atom->dev_x,4);
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number of local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
protected:
bool _compiled;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
#endif
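A hedged sketch of the per-timestep call order this interface implies when the host builds the neighbor list; the object lj and the loose locals are placeholders, not part of the library.
// One timestep with CPU neighboring (placeholder object and locals):
bool success=true;
int host_start;
lj.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
           eflag,vflag,eatom,vatom,host_start,cpu_time,success);
// atoms in [host_start,inum_full) stay on the CPU under the current split
if (!success) { /* device out of memory: abort or fall back to the CPU */ }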

View File

@@ -1,470 +0,0 @@
/***************************************************************************
base_ellipsoid.cpp
-------------------
W. Michael Brown
Base class for acceleration of ellipsoid potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Thu May 5 2011
email : brownw@ornl.gov
***************************************************************************/
#include "base_ellipsoid.h"
using namespace LAMMPS_AL;
#ifdef USE_OPENCL
#include "ellipsoid_nbor_cl.h"
#else
#include "ellipsoid_nbor_ptx.h"
#endif
#define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
BaseEllipsoidT::~BaseEllipsoid() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int BaseEllipsoidT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseEllipsoidT::init_base(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const int ntypes, int **h_form,
const char *ellipsoid_program,
const char *lj_program, const bool ellip_sphere) {
nbor_time_avail=false;
screen=_screen;
_ellipsoid_sphere=ellip_sphere;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,true);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_lj.init(*ucl_device);
time_nbor1.init(*ucl_device);
time_ellipsoid.init(*ucl_device);
time_nbor2.init(*ucl_device);
time_ellipsoid2.init(*ucl_device);
time_nbor3.init(*ucl_device);
time_ellipsoid3.init(*ucl_device);
time_lj.zero();
time_nbor1.zero();
time_ellipsoid.zero();
time_nbor2.zero();
time_ellipsoid2.zero();
time_nbor3.zero();
time_ellipsoid3.zero();
// See if we want fast GB-sphere or sphere-sphere calculations
_host_form=h_form;
_multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (_host_form[i][j]!=ELLIPSE_ELLIPSE)
_multiple_forms=true;
if (_multiple_forms && host_nlocal>0) {
std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n";
exit(1);
}
if (_multiple_forms)
ans->dev_ans.zero();
// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS)
return -3;
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::clear_base() {
// Output any timing information
output_times();
host_olist.clear();
if (_compiled) {
k_nbor_fast.clear();
k_nbor.clear();
k_ellipsoid.clear();
k_ellipsoid_sphere.clear();
k_sphere_ellipsoid.clear();
k_lj_fast.clear();
k_lj.clear();
delete nbor_program;
delete ellipsoid_program;
delete lj_program;
_compiled=false;
}
time_nbor1.clear();
time_ellipsoid.clear();
time_nbor2.clear();
time_ellipsoid2.clear();
time_nbor3.clear();
time_ellipsoid3.clear();
time_lj.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::output_times() {
// Output any timing information
acc_timers();
double single[9], times[9];
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_nbor1.total_seconds()+time_nbor2.total_seconds()+
time_nbor3.total_seconds()+nbor->time_nbor.total_seconds();
single[3]=time_ellipsoid.total_seconds()+time_ellipsoid2.total_seconds()+
time_ellipsoid3.total_seconds();
if (_multiple_forms)
single[4]=time_lj.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time()+ans->cast_time();
single[6]=_gpu_overhead;
single[7]=_driver_overhead;
single[8]=ans->cpu_idle_time();
MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
double max_mb=mpi_max_bytes/(1024*1024);
if (device->replica_me()==0)
if (screen && times[5]>0.0) {
int replica_size=device->replica_size();
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
if (nbor->gpu_nbor())
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
}
_max_bytes=0.0;
}
// ---------------------------------------------------------------------------
// Pack neighbors to limit thread divergence for lj-lj and ellipse
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
const int inum, const int form_low,
const int form_high, const bool shared_types,
int ntypes) {
int stride=nbor->nbor_pitch();
if (shared_types) {
k_nbor_fast.set_size(GX,BX);
k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(),
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
} else {
k_nbor.set_size(GX,BX);
k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes,
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
}
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
const int osize, int *ilist,
int *numj, int *type, int **firstneigh,
bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(osize,numj,ilist);
resize_atom(nall,success);
resize_local(inum,0,mn,osize,success);
if (!success)
return;
if (_multiple_forms) {
int p=0;
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (_host_form[itype][itype]==ELLIPSE_ELLIPSE) {
host_olist[p]=ilist[i];
p++;
}
}
_max_last_ellipse=p;
_last_ellipse=std::min(inum,_max_last_ellipse);
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (_host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
host_olist[p]=ilist[i];
p++;
}
}
nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
return;
}
_last_ellipse=inum;
_max_last_ellipse=inum;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),0,success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->copy_unpacked(inum,mn);
_last_ellipse=inum;
_max_last_ellipse=inum;
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
acc_timers();
if (inum_full==0) {
host_start=0;
zero_timers();
return NULL;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, inum_full, ilist, numj, host_type, firstneigh,
success);
if (!success)
return NULL;
}
int *list;
if (_multiple_forms)
list=host_olist.begin();
else
list=ilist;
atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_quat_data();
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,list);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return list;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_quat) {
acc_timers();
if (inum_full==0) {
host_start=0;
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_quat_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseEllipsoidT::host_memory_usage_base() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseEllipsoid<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
const char *ellipsoid_string,
const char *lj_string, const bool e_s) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
nbor_program=new UCL_Program(dev);
nbor_program->load_string(ellipsoid_nbor,flags.c_str());
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
k_nbor.set_function(*nbor_program,"kernel_nbor");
ellipsoid_program=new UCL_Program(dev);
ellipsoid_program->load_string(ellipsoid_string,flags.c_str());
k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid");
lj_program=new UCL_Program(dev);
lj_program->load_string(lj_string,flags.c_str());
k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid");
k_lj_fast.set_function(*lj_program,"kernel_lj_fast");
k_lj.set_function(*lj_program,"kernel_lj");
if (e_s)
k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere");
_compiled=true;
}
template class BaseEllipsoid<PRECISION,ACC_PRECISION>;
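A hedged fragment of what a derived loop() might do with pack_nbors to keep a thread block on one branch; shared_types and _lj_types are assumed names, while pack_nbors, block_size() and _last_ellipse come from this class.
const int BX=this->block_size();
const int GX=static_cast<int>(ceil(static_cast<double>
                              (this->_last_ellipse)/BX));
// pack the ellipse-ellipse pairs first to limit thread divergence
this->pack_nbors(GX,BX,0,this->_last_ellipse,ELLIPSE_ELLIPSE,
                 ELLIPSE_ELLIPSE,shared_types,_lj_types);
// ...then enqueue k_ellipsoid on the packed list; when _multiple_forms is
// set, the sphere-ellipsoid and lj neighbors get their own pack_nbors pass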

View File

@@ -1,255 +0,0 @@
/***************************************************************************
base_ellipsoid.h
-------------------
W. Michael Brown
Base class for acceleration of ellipsoid potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Thu May 5 2011
email : brownw@ornl.gov
***************************************************************************/
#ifndef BASE_ELLIPSOID_H
#define BASE_ELLIPSOID_H
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseEllipsoid {
public:
BaseEllipsoid();
virtual ~BaseEllipsoid();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init_base(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const int ntypes,
int **h_form, const char *ellipsoid_program,
const char *lj_program, const bool ellipsoid_sphere=false);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int nall, bool &success) {
atom->resize(nall, success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring
* \note if GPU is neighboring nlocal+host_inum=total number of local particles
* \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/
inline void resize_local(const int nlocal, const int host_inum,
const int max_nbors, const int olist_size,
bool &success) {
ans->resize(nlocal, success);
if (_multiple_forms) ans->dev_ans.zero();
if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_base();
/// Output any timing information
void output_times();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_base() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_nbor.add_to_total();
nbor_time_avail=false;
}
time_nbor1.add_to_total();
time_ellipsoid.add_to_total();
if (_multiple_forms) {
time_nbor2.add_to_total();
time_ellipsoid2.add_to_total();
if (_ellipsoid_sphere) {
time_nbor3.add_to_total();
time_ellipsoid3.add_to_total();
}
time_lj.add_to_total();
}
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_nbor1.zero();
time_ellipsoid.zero();
if (_multiple_forms) {
time_nbor2.zero();
time_ellipsoid2.zero();
if (_ellipsoid_sphere) {
time_nbor3.zero();
time_ellipsoid3.zero();
}
time_lj.zero();
}
atom->zero_timers();
ans->zero_timers();
}
/// Pack neighbors to limit thread divergence for lj-lj and ellipse
void pack_nbors(const int GX, const int BX, const int start, const int inum,
const int form_low, const int form_high,
const bool shared_types, int ntypes);
/// Copy neighbor list from host
void reset_nbors(const int nall, const int inum, const int osize, int *ilist,
int *numj, int *type, int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
int* compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **quat);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat);
/// Build neighbor list on accelerator
void build_nbor_list(const int inum, const int host_inum, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_nbor1, time_ellipsoid, time_nbor2, time_ellipsoid2, time_lj;
UCL_Timer time_nbor3, time_ellipsoid3;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// --------------------------- TYPE DATA --------------------------
/// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec<numtyp2> cut_form;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// ilist with particles sorted by type
UCL_H_Vec<int> host_olist;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *nbor_program, *ellipsoid_program, *lj_program;
UCL_Kernel k_nbor_fast, k_nbor;
UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid;
UCL_Kernel k_lj_fast, k_lj;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled, _ellipsoid_sphere;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
// True if we want to use fast GB-sphere or sphere-sphere calculations
bool _multiple_forms;
int **_host_form;
int _last_ellipse, _max_last_ellipse;
void compile_kernels(UCL_Device &dev, const char *ellipsoid_string,
const char *lj_string, const bool e_s);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
}
#endif
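For completeness, an assumed caller-side pattern for the return codes documented on init_base(); the object name gb and the recovery comments are illustrative.
int err=gb.init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                     screen,ntypes,h_form,ellipsoid_str,lj_str);
if (err==-3) {        // out of memory on the device
  // reduce atoms per GPU or disable acceleration
} else if (err==-5) { // card lacks double precision support
  // rebuild the library in single precision
} else if (err!=0) {  // -1, -4, or any other failure
  // report the error and fall back to the CPU pair style
}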

View File

@@ -1,306 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "charge_gpu_memory.h"
#define ChargeGPUMemoryT ChargeGPUMemory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
ChargeGPUMemoryT::~ChargeGPUMemory() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==false) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return success;
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_q_data();
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** ChargeGPUMemoryT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_q_data(host_q);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_q_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double ChargeGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(ChargeGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
_compiled=true;
}
template class ChargeGPUMemory<PRECISION,ACC_PRECISION>;
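Relative to the atomic base class, the charge path adds one more host-to-device stream per step; the calls below are condensed from compute() above, with the purpose of precompute() stated as an assumption.
atom->cast_q_data(host_q);   // pack per-atom charges on the host
atom->add_q_data();          // copy to device; kernels read them via q_tex
device->precompute(ago,nlocal,nall,host_x,host_type,success,host_q,
                   boxlo,prd); // hook run before the pair loop, presumably
                               // for long-range (k-space) style setup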

View File

@@ -1,203 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CHARGE_GPU_MEMORY_H
#define CHARGE_GPU_MEMORY_H
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
template <class numtyp, class acctyp>
class ChargeGPUMemory {
public:
ChargeGPUMemory();
virtual ~ChargeGPUMemory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number of local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge,
const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
#endif

View File

@@ -1,122 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmm_cut_gpu_memory.h"
using namespace std;
static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
CMMMF.clear();
gpu_mode=CMMMF.device->gpu_mode();
double gpu_split=CMMMF.device->particle_split();
int first_gpu=CMMMF.device->first_device();
int last_gpu=CMMMF.device->last_device();
int world_me=CMMMF.device->world_me();
int gpu_rank=CMMMF.device->gpu_rank();
int procs_per_gpu=CMMMF.device->procs_per_gpu();
CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu);
bool message=false;
if (CMMMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
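// Staged setup: process 0 initializes (and compiles the kernels) first,
// all ranks meet at the world barrier, then the ranks sharing each GPU
// initialize one at a time between gpu_barrier() calls.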
int init_ok=0;
if (world_me==0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CMMMF.estimate_gpu_overhead();
return init_ok;
}
void cmm_gpu_clear() {
CMMMF.clear();
}
int** cmm_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double cmm_gpu_bytes() {
return CMMMF.host_memory_usage();
}
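/* Usage sketch (hypothetical caller; in LAMMPS the GPU variant of the
   cg/cmm pair style plays this role): call cmm_gpu_init() once during
   setup and check its return code, then each timestep call
   cmm_gpu_compute_n() when neighboring is done on the device or
   cmm_gpu_compute() with a host-built neighbor list, and finally
   cmm_gpu_clear() at teardown. cmm_gpu_bytes() reports host memory use. */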

View File

@ -1,403 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMM_GPU_KERNEL
#define CMM_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
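// The block above is a portability shim: under NV_KERNEL the kernels
// compile as CUDA with texture fetches for positions, otherwise as
// OpenCL, where the fetch macros degrade to plain global-memory reads.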
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
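// Neighbor entries are bit-packed: bits 30-31 give the special-bond slot
// (0 = plain pair; 1-3 = 1-2/1-3/1-4 neighbors in the LAMMPS convention)
// and NEIGHMASK clears them to recover the bare atom index.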
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
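// Two neighbor layouts: if the list is already packed (dev_nbor ==
// dev_packed), the neighbors of atom ii sit in a column with row pitch
// nbor_pitch, so each of the t_per_atom cooperating threads strides by
// t_per_atom*nbor_pitch; otherwise the row holds an offset into
// dev_packed, where this atom's neighbors are contiguous.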
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].x) {
r2inv=(numtyp)1.0/r2inv;
numtyp inv1,inv2;
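// lj1[mtype].y is the CG-CMM form flag: 2 -> LJ 12-4, 1 -> LJ 9-6,
// anything else -> LJ 12-6; inv1/inv2 are the matching powers of 1/r^2
// so that energy ~ inv1*(a*inv2 - b).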
if (lj1[mtype].y == 2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == 1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
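// Binary-tree reduction in local memory across the t_per_atom threads
// cooperating on atom ii; the thread with offset 0 ends up with the totals.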
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].x) {
r2inv=(numtyp)1.0/r2inv;
numtyp inv1,inv2;
if (lj1[mtype].y == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
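As a quick sanity check on the exponent selection above, here is a small
standalone C++ program (not part of the library; all names are
illustrative) that reproduces the inv1/inv2 trick on the CPU and compares
it against direct pow() evaluation:

// cg_form_check.cpp -- standalone, hypothetical test harness.
#include <cstdio>
#include <cmath>

// Same exponent trick as kernel_pair above: build r^-n from powers of 1/r^2.
double cg_energy(int cg_type, double r, double a, double b) {
  double r2inv = 1.0/(r*r), inv1, inv2;
  if (cg_type == 2) {          // LJ 12-4: inv1 = r^-4, inv1*inv2 = r^-12
    inv1 = r2inv*r2inv;
    inv2 = inv1*inv1;
  } else if (cg_type == 1) {   // LJ 9-6: inv1 = r^-6, inv1*inv2 = r^-9
    inv2 = r2inv*std::sqrt(r2inv);
    inv1 = inv2*inv2;
  } else {                     // LJ 12-6: inv1 = r^-6, inv1*inv2 = r^-12
    inv1 = r2inv*r2inv*r2inv;
    inv2 = inv1;
  }
  return inv1*(a*inv2 - b);
}

int main() {
  double r = 1.3, a = 4.0, b = 4.0;
  printf("12-4: %.10g vs %.10g\n", cg_energy(2,r,a,b),
         a*std::pow(r,-12.0)-b*std::pow(r,-4.0));
  printf("9-6 : %.10g vs %.10g\n", cg_energy(1,r,a,b),
         a*std::pow(r,-9.0)-b*std::pow(r,-6.0));
  printf("12-6: %.10g vs %.10g\n", cg_energy(0,r,a,b),
         a*std::pow(r,-12.0)-b*std::pow(r,-6.0));
  return 0;
}

Each printed pair should agree, confirming that the flag values map to the
12-4, 9-6, and 12-6 forms.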

View File

@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmm_cut_gpu_cl.h"
#else
#include "cmm_cut_gpu_ptx.h"
#endif
#include "cmm_cut_gpu_memory.h"
#include <cassert>
#define CMM_GPU_MemoryT CMM_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMM_GPU_MemoryT::CMM_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CMM_GPU_MemoryT::~CMM_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmm_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int cmm_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) {
cmm_types=max_shared_types;
shared_types=true;
}
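// Padding the type count up to max_shared_types lets kernel_pair_fast
// index the pair tables as itype*MAX_SHARED_TYPES+jtype directly from
// shared memory.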
_cmm_types=cmm_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<cmm_types*cmm_types; i++)
host_write[i]=0.0;
lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
host_cg_type,host_lj1,host_lj2);
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMM_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMM_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate per-atom energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
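// Each atom gets _threads_per_atom threads, so one block of BX threads
// covers BX/_threads_per_atom atoms and GX blocks cover all inum atoms.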
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CMM_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMM_GPU_MEMORY_H
#define CMM_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
CMM_GPU_Memory();
~CMM_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y=cg_type, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _cmm_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,130 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmmc_long_gpu_memory.h"
using namespace std;
static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
CMMLMF.clear();
gpu_mode=CMMLMF.device->gpu_mode();
double gpu_split=CMMLMF.device->particle_split();
int first_gpu=CMMLMF.device->first_device();
int last_gpu=CMMLMF.device->last_device();
int world_me=CMMLMF.device->world_me();
int gpu_rank=CMMLMF.device->gpu_rank();
int procs_per_gpu=CMMLMF.device->procs_per_gpu();
CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu);
bool message=false;
if (CMMLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
CMMLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CMMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CMMLMF.estimate_gpu_overhead();
return init_ok;
}
void cmml_gpu_clear() {
CMMLMF.clear();
}
int** cmml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q,boxlo,prd);
}
void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double cmml_gpu_bytes() {
return CMMLMF.host_memory_usage();
}

View File

@ -1,485 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMML_GPU_KERNEL
#define CMML_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
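// EWALD_F = 2/sqrt(pi); EWALD_P and A1-A5 are the coefficients of the
// Abramowitz & Stegun 7.1.26 rational approximation
// erfc(x) ~ t*(A1+t*(A2+t*(A3+t*(A4+t*A5))))*exp(-x*x), t = 1/(1+EWALD_P*x),
// evaluated inline in the kernels below for the real-space Ewald term.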
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
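// sp_lj[4..7] hold special_coul; storing factor_coul as 1 - special_coul
// lets the special-bond correction be subtracted directly inside the
// erfc term below.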
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
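// Here lj3[mtype].x carries the CG-CMM form flag (2 -> LJ 12-4,
// 1 -> LJ 9-6, else LJ 12-6) and lj1[mtype].y is the LJ cutoff squared.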
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
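The erfc evaluation above can be checked in isolation; a minimal
standalone program (illustrative only, assuming nothing beyond <cmath>)
compares the inlined approximation against std::erfc:

// erfc_check.cpp -- standalone verification of the A&S 7.1.26 fit.
#include <cstdio>
#include <cmath>

int main() {
  const double EWALD_P = 0.3275911;
  const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741,
               A4 = -1.453152027, A5 =  1.061405429;
  for (double x = 0.5; x <= 2.51; x += 0.5) {
    double t = 1.0/(1.0 + EWALD_P*x);
    double approx = t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5))))*std::exp(-x*x);
    printf("x=%.1f  approx=%.8f  std::erfc=%.8f\n", x, approx, std::erfc(x));
  }
  return 0;
}

The two columns should agree to roughly 1e-7, the stated accuracy of this
approximation.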

View File

@ -1,170 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmmc_long_gpu_cl.h"
#else
#include "cmmc_long_gpu_ptx.h"
#endif
#include "cmmc_long_gpu_memory.h"
#include <cassert>
#define CMML_GPU_MemoryT CMML_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMML_GPU_MemoryT::CMML_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CMML_GPU_MemoryT::~CMML_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_long_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq,
host_cut_ljsq,host_lj1,host_lj2);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3,
host_lj4,host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMML_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMML_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate per-atom energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CMML_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,82 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMML_GPU_MEMORY_H
#define CMML_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CMML_GPU_Memory();
~CMML_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,131 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmmc_msm_gpu_memory.h"
using namespace std;
static CMMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmmm_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const int smooth) {
CMMMMF.clear();
gpu_mode=CMMMMF.device->gpu_mode();
double gpu_split=CMMMMF.device->particle_split();
int first_gpu=CMMMMF.device->first_device();
int last_gpu=CMMMMF.device->last_device();
int world_me=CMMMMF.device->world_me();
int gpu_rank=CMMMMF.device->gpu_rank();
int procs_per_gpu=CMMMMF.device->procs_per_gpu();
CMMMMF.device->init_message(screen,"cg/cmm/coul/msm",first_gpu,last_gpu);
bool message=false;
if (CMMMMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CMMMMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,smooth);
CMMMMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMMMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e,smooth);
CMMMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CMMMMF.estimate_gpu_overhead();
return init_ok;
}
void cmmm_gpu_clear() {
CMMMMF.clear();
}
int** cmmm_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CMMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void cmmm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CMMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double cmmm_gpu_bytes() {
return CMMMMF.host_memory_usage();
}

View File

@ -1,531 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMMM_GPU_KERNEL
#define CMMM_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const int smooth, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
__local numtyp _ia;
__local numtyp _ia2;
__local numtyp _ia3;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
_ia=(numtyp)-1.0/sqrt(cut_coulsq);
// Sign fix: _ia2 must be positive (as in kernel_pair_fast below) so that
// r2_ia2 equals rsq/cut_coulsq; a negative _ia2 flips the polynomial
// terms and breaks the energy zero at the cutoff.
_ia2=(numtyp)1.0/cut_coulsq;
_ia3=_ia2*_ia;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
numtyp ir, r2_ia2, r4_ia4, r6_ia6;
if (rsq < cut_coulsq) {
ir = (numtyp)1.0/sqrt(rsq);
prefactor = qqrd2e*qtmp*fetch_q(j,q_);
r2_ia2 = rsq*_ia2;
r4_ia4 = r2_ia2*r2_ia2;
if (smooth==0)
forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
(numtyp)1.875*r4_ia4)-ir/rsq-
factor_coul*ir);
else {
r6_ia6 = r2_ia2*r4_ia4;
forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
r2_ia2-(numtyp)8.4375*r4_ia4+
(numtyp)2.1875*r6_ia6)-ir/rsq-
factor_coul*ir);
}
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = forcecoul + force_lj * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
if (smooth==0)
e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
(numtyp)1.3125*r4_ia4-
(numtyp)0.3125*r4_ia4*r2_ia2)-
factor_coul*ir);
else
e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
r2_ia2+(numtyp)2.953125*r4_ia4-
(numtyp)1.40625*r6_ia6+
(numtyp)0.2734375*r4_ia4*r4_ia4));
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const int smooth, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__local numtyp _ia;
__local numtyp _ia2;
__local numtyp _ia3;
_ia=(numtyp)-1.0/sqrt(cut_coulsq);
_ia2=(numtyp)1.0/cut_coulsq;
_ia3=_ia2*_ia;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
numtyp ir, r2_ia2, r4_ia4, r6_ia6;
if (rsq < cut_coulsq) {
ir = (numtyp)1.0/sqrt(rsq);
prefactor = qqrd2e*qtmp*fetch_q(j,q_);
r2_ia2 = rsq*_ia2;
r4_ia4 = r2_ia2*r2_ia2;
if (smooth==0)
forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
(numtyp)1.875*r4_ia4)-ir/rsq-
factor_coul*ir);
else {
r6_ia6 = r2_ia2*r4_ia4;
forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
r2_ia2-(numtyp)8.4375*r4_ia4+
(numtyp)2.1875*r6_ia6)-ir/rsq-
factor_coul*ir);
}
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = forcecoul + force_lj * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
if (smooth==0)
e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
(numtyp)1.3125*r4_ia4-
(numtyp)0.3125*r4_ia4*r2_ia2)-
factor_coul*ir);
else
e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
r2_ia2+(numtyp)2.953125*r4_ia4-
(numtyp)1.40625*r6_ia6+
(numtyp)0.2734375*r4_ia4*r4_ia4));
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
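A standalone check (illustrative only) that the smooth==0 MSM energy
expression above vanishes at the Coulomb cutoff when _ia2 is taken
positive, matching the kernel_pair_fast convention:

// msm_cutoff_check.cpp -- standalone; evaluates the smoothed 1/r kernel
// without the charge prefactor or special-bond correction.
#include <cstdio>
#include <cmath>

int main() {
  double rc = 10.0, cut_coulsq = rc*rc;
  double _ia  = -1.0/std::sqrt(cut_coulsq);   // -1/rc
  double _ia2 =  1.0/cut_coulsq;              // +1/rc^2
  for (double r = 2.5; r <= rc; r += 2.5) {
    double ir = 1.0/r;
    double r2_ia2 = r*r*_ia2, r4_ia4 = r2_ia2*r2_ia2;
    double e = ir + _ia*(2.1875 - 2.1875*r2_ia2 + 1.3125*r4_ia4
                         - 0.3125*r4_ia4*r2_ia2);
    printf("r = %5.2f   e = % .6e\n", r, e);  // tends to 0 as r -> rc
  }
  return 0;
}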

View File

@ -1,169 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmmc_msm_gpu_cl.h"
#else
#include "cmmc_msm_gpu_ptx.h"
#endif
#include "cmmc_msm_gpu_memory.h"
#include <cassert>
#define CMMM_GPU_MemoryT CMMM_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMMM_GPU_MemoryT::CMMM_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CMMM_GPU_MemoryT::~CMMM_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CMMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const int smooth) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_msm_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq,
host_cut_ljsq,host_lj1,host_lj2);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3,
host_lj4,host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_smooth=smooth;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CMMM_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMMM_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMMM_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
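// each block of BX threads handles BX/t_per_atom atoms, so launch
// enough blocks to cover all inum atoms owned by this process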
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_smooth, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_smooth, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CMMM_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,83 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMMM_GPU_MEMORY_H
#define CMMM_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CMMM_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CMMM_GPU_Memory();
~CMMM_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const int smooth);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e;
private:
bool _allocated;
int _smooth;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,124 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "coul_long_gpu_memory.h"
using namespace std;
static CL_GPU_Memory<PRECISION,ACC_PRECISION> CLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cl_gpu_init(const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald) {
CLMF.clear();
gpu_mode=CLMF.device->gpu_mode();
double gpu_split=CLMF.device->particle_split();
int first_gpu=CLMF.device->first_device();
int last_gpu=CLMF.device->last_device();
int world_me=CLMF.device->world_me();
int gpu_rank=CLMF.device->gpu_rank();
int procs_per_gpu=CLMF.device->procs_per_gpu();
CLMF.device->init_message(screen,"coul/long",first_gpu,last_gpu);
bool message=false;
if (CLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_coulsq, host_special_coul, qqrd2e,
g_ewald);
CLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CLMF.estimate_gpu_overhead();
return init_ok;
}
void cl_gpu_clear() {
CLMF.clear();
}
int** cl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void cl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double cl_gpu_bytes() {
return CLMF.host_memory_usage();
}

View File

@ -1,411 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CL_GPU_KERNEL
#define CL_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_cl_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_cl[4];
sp_cl[0]=sp_cl_in[0];
sp_cl[1]=sp_cl_in[1];
sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3];
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp force, prefactor, _erfc;
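// real-space Ewald term: erfc(g_ewald*r) is evaluated with the
// Abramowitz & Stegun 7.1.26 polynomial approximation (A1-A5, EWALD_P)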
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
e_coul=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=(acctyp)0;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_cl_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_cl[4];
if (tid<4)
sp_cl[tid]=sp_cl_in[tid];
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
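// barrier so every thread sees the sp_cl values loaded into shared
// memory above before they are used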
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp force, prefactor, _erfc;
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
e_coul=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=(acctyp)0;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif

View File

@ -1,158 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "coul_long_gpu_cl.h"
#else
#include "coul_long_gpu_ptx.h"
#endif
#include "coul_long_gpu_memory.h"
#include <cassert>
#define CL_GPU_MemoryT CL_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CL_GPU_MemoryT::CL_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CL_GPU_MemoryT::~CL_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CL_GPU_MemoryT::init(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,coul_long_gpu_kernel);
if (success!=0)
return success;
// we don't have atom types for coulomb only,
// but go with the minimum so that we can use
// the same infrastructure as lj/cut/coul/long/gpu.
int lj_types=1;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_coul[i];
}
ucl_copy(sp_cl,host_write,4,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_cl.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CL_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_cl.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CL_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CL_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_cl.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CL_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,79 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CL_GPU_MEMORY_H
#define CL_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CL_GPU_Memory();
~CL_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1 dummy
UCL_D_Vec<numtyp4> lj1;
/// lj3 dummy
UCL_D_Vec<numtyp4> lj3;
/// Special Coul values [0-3]
UCL_D_Vec<numtyp> sp_cl;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,136 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "crml_gpu_memory.h"
using namespace std;
static CRML_GPU_Memory<PRECISION,ACC_PRECISION> CRMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
CRMLMF.clear();
gpu_mode=CRMLMF.device->gpu_mode();
double gpu_split=CRMLMF.device->particle_split();
int first_gpu=CRMLMF.device->first_device();
int last_gpu=CRMLMF.device->last_device();
int world_me=CRMLMF.device->world_me();
int gpu_rank=CRMLMF.device->gpu_rank();
int procs_per_gpu=CRMLMF.device->procs_per_gpu();
CRMLMF.device->init_message(screen,"lj/charmm/coul/long",first_gpu,last_gpu);
bool message=false;
if (CRMLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald,
cut_lj_innersq, denom_lj, epsilon, sigma, mix_arithmetic);
CRMLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
sigma, mix_arithmetic);
CRMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CRMLMF.estimate_gpu_overhead();
return init_ok;
}
void crml_gpu_clear() {
CRMLMF.clear();
}
int** crml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void crml_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd) {
CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double crml_gpu_bytes() {
return CRMLMF.host_memory_usage();
}

View File

@ -1,499 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CRML_GPU_KERNEL
#define CRML_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_BIO_PAIR 64
#endif
#define MAX_BIO_SHARED_TYPES 128
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cut_bothsq) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc, switch1;
if (rsq < cut_ljsq) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
if (rsq > cut_lj_innersq) {
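// CHARMM switching region: switch1 smoothly tapers the LJ energy
// between cut_lj_innersq and cut_ljsq; switch2 carries the force
// contribution from the derivative of the switching function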
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
denom_lj;
switch2 *= r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cut_bothsq) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, prefactor, _erfc, switch1;
numtyp lj3, lj4;
if (rsq < cut_ljsq) {
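// mix per-type CHARMM parameters on the fly: geometric mean for
// epsilon (ljd.x), arithmetic mean for sigma (ljd.y); lj3 and lj4
// are then the 12- and 6-power terms of the potential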
numtyp eps = sqrt(ljd[itype].x*ljd[jtype].x);
numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);
numtyp sig_r_6 = sig6*sig6*r2inv;
sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
lj4 = (numtyp)4.0*eps*sig_r_6;
lj3 = lj4*sig_r_6;
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
denom_lj;
switch2 *= lj3-lj4;
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
numtyp e=lj3-lj4;
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif

View File

@ -1,175 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "crml_gpu_cl.h"
#else
#include "crml_gpu_ptx.h"
#endif
#include "crml_gpu_memory.h"
#include <cassert>
#define CRML_GPU_MemoryT CRML_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CRML_GPU_MemoryT::CRML_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CRML_GPU_MemoryT::~CRML_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CRML_GPU_MemoryT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,crml_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (this->_block_bio_size>=64 && mix_arithmetic)
shared_types=true;
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
int h_size=lj_types*lj_types;
int max_bio_shared_types=this->device->max_bio_shared_types();
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<h_size*32; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_lj3,host_lj4);
ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_bothsq = host_cut_bothsq;
_cut_coulsq = host_cut_coulsq;
_cut_ljsq = host_cut_ljsq;
_cut_lj_innersq = cut_lj_innersq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_denom_lj=denom_lj;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CRML_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
ljd.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CRML_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CRML_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->_block_bio_size;
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CRML_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,86 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CRML_GPU_MEMORY_H
#define CRML_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CRML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CRML_GPU_Memory();
~CRML_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj,
double **epsilon, double **sigma, const bool mix_arithmetic);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// x = lj1, y = lj2, z = lj3, w = lj4
UCL_D_Vec<numtyp4> lj1;
/// x = epsilon, y = sigma
UCL_D_Vec<numtyp2> ljd;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e, _g_ewald, _denom_lj;
numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,424 +0,0 @@
// **************************************************************************
// ellipsoid_extra.h
// -------------------
// W. Michael Brown
//
// Device code for Ellipsoid math routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef ELLIPSOID_EXTRA_H
#define ELLIPSOID_EXTRA_H
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
/* ----------------------------------------------------------------------
dot product of 2 vectors
------------------------------------------------------------------------- */
__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}
/* ----------------------------------------------------------------------
cross product of 2 vectors
------------------------------------------------------------------------- */
__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
{
ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
}
/* ----------------------------------------------------------------------
determinant of a matrix
------------------------------------------------------------------------- */
__inline numtyp gpu_det3(const numtyp m[9])
{
numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
return ans;
}
/* ----------------------------------------------------------------------
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
__inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9],
numtyp ans[9])
{
ans[0] = shape.x*m[0];
ans[1] = shape.x*m[1];
ans[2] = shape.x*m[2];
ans[3] = shape.y*m[3];
ans[4] = shape.y*m[4];
ans[5] = shape.y*m[5];
ans[6] = shape.z*m[6];
ans[7] = shape.z*m[7];
ans[8] = shape.z*m[8];
}
/* ----------------------------------------------------------------------
add two matrices
------------------------------------------------------------------------- */
__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
{
ans[0] = m[0]+m2[0];
ans[1] = m[1]+m2[1];
ans[2] = m[2]+m2[2];
ans[3] = m[3]+m2[3];
ans[4] = m[4]+m2[4];
ans[5] = m[5]+m2[5];
ans[6] = m[6]+m2[6];
ans[7] = m[7]+m2[7];
ans[8] = m[8]+m2[8];
}
/* ----------------------------------------------------------------------
multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */
__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
}
/* ----------------------------------------------------------------------
row vector times matrix
------------------------------------------------------------------------- */
__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
{
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
}
/* ----------------------------------------------------------------------
solve Ax = b or M ans = v
use gaussian elimination & partial pivoting on matrix
error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */
__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
__global int *error_flag)
{
// create augmented matrix for pivoting
numtyp aug[12], t;
aug[3] = v[0];
aug[0] = m[0];
aug[1] = m[1];
aug[2] = m[2];
aug[7] = v[1];
aug[4] = m[3];
aug[5] = m[4];
aug[6] = m[5];
aug[11] = v[2];
aug[8] = m[6];
aug[9] = m[7];
aug[10] = m[8];
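  // partial pivoting: move the row with the largest leading magnitude into
  // the pivot position (rows are aug[0..3], aug[4..7], aug[8..11])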
if (fabs(aug[4]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
}
if (fabs(aug[8]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
}
  // fallback row selection when the pivot is zero (the original code
  // expressed this as unrolled branches with constant conditions such as
  // "if (0!=0)"; the dead branches and self-swaps are removed here)
  if (aug[0] == (numtyp)0.0) {
    if (aug[4] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
      swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
      swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
      swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
    } else if (aug[8] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
      swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
      swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
      swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
    } else
      *error_flag=2;
  }
t = aug[4]/aug[0];
aug[5]-=t*aug[1];
aug[6]-=t*aug[2];
aug[7]-=t*aug[3];
t = aug[8]/aug[0];
aug[9]-=t*aug[1];
aug[10]-=t*aug[2];
aug[11]-=t*aug[3];
if (fabs(aug[9]) > fabs(aug[5])) {
numtyp swapt;
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
}
  // fallback row selection for the second pivot (again with the constant
  // "if (1!=1)"-style dead branches removed; behavior is unchanged)
  if (aug[5] == (numtyp)0.0 && aug[9] != (numtyp)0.0) {
    numtyp swapt;
    swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
    swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
    swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
    swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
  }
t = aug[9]/aug[5];
aug[10]-=t*aug[6];
aug[11]-=t*aug[7];
if (aug[10] == (numtyp)0.0)
*error_flag=2;
ans[2] = aug[11]/aug[10];
t = (numtyp)0.0;
t += aug[6]*ans[2];
ans[1] = (aug[7]-t) / aug[5];
t = (numtyp)0.0;
t += aug[1]*ans[1];
t += aug[2]*ans[2];
ans[0] = (aug[3]-t) / aug[0];
}
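/* ----------------------------------------------------------------------
   [sketch] a minimal host-side check of the solver above -- not part of
   the library.  Assumes a plain C++ build where numtyp is double and
   __global expands to nothing, so gpu_mldivide3 can be called directly.
------------------------------------------------------------------------- */
#ifdef MLDIVIDE3_SELF_TEST
#include <cstdio>
#include <cmath>
int main() {
  // row-major A, chosen nonsingular; b = A*x_ref so the exact answer is known
  numtyp A[9] = {4.0, 1.0, 0.0,
                 1.0, 3.0, 1.0,
                 0.0, 1.0, 2.0};
  numtyp x_ref[3] = {1.0, -2.0, 3.0};
  numtyp b[3];
  for (int r = 0; r < 3; r++)
    b[r] = A[3*r]*x_ref[0] + A[3*r+1]*x_ref[1] + A[3*r+2]*x_ref[2];
  numtyp x[3];
  int err = 0;
  gpu_mldivide3(A, b, x, &err);                  // solve A x = b
  for (int r = 0; r < 3; r++)
    if (err == 2 || fabs(x[r]-x_ref[r]) > 1e-10) {
      printf("mldivide3: FAIL\n");
      return 1;
    }
  printf("mldivide3: OK\n");
  return 0;
}
#endif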
/* ----------------------------------------------------------------------
compute rotation matrix from quaternion conjugate
quat = [w i j k]
------------------------------------------------------------------------- */
__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
numtyp4 q=qif[qi];
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;
numtyp j2 = q.z*q.z;
numtyp k2 = q.w*q.w;
numtyp twoij = (numtyp)2.0*q.y*q.z;
numtyp twoik = (numtyp)2.0*q.y*q.w;
numtyp twojk = (numtyp)2.0*q.z*q.w;
numtyp twoiw = (numtyp)2.0*q.y*q.x;
numtyp twojw = (numtyp)2.0*q.z*q.x;
numtyp twokw = (numtyp)2.0*q.w*q.x;
mat[0] = w2+i2-j2-k2;
mat[3] = twoij-twokw;
mat[6] = twojw+twoik;
mat[1] = twoij+twokw;
mat[4] = w2-i2+j2-k2;
mat[7] = twojk-twoiw;
mat[2] = twoik-twojw;
mat[5] = twojk+twoiw;
mat[8] = w2-i2-j2+k2;
}
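/* ----------------------------------------------------------------------
   [sketch] quick host-side check of the convention above -- not library
   code.  With quat = [w i j k] = [1 0 0 0] the transposed rotation matrix
   must be the identity.  Assumes a host build where numtyp4 is a plain
   struct with .x/.y/.z/.w fields and __global expands to nothing.
------------------------------------------------------------------------- */
#ifdef QUAT_SELF_TEST
inline bool quat_identity_ok() {
  numtyp4 q[1];
  q[0].x = (numtyp)1.0;                          // w
  q[0].y = q[0].z = q[0].w = (numtyp)0.0;        // i, j, k
  numtyp m[9];
  gpu_quat_to_mat_trans(q, 0, m);
  for (int r = 0; r < 3; r++)
    for (int c = 0; c < 3; c++)
      if (m[3*r+c] != (r == c ? (numtyp)1.0 : (numtyp)0.0))
        return false;
  return true;
}
#endif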
/* ----------------------------------------------------------------------
transposed matrix times diagonal matrix
------------------------------------------------------------------------- */
__inline void gpu_transpose_times_diag3(const numtyp m[9],
const numtyp4 d, numtyp ans[9])
{
ans[0] = m[0]*d.x;
ans[1] = m[3]*d.y;
ans[2] = m[6]*d.z;
ans[3] = m[1]*d.x;
ans[4] = m[4]*d.y;
ans[5] = m[7]*d.z;
ans[6] = m[2]*d.x;
ans[7] = m[5]*d.y;
ans[8] = m[8]*d.z;
}
/* ----------------------------------------------------------------------
multiply mat1 times mat2
------------------------------------------------------------------------- */
__inline void gpu_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0] + m[1]*m2[3] + m[2]*m2[6];
ans[1] = m[0]*m2[1] + m[1]*m2[4] + m[2]*m2[7];
ans[2] = m[0]*m2[2] + m[1]*m2[5] + m[2]*m2[8];
ans[3] = m[3]*m2[0] + m[4]*m2[3] + m[5]*m2[6];
ans[4] = m[3]*m2[1] + m[4]*m2[4] + m[5]*m2[7];
ans[5] = m[3]*m2[2] + m[4]*m2[5] + m[5]*m2[8];
ans[6] = m[6]*m2[0] + m[7]*m2[3] + m[8]*m2[6];
ans[7] = m[6]*m2[1] + m[7]*m2[4] + m[8]*m2[7];
ans[8] = m[6]*m2[2] + m[7]*m2[5] + m[8]*m2[8];
}
/* ----------------------------------------------------------------------
Apply principal rotation generator about x to rotation matrix m
------------------------------------------------------------------------- */
__inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9])
{
ans[0] = 0;
ans[1] = -m[2];
ans[2] = m[1];
ans[3] = 0;
ans[4] = -m[5];
ans[5] = m[4];
ans[6] = 0;
ans[7] = -m[8];
ans[8] = m[7];
}
/* ----------------------------------------------------------------------
Apply principal rotation generator about y to rotation matrix m
------------------------------------------------------------------------- */
__inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9])
{
ans[0] = m[2];
ans[1] = 0;
ans[2] = -m[0];
ans[3] = m[5];
ans[4] = 0;
ans[5] = -m[3];
ans[6] = m[8];
ans[7] = 0;
ans[8] = -m[6];
}
/* ----------------------------------------------------------------------
Apply principal rotation generator about z to rotation matrix m
------------------------------------------------------------------------- */
__inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9])
{
ans[0] = -m[1];
ans[1] = m[0];
ans[2] = 0;
ans[3] = -m[4];
ans[4] = m[3];
ans[5] = 0;
ans[6] = -m[7];
ans[7] = m[6];
ans[8] = 0;
}
/* ----------------------------------------------------------------------
matrix times vector
------------------------------------------------------------------------- */
__inline void gpu_times_column3(const numtyp m[9], const numtyp v[3],
numtyp ans[3])
{
ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2];
ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2];
ans[2] = m[6]*v[0] + m[7]*v[1] + m[8]*v[2];
}
#endif


@ -1,165 +0,0 @@
// **************************************************************************
// ellipsoid_nbor.cu
// -------------------
// W. Michael Brown
//
// Device code for Ellipsoid neighbor routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef ELLIPSOID_NBOR_H
#define ELLIPSOID_NBOR_H
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#else
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
const int ntypes, __global int *dev_nbor,
const int nbor_pitch, const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
  // ii indexes the atom whose neighbor list is being unpacked
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
if (cf.y>=form_low && cf.y<=form_high) {
        // Compute r12
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cf.x) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
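// ---------------------------------------------------------------------------
// [sketch] the layout assumed above, for reference: neighbor data is stored
// column-major with row pitch nbor_pitch (in ints) so that threads with
// consecutive ii make coalesced reads.  For atom slot ii:
//   dev_nbor[ii]                    -> atom index i
//   dev_nbor[ii +       nbor_pitch] -> neighbor count numj
//   dev_nbor[ii + (2+k)*nbor_pitch] -> k-th packed neighbor of i
// A hypothetical host-side accessor with the same addressing:
// ---------------------------------------------------------------------------
inline int packed_neighbor(const int *dev_nbor, const int ii,
                           const int nbor_pitch, const int k) {
  return dev_nbor[ii+(2+k)*nbor_pitch];
}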
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
__global int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
int ii=THREAD_ID_X;
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[ii]=cut_form[ii].x;
form[ii]=cut_form[ii].y;
}
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
        // Compute r12
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
#endif


@ -1,307 +0,0 @@
/***************************************************************************
gayberne.cpp
-------------------
W. Michael Brown
Host code for Gay-Berne potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "gayberne_cl.h"
#else
#include "gayberne_ptx.h"
#endif
#include "gayberne.h"
#include <cassert>
using namespace LAMMPS_AL;
#define GayBerneT GayBerne<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
GayBerneT::GayBerne() : BaseEllipsoid<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
GayBerneT::~GayBerne() {
clear();
}
template <class numtyp, class acctyp>
int GayBerneT::bytes_per_atom(const int max_nbors) const {
  // the original line returned this->bytes_per_atom(max_nbors), which
  // recurses forever; qualifying the call forwards it to the base-class
  // accounting (assuming BaseEllipsoid provides it under the same name)
  return BaseEllipsoid<numtyp,acctyp>::bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int GayBerneT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ntypes,h_form,gayberne,gayberne_lj);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
_shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) {
lj_types=max_shared_types;
_shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*(this->ucl_device));
dev_error.zero();
// Allocate, cast and asynchronous memcpy of constant data
// Copy data for bonded interactions
gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
host_write[3]=static_cast<numtyp>(host_special_lj[0]);
host_write[4]=static_cast<numtyp>(host_special_lj[1]);
host_write[5]=static_cast<numtyp>(host_special_lj[2]);
host_write[6]=static_cast<numtyp>(host_special_lj[3]);
ucl_copy(gamma_upsilon_mu,host_write,7,false);
lshape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
UCL_H_Vec<double> d_view;
d_view.view(host_lshape,lshape.numel(),*(this->ucl_device));
ucl_copy(lshape,d_view,false);
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
// - cast if necessary
shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_shape[i][0];
host_write[i*4+1]=host_shape[i][1];
host_write[i*4+2]=host_shape[i][2];
}
UCL_H_Vec<numtyp4> view4;
view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device));
ucl_copy(shape,view4,false);
well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_well[i][0];
host_write[i*4+1]=host_well[i][1];
host_write[i*4+2]=host_well[i][2];
}
view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device));
ucl_copy(well,view4,false);
_allocated=true;
this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+
lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+
lshape.row_bytes()+shape.row_bytes()+well.row_bytes();
return 0;
}
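/* ----------------------------------------------------------------------
   [sketch] what the type_pack2 calls above amount to, spelled out in
   plain C++: an ntypes x ntypes pair table is flattened row-major into an
   lj_types x lj_types buffer (zero-padded when lj_types > ntypes).
   "Pair2" is a stand-in for numtyp2, and this simplified version ignores
   the 1-based indexing of LAMMPS type arrays.
------------------------------------------------------------------------- */
struct Pair2 { float x, y; };
inline void pack2_sketch(const int ntypes, const int lj_types,
                         double **a, double **b, Pair2 *flat) {
  for (int i = 0; i < lj_types; i++)
    for (int j = 0; j < lj_types; j++) {
      Pair2 p = {0.0f, 0.0f};
      if (i < ntypes && j < ntypes) {
        p.x = (float)a[i][j];                    // e.g. sigma
        p.y = (float)b[i][j];                    // e.g. epsilon
      }
      flat[i*lj_types+j] = p;                    // row-major flatten
    }
}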
template <class numtyp, class acctyp>
void GayBerneT::clear() {
if (!_allocated)
return;
UCL_H_Vec<int> err_flag(1,*(this->ucl_device));
ucl_copy(err_flag,dev_error,false);
if (err_flag[0] == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
err_flag.clear();
_allocated=false;
dev_error.clear();
lj1.clear();
lj3.clear();
sigma_epsilon.clear();
this->cut_form.clear();
shape.clear();
well.clear();
lshape.clear();
gamma_upsilon_mu.clear();
this->clear_base();
}
template <class numtyp, class acctyp>
double GayBerneT::host_memory_usage() const {
return this->host_memory_usage_base()+sizeof(GayBerneT)+
4*sizeof(numtyp);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void GayBerneT::loop(const bool _eflag, const bool _vflag) {
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=0, NGX;
int stride=this->nbor->nbor_pitch();
int ainum=this->ans->inum();
if (this->_multiple_forms) {
this->time_nbor1.start();
if (this->_last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->time_ellipsoid.stop();
if (this->_last_ellipse==this->ans->inum()) {
this->time_nbor2.start();
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->time_ellipsoid2.stop();
this->time_lj.start();
this->time_lj.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
this->time_nbor2.start();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->k_sphere_ellipsoid.set_size(GX,BX);
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(),
&this->well.begin(), &this->gamma_upsilon_mu.begin(),
&this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(),
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->time_ellipsoid2.stop();
} else {
this->ans->dev_ans.zero();
this->ans->dev_engv.zero();
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->time_ellipsoid.stop();
this->time_nbor2.start();
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->time_ellipsoid2.stop();
}
// ------------ LJ ---------------
this->time_lj.start();
if (this->_last_ellipse<this->ans->inum()) {
if (this->_shared_types) {
this->k_lj_fast.set_size(GX,BX);
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride,
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
&eflag, &vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
} else {
this->k_lj.set_size(GX,BX);
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(),
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
}
}
this->time_lj.stop();
} else {
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
this->time_nbor1.start();
this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(), &ainum,
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
&eflag, &vflag, &ainum, &this->_threads_per_atom);
this->time_ellipsoid.stop();
}
}
template class GayBerne<PRECISION,ACC_PRECISION>;


@ -1,429 +0,0 @@
// **************************************************************************
// gayberne.cu
// -------------------
// W. Michael Brown
//
// Device code for Gay-Berne potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef GAYBERNE_CU
#define GAYBERNE_CU
#ifdef NV_KERNEL
#include "ellipsoid_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
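// [sketch] how the mask works: the top two bits (30-31) of a packed
// neighbor index carry the special-bond code and the low 30 bits the atom
// index.  sb = 0 is a plain neighbor; 1..3 select factor_lj = sp_lj[sb]
// in the kernels below.  Hypothetical round trip, for illustration:
//   int packed = (1 << SBBITS) | 42;   // neighbor 42, special code 1
//   sbmask(packed)     == 1
//   packed & NEIGHMASK == 42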
__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_diag_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_diag_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_diag_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
        // -- kappa now holds kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_diag_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
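// ---------------------------------------------------------------------------
// [sketch] the t_per_atom reduction above, written as serial C++ for
// illustration: each of the t_per_atom slices of an atom's neighbor list
// accumulates a partial sum, and halving strides fold them into slice 0,
// which alone stores the answer (t_per_atom is a power of two).
// ---------------------------------------------------------------------------
inline void tree_reduce_sketch(float *partial, const int t_per_atom) {
  for (int s = t_per_atom/2; s > 0; s >>= 1)
    for (int offset = 0; offset < s; offset++)
      partial[offset] += partial[offset+s];      // slice 0 ends with the sum
}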
#endif


@ -1,95 +0,0 @@
/***************************************************************************
gayberne.h
-------------------
W. Michael Brown
Host code for Gay-Berne potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef GAYBERNE_H
#define GAYBERNE_H
#include "base_ellipsoid.h"
#include "mpi.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class GayBerne : public BaseEllipsoid<numtyp, acctyp> {
public:
GayBerne();
~GayBerne();
/// Clear any previous data and set up for a new LAMMPS run
/** \param gpu_nbor true if neighboring performed on device
* \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
    * \return 0 if successful, or a negative code on insufficient memory
    *         or a device initialization problem:
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
int init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Device Error Flag - Set if a bad matrix inversion occurs
UCL_D_Vec<int> dev_error;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
UCL_D_Vec<numtyp2> sigma_epsilon;
// 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ...
UCL_D_Vec<numtyp> gamma_upsilon_mu;
/// If atom type constants fit in shared memory, use fast kernels
bool _shared_types;
int _lj_types;
// --------------------------- ATOM DATA --------------------------
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp4> shape, well;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp> lshape;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif


@ -1,141 +0,0 @@
/***************************************************************************
gayberne_ext.cpp
-------------------
W. Michael Brown
LAMMPS Wrappers for Gay-Berne Acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "gayberne.h"
using namespace std;
using namespace LAMMPS_AL;
static GayBerne<PRECISION,ACC_PRECISION> GBMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
GBMF.clear();
gpu_mode=GBMF.device->gpu_mode();
double gpu_split=GBMF.device->particle_split();
int first_gpu=GBMF.device->first_device();
int last_gpu=GBMF.device->last_device();
int world_me=GBMF.device->world_me();
int gpu_rank=GBMF.device->gpu_rank();
int procs_per_gpu=GBMF.device->procs_per_gpu();
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
bool message=false;
if (GBMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, maxspecial, cell_size, gpu_split,
screen);
GBMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
epsilon, host_lshape, form, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum, nall,
max_nbors, maxspecial, cell_size, gpu_split, screen);
GBMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
GBMF.estimate_gpu_overhead();
return init_ok;
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear() {
GBMF.clear();
}
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat);
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, int **ilist,
int **jnum, const double cpu_time, bool &success,
double **host_quat) {
return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
tag, nspecial, special, eflag, vflag, eatom, vatom,
host_start, ilist, jnum, cpu_time, success, host_quat);
}
int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) {
return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Return memory usage
// ---------------------------------------------------------------------------
double gb_gpu_bytes() {
return GBMF.host_memory_usage();
}
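// ---------------------------------------------------------------------------
// [sketch] the calling sequence the LAMMPS pair style follows, with all
// arguments elided; the wrapper names are the ones defined above,
// everything else here is illustrative pseudocode.
//
//   int gpu_mode;
//   int ok = gb_gpu_init(..., gpu_mode, screen);   // once, during init()
//   // each timestep:
//   //   device neighboring:  firstneigh = gb_gpu_compute_n(...);
//   //   host neighbor lists: ilist      = gb_gpu_compute(...);
//   double bytes = gb_gpu_bytes();                 // memory reporting
//   gb_gpu_clear();                                // at destruction
// ---------------------------------------------------------------------------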


@ -1,594 +0,0 @@
// **************************************************************************
// gayberne_lj.cu
// -------------------
// W. Michael Brown
//
// Device code for Gay-Berne - Lennard-Jones potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef GAYBERNE_LJ_CU
#define GAYBERNE_LJ_CU
#ifdef NV_KERNEL
#include "ellipsoid_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+stride*numj;
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp oner=shape[itype].x;
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_diag_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
          // -- kappa now holds kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_diag_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
      int mtype=itype*lj_types+jtype; // was "ii", which shadowed the atom index
      if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        force*=factor_lj;
        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;
        if (eflag>0) {
          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
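// ---------------------------------------------------------------------------
// [sketch] the algebra in kernel_lj above with the usual LAMMPS-style
// prefactors spelled out (an assumption here, not stated in this file):
//   lj1.x = 48*eps*sigma^12   lj1.y = 24*eps*sigma^6
//   lj3.x =  4*eps*sigma^12   lj3.y =  4*eps*sigma^6   lj3.z = energy shift
// Plain C++ for illustration; "force" is F/r, as in f.x += delx*force.
// ---------------------------------------------------------------------------
inline void lj_pair_sketch(const double eps, const double sigma,
                           const double rsq, double *force, double *energy) {
  const double r2inv = 1.0/rsq;
  const double r6inv = r2inv*r2inv*r2inv;
  const double s6 = sigma*sigma*sigma*sigma*sigma*sigma;
  const double s12 = s6*s6;
  *force = r2inv*r6inv*(48.0*eps*s12*r6inv-24.0*eps*s6);
  *energy = r6inv*(4.0*eps*s12*r6inv-4.0*eps*s6);  // unshifted pair energy
}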
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<4)
sp_lj[tid]=gum[tid+3];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
#endif


@ -1,456 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "gb_gpu_memory.h"
using namespace std;
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF;
#define GBMT GB_GPU_Memory<numtyp,acctyp>
template<class numtyp, class acctyp>
void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high) {
int stride=gbm.nbor->nbor_pitch();
int anall=gbm.atom->nall();
if (gbm.shared_types) {
GBMF.k_gb_nbor_fast.set_size(GX,BX);
GBMF.k_gb_nbor_fast.run(&gbm.atom->dev_x.begin(),
&gbm.cut_form.begin(), &gbm.nbor->dev_nbor.begin(), &stride,
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
&form_high, &anall);
} else {
GBMF.k_gb_nbor.set_size(GX,BX);
GBMF.k_gb_nbor.run(&gbm.atom->dev_x.begin(), &gbm.cut_form.begin(),
&gbm._lj_types, &gbm.nbor->dev_nbor.begin(), &stride,
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
&form_high, &anall);
}
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen) {
GBMF.clear();
gpu_mode=GBMF.device->gpu_mode();
double gpu_split=GBMF.device->particle_split();
int first_gpu=GBMF.device->first_device();
int last_gpu=GBMF.device->last_device();
int world_me=GBMF.device->world_me();
int gpu_rank=GBMF.device->gpu_rank();
int procs_per_gpu=GBMF.device->procs_per_gpu();
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
bool message=false;
if (GBMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split, screen);
GBMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
epsilon, host_lshape, form, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum, nall,
max_nbors, cell_size, gpu_split, screen);
GBMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
GBMF.estimate_gpu_overhead();
return init_ok;
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear() {
GBMF.clear();
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
const int host_inum, const int nall,
double **host_x, double **host_quat,
int *host_type, double *sublo,
double *subhi, bool &success) {
gbm.nbor_time_avail=true;
success=true;
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,host_inum,gbm.nbor->max_nbors(),0,success);
if (!success)
return;
gbm.atom->cast_copy_x(host_x,host_type);
int mn;
gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
sublo, subhi, NULL, NULL, NULL, success, mn);
gbm.nbor->copy_unpacked(inum,mn);
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host and (if spheres) reorder so ellipses first
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,
const int inum, const int osize,
int *ilist, int *numj,
int *type, int **firstneigh,
bool &success) {
success=true;
gbm.nbor_time_avail=true;
int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist);
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,0,mn,osize,success);
if (!success)
return;
if (gbm.multiple_forms) {
int p=0;
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
p++;
}
}
gbm.max_last_ellipse=p;
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
p++;
}
}
gbm.nbor->get_host(inum,gbm.host_olist.begin(),numj,firstneigh,
gbm.block_size());
gbm.nbor->copy_unpacked(inum,mn);
return;
}
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
gbm.nbor->get_host(inum,ilist,numj,firstneigh,gbm.block_size());
gbm.nbor->copy_unpacked(inum,mn);
}
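// ---------------------------------------------------------------------------
// [sketch] the reordering above in isolation: a stable two-pass partition
// that lists ellipsoids before spheres, so a single kernel launch can cover
// the contiguous range [0, last_ellipse).  "is_ellipse" stands in for the
// host_form[itype][itype]==ELLIPSE_ELLIPSE test used above.
// ---------------------------------------------------------------------------
inline int partition_ellipses(int *olist, const int *ilist, const int osize,
                              const int *type, const bool *is_ellipse) {
  int p=0;
  for (int i=0; i<osize; i++)          // pass 1: ellipsoids first
    if (is_ellipse[type[ilist[i]]])
      olist[p++]=ilist[i];
  const int last_ellipse=p;
  for (int i=0; i<osize; i++)          // pass 2: everything else
    if (!is_ellipse[type[ilist[i]]])
      olist[p++]=ilist[i];
  return last_ellipse;
}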
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=gbm.block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/
(BX/gbm._threads_per_atom)));
int stride=gbm.nbor->nbor_pitch();
int ainum=gbm.ans->inum();
int anall=gbm.atom->nall();
if (gbm.multiple_forms) {
gbm.time_kernel.start();
if (gbm.last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
(BX/gbm._threads_per_atom)));
gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
GBMF.k_gayberne.set_size(GX,BX);
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(),
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall,
&gbm._threads_per_atom);
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.ans->inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
gbm.time_pair.start();
gbm.time_pair.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
gbm.last_ellipse)/
(BX/gbm._threads_per_atom)));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
GBMF.k_sphere_gb.set_size(GX,BX);
GBMF.k_sphere_gb.run(&gbm.atom->dev_x.begin(),&gbm.atom->dev_quat.begin(),
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(),
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
gbm.time_gayberne2.stop();
} else {
gbm.ans->dev_ans.zero();
gbm.ans->dev_engv.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
}
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.ans->inum()) {
if (gbm.shared_types) {
GBMF.k_lj_fast.set_size(GX,BX);
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
&stride, &gbm.nbor->dev_packed.begin(),
&gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
} else {
GBMF.k_lj.set_size(GX,BX);
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm._lj_types,
&gbm.gamma_upsilon_mu.begin(), &stride,
&gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
}
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
GBMF.k_gayberne.set_size(GX,BX);
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(), &gbm.atom->dev_quat.begin(),
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.ans->dev_ans.begin(), &ainum,
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom);
gbm.time_gayberne.stop();
}
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, torques, energies
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
double *sublo, double *subhi, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
host_start=0;
gbm.zero_timers();
return NULL;
}
gbm.hd_balancer.balance(cpu_time);
int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
host_quat, host_type, sublo, subhi, success);
if (!success)
return NULL;
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
} else {
gbm.atom->cast_x_data(host_x,host_type);
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
}
gbm.atom->add_quat_data();
*ilist=gbm.nbor->host_ilist.begin();
*jnum=gbm.nbor->host_acc.begin();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return gbm.nbor->host_jlist.begin()-host_start;
}
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double **host_quat) {
return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo,
subhi, eflag, vflag, eatom, vatom, host_start, ilist,
jnum, cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host if necessary, then calculate forces, torques, and energies
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
const int nall,double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
host_start=0;
gbm.zero_timers();
return NULL;
}
int ago=gbm.hd_balancer.ago_first(f_ago);
int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
if (ago==0) {
_gb_gpu_reset_nbors(gbm, nall, inum, inum_full, ilist, numj, host_type,
firstneigh, success);
if (!success)
return NULL;
}
int *list;
if (gbm.multiple_forms)
list=gbm.host_olist.begin();
else
list=ilist;
gbm.atom->cast_x_data(host_x,host_type);
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
gbm.atom->add_quat_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return list;
}
int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) {
return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x,
host_type, ilist, numj, firstneigh, eflag, vflag,
eatom, vatom, host_start, cpu_time, success,
host_quat);
}
// ---------------------------------------------------------------------------
// Return host memory usage in bytes
// ---------------------------------------------------------------------------
double gb_gpu_bytes() {
return GBMF.host_memory_usage();
}
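/* ----------------------------------------------------------------------
   Illustrative host-side call sequence (sketch only; arguments are
   hypothetical and error handling is omitted):
     // once per run, after coefficients are set (init wrapper above)
     // each step, with CPU-built neighbor lists:
     //   int *list=gb_gpu_compute(ago,inum,nall,x,type,ilist,numj,
     //                            firstneigh,eflag,vflag,eatom,vatom,
     //                            host_start,cpu_time,success,quat);
     // or, with device neighbor builds, gb_gpu_compute_n(...);
     // at teardown: gb_gpu_clear();
------------------------------------------------------------------------- */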

View File

@ -1,315 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
/* ----------------------------------------------------------------------
dot product of 2 vectors
------------------------------------------------------------------------- */
__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}
/* ----------------------------------------------------------------------
cross product of 2 vectors
------------------------------------------------------------------------- */
__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
{
ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
}
/* ----------------------------------------------------------------------
determinant of a matrix
------------------------------------------------------------------------- */
__inline numtyp gpu_det3(const numtyp m[9])
{
numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
return ans;
}
/* ----------------------------------------------------------------------
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
__inline void gpu_times3(const numtyp4 shape, const numtyp m[9],
numtyp ans[9])
{
ans[0] = shape.x*m[0];
ans[1] = shape.x*m[1];
ans[2] = shape.x*m[2];
ans[3] = shape.y*m[3];
ans[4] = shape.y*m[4];
ans[5] = shape.y*m[5];
ans[6] = shape.z*m[6];
ans[7] = shape.z*m[7];
ans[8] = shape.z*m[8];
}
/* ----------------------------------------------------------------------
add two matrices
------------------------------------------------------------------------- */
__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
{
ans[0] = m[0]+m2[0];
ans[1] = m[1]+m2[1];
ans[2] = m[2]+m2[2];
ans[3] = m[3]+m2[3];
ans[4] = m[4]+m2[4];
ans[5] = m[5]+m2[5];
ans[6] = m[6]+m2[6];
ans[7] = m[7]+m2[7];
ans[8] = m[8]+m2[8];
}
/* ----------------------------------------------------------------------
multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */
__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
}
/* ----------------------------------------------------------------------
row vector times matrix
------------------------------------------------------------------------- */
__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
{
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
}
/* ----------------------------------------------------------------------
solve Ax = b or M ans = v
use gaussian elimination & partial pivoting on matrix
error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */
__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
__global int *error_flag)
{
// create augmented matrix for pivoting
numtyp aug[12], t;
aug[3] = v[0];
aug[0] = m[0];
aug[1] = m[1];
aug[2] = m[2];
aug[7] = v[1];
aug[4] = m[3];
aug[5] = m[4];
aug[6] = m[5];
aug[11] = v[2];
aug[8] = m[6];
aug[9] = m[7];
aug[10] = m[8];
if (fabs(aug[4]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
}
if (fabs(aug[8]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
}
  // if the leading pivot is still zero, promote the first row with a
  // nonzero leading element; if none exists the matrix is singular
  if (aug[0] == (numtyp)0.0) {
    if (aug[4] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
      swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
      swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
      swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
    } else if (aug[8] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
      swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
      swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
      swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
    } else
      *error_flag=2;
  }
t = aug[4]/aug[0];
aug[5]-=t*aug[1];
aug[6]-=t*aug[2];
aug[7]-=t*aug[3];
t = aug[8]/aug[0];
aug[9]-=t*aug[1];
aug[10]-=t*aug[2];
aug[11]-=t*aug[3];
if (fabs(aug[9]) > fabs(aug[5])) {
numtyp swapt;
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
}
  // promote the second pivot if it is zero
  if (aug[5] == (numtyp)0.0 && aug[9] != (numtyp)0.0) {
    numtyp swapt;
    swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
    swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
    swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
    swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
  }
t = aug[9]/aug[5];
aug[10]-=t*aug[6];
aug[11]-=t*aug[7];
if (aug[10] == (numtyp)0.0)
*error_flag=2;
ans[2] = aug[11]/aug[10];
t = (numtyp)0.0;
t += aug[6]*ans[2];
ans[1] = (aug[7]-t) / aug[5];
t = (numtyp)0.0;
t += aug[1]*ans[1];
t += aug[2]*ans[2];
ans[0] = (aug[3]-t) / aug[0];
}
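/* ----------------------------------------------------------------------
   illustrative usage sketch (not part of the original source; numtyp is
   assumed to be float and the __global qualifier is ignored here):
     numtyp A[9] = {4,1,0, 1,3,1, 0,1,2};   // row-major, nonsingular
     numtyp b[3] = {1,2,3};
     numtyp x[3];
     int err = 0;
     gpu_mldivide3(A,b,x,&err);             // err stays 0 and A*x == b
   a singular A leaves err set to 2 and x unusable
------------------------------------------------------------------------- */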
/* ----------------------------------------------------------------------
compute rotation matrix from quaternion conjugate
quat = [w i j k]
------------------------------------------------------------------------- */
__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
numtyp4 q=qif[qi];
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;
numtyp j2 = q.z*q.z;
numtyp k2 = q.w*q.w;
numtyp twoij = (numtyp)2.0*q.y*q.z;
numtyp twoik = (numtyp)2.0*q.y*q.w;
numtyp twojk = (numtyp)2.0*q.z*q.w;
numtyp twoiw = (numtyp)2.0*q.y*q.x;
numtyp twojw = (numtyp)2.0*q.z*q.x;
numtyp twokw = (numtyp)2.0*q.w*q.x;
mat[0] = w2+i2-j2-k2;
mat[3] = twoij-twokw;
mat[6] = twojw+twoik;
mat[1] = twoij+twokw;
mat[4] = w2-i2+j2-k2;
mat[7] = twojk-twoiw;
mat[2] = twoik-twojw;
mat[5] = twojk+twoiw;
mat[8] = w2-i2-j2+k2;
}
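/* ----------------------------------------------------------------------
   note (illustrative, not part of the original source): for a unit
   quaternion q = [w i j k] the usual body-to-lab rotation matrix is
     R = [ w2+i2-j2-k2  2(ij-kw)     2(ik+jw)
           2(ij+kw)     w2-i2+j2-k2  2(jk-iw)
           2(ik-jw)     2(jk+iw)     w2-i2-j2+k2 ]
   the routine above stores the transpose R^T = R^-1, i.e. the lab-to-body
   rotation, which is why it is named quat_to_mat_trans
------------------------------------------------------------------------- */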
#endif

View File

@ -1,431 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL
#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
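// Packed-neighbor convention (explanatory note, following the standard
// LAMMPS encoding): the top two bits of j flag special-bonded neighbors
// (0 = plain pair, 1/2/3 = 1-2/1-3/1-4 partner) and select the sp_lj
// scale factor; the low 30 bits hold the atom index.  For example,
// j = (2<<SBBITS)|17 gives sbmask(j)==2 and (j & NEIGHMASK)==17.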
__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
          // -- kappa now holds kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
            // -- restore kappa to its full value
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
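            // gum[0], gum[1], gum[2] hold gamma, upsilon and mu, packed
            // into gamma_upsilon_mu at initialization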
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
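        // i.e. eta = [2*lshape_i*lshape_j / det(G12)]^upsilon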
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
        // -- restore iota to its full value
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
} // if ii
// Reduce answers
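  // tree reduction over the t_per_atom threads sharing one atom: stride s
  // halves each step (e.g. t_per_atom=4: lanes 0..3 fold to 0..1, then 0);
  // no barrier is used, which assumes the sub-group runs in lockstep
  // (t_per_atom at most a warp/wavefront wide)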
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
#endif

View File

@ -1,594 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL_LJ
#define GB_GPU_KERNEL_LJ
#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+stride*numj;
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp oner=shape[itype].x;
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
      { // Compute U_r, dUr, and eta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
          // -- kappa now holds kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
            // -- restore kappa to its full value
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
        // -- restore iota to its full value
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
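        // assuming the usual LAMMPS packing (lj1.x = 48*eps*sigma^12,
        // lj1.y = 24*eps*sigma^6), the force computed below is -(dU/dr)/r
        // for the 12-6 potential, so scaling by delx/dely/delz yields F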
        int mtype=itype*lj_types+jtype;
        if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
          r2inv=(numtyp)1.0/r2inv;
          numtyp r6inv = r2inv*r2inv*r2inv;
          numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
          force*=factor_lj;
          f.x+=delx*force;
          f.y+=dely*force;
          f.z+=delz*force;
          if (eflag>0) {
            numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
            energy+=factor_lj*(e-lj3[mtype].z);
          }
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<4)
sp_lj[tid]=gum[tid+3];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
#endif

View File

@ -1,169 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_KERNEL_H
#define PAIR_GPU_KERNEL_H
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#else
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#endif
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
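// Layout note (illustrative): dev_nbor is a column-per-atom matrix with
// row pitch nbor_pitch; row 0 holds the atom index, row 1 the packed
// neighbor count, and rows 2..newj+1 the packed neighbor indices, so
// adjacent threads touch adjacent words in each row (coalesced access).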
__kernel void kernel_gb_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
const int ntypes, __global int *dev_nbor,
const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high, const int nall) {
  // ii indexes the atom whose neighbor list this thread unpacks
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j>=nall)
j%=nall;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
if (cf.y>=form_low && cf.y<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cf.x) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_gb_nbor_fast(__global numtyp4 *x_,
__global numtyp2 *cut_form,
__global int *dev_nbor,
const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high, const int nall) {
int ii=THREAD_ID_X;
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[ii]=cut_form[ii].x;
form[ii]=cut_form[ii].y;
}
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j>=nall)
j%=nall;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
#endif

View File

@ -1,361 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "gb_gpu_cl.h"
#include "gb_gpu_nbor_cl.h"
#else
#include "gb_gpu_ptx.h"
#endif
#include "gb_gpu_memory.h"
#include <cassert>
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
_max_bytes(0.0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor;
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *_screen) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
_gpu_host,max_nbors,cell_size,true);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=device->max_shared_types();
if (lj_types<=max_shared_types && _block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*ucl_device,
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sigma_epsilon.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
cut_form.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,cut_form,host_write,
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
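  // type_pack2/type_pack4 flatten the ntypes x ntypes host tables into
  // lj_types x lj_types device arrays indexed as itype*lj_types+jtype,
  // matching the mtype lookups in the kernels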
dev_error.alloc(1,*ucl_device);
dev_error.zero();
_allocated=true;
host_form=h_form;
// Initialize timers for the selected GPU
time_kernel.init(*ucl_device);
time_gayberne.init(*ucl_device);
time_kernel2.init(*ucl_device);
time_gayberne2.init(*ucl_device);
time_kernel.zero();
time_gayberne.zero();
time_kernel2.zero();
time_gayberne2.zero();
// Allocate, cast and asynchronous memcpy of constant data
// Copy data for bonded interactions
gamma_upsilon_mu.alloc(7,*ucl_device,UCL_READ_ONLY);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
host_write[3]=static_cast<numtyp>(host_special_lj[0]);
host_write[4]=static_cast<numtyp>(host_special_lj[1]);
host_write[5]=static_cast<numtyp>(host_special_lj[2]);
host_write[6]=static_cast<numtyp>(host_special_lj[3]);
ucl_copy(gamma_upsilon_mu,host_write,7,false);
lshape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
UCL_H_Vec<double> d_view;
d_view.view(host_lshape,lshape.numel(),*ucl_device);
ucl_copy(lshape,d_view,false);
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
// - cast if necessary
shape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_shape[i][0];
host_write[i*4+1]=host_shape[i][1];
host_write[i*4+2]=host_shape[i][2];
}
UCL_H_Vec<numtyp4> view4;
view4.view((numtyp4*)host_write.begin(),shape.numel(),*ucl_device);
ucl_copy(shape,view4,false);
well.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_well[i][0];
host_write[i*4+1]=host_well[i][1];
host_write[i*4+2]=host_well[i][2];
}
view4.view((numtyp4*)host_write.begin(),well.numel(),*ucl_device);
ucl_copy(well,view4,false);
  // Determine whether any type pair is not ELLIPSE_ELLIPSE, in which case
  // the separate GB-sphere and sphere-sphere kernels are needed
multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (host_form[i][j]!=ELLIPSE_ELLIPSE)
multiple_forms=true;
if (multiple_forms && host_nlocal>0) {
std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n";
exit(1);
}
if (multiple_forms)
ans->dev_ans.zero();
_max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
return 0;
else return -3;
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
if (!_allocated)
return;
UCL_H_Vec<int> err_flag(1,*ucl_device);
ucl_copy(err_flag,dev_error,false);
if (err_flag[0] == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
err_flag.clear();
_allocated=false;
// Output any timing information
acc_timers();
double single[9], times[9];
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
nbor->time_kernel.total_seconds();
single[3]=time_gayberne.total_seconds()+time_gayberne2.total_seconds();
if (multiple_forms)
single[4]=time_pair.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time()+ans->cast_time();
single[6]=_gpu_overhead;
single[7]=_driver_overhead;
single[8]=ans->cpu_idle_time();
MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
double max_mb=mpi_max_bytes/(1024*1024);
if (device->replica_me()==0)
if (screen && times[3]>0.0) {
int replica_size=device->replica_size();
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
if (nbor->gpu_nbor())
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
}
_max_bytes=0.0;
dev_error.clear();
lj1.clear();
lj3.clear();
sigma_epsilon.clear();
cut_form.clear();
shape.clear();
well.clear();
lshape.clear();
gamma_upsilon_mu.clear();
host_olist.clear();
time_kernel.clear();
time_gayberne.clear();
time_kernel2.clear();
time_gayberne2.clear();
time_pair.clear();
hd_balancer.clear();
if (_compiled) {
k_gb_nbor_fast.clear();
k_gb_nbor.clear();
k_gayberne.clear();
k_sphere_gb.clear();
k_lj_fast.clear();
k_lj.clear();
delete pair_program;
delete gb_program;
delete gb_lj_program;
_compiled=false;
}
nbor->clear();
ans->clear();
device->clear();
}
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(GB_GPU_Memory<numtyp,acctyp>)+
nbor->max_atoms()*sizeof(int);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::compile_kernels(UCL_Device &dev) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(gb_gpu_kernel_nbor,flags.c_str());
k_gb_nbor_fast.set_function(*pair_program,"kernel_gb_nbor_fast");
k_gb_nbor.set_function(*pair_program,"kernel_gb_nbor");
gb_program=new UCL_Program(dev);
gb_program->load_string(gb_gpu_kernel,flags.c_str());
k_gayberne.set_function(*gb_program,"kernel_gayberne");
gb_lj_program=new UCL_Program(dev);
gb_lj_program->load_string(gb_gpu_kernel_lj,flags.c_str());
k_sphere_gb.set_function(*gb_lj_program,"kernel_sphere_gb");
k_lj_fast.set_function(*gb_lj_program,"kernel_lj_fast");
k_lj.set_function(*gb_lj_program,"kernel_lj");
_compiled=true;
}
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,214 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
template <class numtyp, class acctyp>
class GB_GPU_Memory {
public:
GB_GPU_Memory();
~GB_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param gpu_nbor true if neighboring performed on device
* \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
   * \return 0 on success or an error code on failure
   *
   * Returns:
   * - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *screen);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
atom->resize(nall, success);
ans->resize(inum, success);
if (multiple_forms) ans->dev_ans.zero();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
   * \param max_nbors current maximum number of neighbors
* \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring
   * \note if GPU is neighboring, nlocal+host_inum = total number of local particles
* \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/
inline void resize_local(const int nlocal, const int host_inum,
const int max_nbors, const int olist_size,
bool &success) {
if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_kernel.add_to_total();
time_gayberne.add_to_total();
if (multiple_forms) {
time_kernel2.add_to_total();
time_gayberne2.add_to_total();
time_pair.add_to_total();
}
atom->acc_timers();
ans->acc_timers();
}
}
  /// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_kernel.zero();
time_gayberne.zero();
if (multiple_forms) {
time_kernel2.zero();
time_gayberne2.zero();
time_pair.zero();
}
atom->zero_timers();
ans->zero_timers();
}
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Error Flag - Set if a bad matrix inversion occurs
UCL_D_Vec<int> dev_error;
/// Device timers
UCL_Timer time_kernel, time_gayberne, time_kernel2, time_gayberne2, time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
UCL_D_Vec<numtyp2> sigma_epsilon;
/// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec<numtyp2> cut_form;
  /// 0 = gamma, 1 = upsilon, 2 = mu, 3 = special_lj[0], 4 = special_lj[1], ...
  UCL_D_Vec<numtyp> gamma_upsilon_mu;
  /// True if we want to use fast GB-sphere or sphere-sphere calculations
  bool multiple_forms;
int **host_form;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
int _lj_types;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp4> shape, well;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp> lshape;
int last_ellipse, max_last_ellipse;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// ilist with particles sorted by type
UCL_H_Vec<int> host_olist;
/// True if we should accumulate the neighbor timer
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program, *gb_program, *gb_lj_program;
UCL_Kernel k_gb_nbor_fast, k_gb_nbor;
UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
inline int block_size() { return _block_size; }
int _threads_per_atom;
private:
bool _allocated, _compiled;
int _block_size;
double _max_bytes;
double _gpu_overhead, _driver_overhead;
void compile_kernels(UCL_Device &dev);
};
#endif

View File

@@ -1,997 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-14 12:06:24 +0530 (Mon, 14 Jun 2010) $ */
#ifndef __OPENCL_CL_H
#define __OPENCL_CL_H
#ifdef __APPLE__
#include <OpenCL/cl_platform.h>
#else
#include <CL/cl_platform.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
typedef struct _cl_platform_id * cl_platform_id;
typedef struct _cl_device_id * cl_device_id;
typedef struct _cl_context * cl_context;
typedef struct _cl_command_queue * cl_command_queue;
typedef struct _cl_mem * cl_mem;
typedef struct _cl_program * cl_program;
typedef struct _cl_kernel * cl_kernel;
typedef struct _cl_event * cl_event;
typedef struct _cl_sampler * cl_sampler;
typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
typedef cl_ulong cl_bitfield;
typedef cl_bitfield cl_device_type;
typedef cl_uint cl_platform_info;
typedef cl_uint cl_device_info;
typedef cl_bitfield cl_device_fp_config;
typedef cl_uint cl_device_mem_cache_type;
typedef cl_uint cl_device_local_mem_type;
typedef cl_bitfield cl_device_exec_capabilities;
typedef cl_bitfield cl_command_queue_properties;
typedef intptr_t cl_context_properties;
typedef cl_uint cl_context_info;
typedef cl_uint cl_command_queue_info;
typedef cl_uint cl_channel_order;
typedef cl_uint cl_channel_type;
typedef cl_bitfield cl_mem_flags;
typedef cl_uint cl_mem_object_type;
typedef cl_uint cl_mem_info;
typedef cl_uint cl_image_info;
typedef cl_uint cl_buffer_create_type;
typedef cl_uint cl_addressing_mode;
typedef cl_uint cl_filter_mode;
typedef cl_uint cl_sampler_info;
typedef cl_bitfield cl_map_flags;
typedef cl_uint cl_program_info;
typedef cl_uint cl_program_build_info;
typedef cl_int cl_build_status;
typedef cl_uint cl_kernel_info;
typedef cl_uint cl_kernel_work_group_info;
typedef cl_uint cl_event_info;
typedef cl_uint cl_command_type;
typedef cl_uint cl_profiling_info;
typedef struct _cl_image_format {
cl_channel_order image_channel_order;
cl_channel_type image_channel_data_type;
} cl_image_format;
typedef struct _cl_buffer_region {
size_t origin;
size_t size;
} cl_buffer_region;
/******************************************************************************/
/* Error Codes */
#define CL_SUCCESS 0
#define CL_DEVICE_NOT_FOUND -1
#define CL_DEVICE_NOT_AVAILABLE -2
#define CL_COMPILER_NOT_AVAILABLE -3
#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
#define CL_OUT_OF_RESOURCES -5
#define CL_OUT_OF_HOST_MEMORY -6
#define CL_PROFILING_INFO_NOT_AVAILABLE -7
#define CL_MEM_COPY_OVERLAP -8
#define CL_IMAGE_FORMAT_MISMATCH -9
#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
#define CL_BUILD_PROGRAM_FAILURE -11
#define CL_MAP_FAILURE -12
#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
#define CL_INVALID_VALUE -30
#define CL_INVALID_DEVICE_TYPE -31
#define CL_INVALID_PLATFORM -32
#define CL_INVALID_DEVICE -33
#define CL_INVALID_CONTEXT -34
#define CL_INVALID_QUEUE_PROPERTIES -35
#define CL_INVALID_COMMAND_QUEUE -36
#define CL_INVALID_HOST_PTR -37
#define CL_INVALID_MEM_OBJECT -38
#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
#define CL_INVALID_IMAGE_SIZE -40
#define CL_INVALID_SAMPLER -41
#define CL_INVALID_BINARY -42
#define CL_INVALID_BUILD_OPTIONS -43
#define CL_INVALID_PROGRAM -44
#define CL_INVALID_PROGRAM_EXECUTABLE -45
#define CL_INVALID_KERNEL_NAME -46
#define CL_INVALID_KERNEL_DEFINITION -47
#define CL_INVALID_KERNEL -48
#define CL_INVALID_ARG_INDEX -49
#define CL_INVALID_ARG_VALUE -50
#define CL_INVALID_ARG_SIZE -51
#define CL_INVALID_KERNEL_ARGS -52
#define CL_INVALID_WORK_DIMENSION -53
#define CL_INVALID_WORK_GROUP_SIZE -54
#define CL_INVALID_WORK_ITEM_SIZE -55
#define CL_INVALID_GLOBAL_OFFSET -56
#define CL_INVALID_EVENT_WAIT_LIST -57
#define CL_INVALID_EVENT -58
#define CL_INVALID_OPERATION -59
#define CL_INVALID_GL_OBJECT -60
#define CL_INVALID_BUFFER_SIZE -61
#define CL_INVALID_MIP_LEVEL -62
#define CL_INVALID_GLOBAL_WORK_SIZE -63
/* OpenCL Version */
#define CL_VERSION_1_0 1
#define CL_VERSION_1_1 1
/* cl_bool */
#define CL_FALSE 0
#define CL_TRUE 1
/* cl_platform_info */
#define CL_PLATFORM_PROFILE 0x0900
#define CL_PLATFORM_VERSION 0x0901
#define CL_PLATFORM_NAME 0x0902
#define CL_PLATFORM_VENDOR 0x0903
#define CL_PLATFORM_EXTENSIONS 0x0904
/* cl_device_type - bitfield */
#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CL_DEVICE_TYPE_CPU (1 << 1)
#define CL_DEVICE_TYPE_GPU (1 << 2)
#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
/* cl_device_info */
#define CL_DEVICE_TYPE 0x1000
#define CL_DEVICE_VENDOR_ID 0x1001
#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
#define CL_DEVICE_ADDRESS_BITS 0x100D
#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
#define CL_DEVICE_IMAGE_SUPPORT 0x1016
#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
#define CL_DEVICE_MAX_SAMPLERS 0x1018
#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
#define CL_DEVICE_ENDIAN_LITTLE 0x1026
#define CL_DEVICE_AVAILABLE 0x1027
#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
#define CL_DEVICE_NAME 0x102B
#define CL_DEVICE_VENDOR 0x102C
#define CL_DRIVER_VERSION 0x102D
#define CL_DEVICE_PROFILE 0x102E
#define CL_DEVICE_VERSION 0x102F
#define CL_DEVICE_EXTENSIONS 0x1030
#define CL_DEVICE_PLATFORM 0x1031
/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */
/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
/* cl_device_fp_config - bitfield */
#define CL_FP_DENORM (1 << 0)
#define CL_FP_INF_NAN (1 << 1)
#define CL_FP_ROUND_TO_NEAREST (1 << 2)
#define CL_FP_ROUND_TO_ZERO (1 << 3)
#define CL_FP_ROUND_TO_INF (1 << 4)
#define CL_FP_FMA (1 << 5)
#define CL_FP_SOFT_FLOAT (1 << 6)
/* cl_device_mem_cache_type */
#define CL_NONE 0x0
#define CL_READ_ONLY_CACHE 0x1
#define CL_READ_WRITE_CACHE 0x2
/* cl_device_local_mem_type */
#define CL_LOCAL 0x1
#define CL_GLOBAL 0x2
/* cl_device_exec_capabilities - bitfield */
#define CL_EXEC_KERNEL (1 << 0)
#define CL_EXEC_NATIVE_KERNEL (1 << 1)
/* cl_command_queue_properties - bitfield */
#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
/* cl_context_info */
#define CL_CONTEXT_REFERENCE_COUNT 0x1080
#define CL_CONTEXT_DEVICES 0x1081
#define CL_CONTEXT_PROPERTIES 0x1082
#define CL_CONTEXT_NUM_DEVICES 0x1083
/* cl_context_info + cl_context_properties */
#define CL_CONTEXT_PLATFORM 0x1084
/* cl_command_queue_info */
#define CL_QUEUE_CONTEXT 0x1090
#define CL_QUEUE_DEVICE 0x1091
#define CL_QUEUE_REFERENCE_COUNT 0x1092
#define CL_QUEUE_PROPERTIES 0x1093
/* cl_mem_flags - bitfield */
#define CL_MEM_READ_WRITE (1 << 0)
#define CL_MEM_WRITE_ONLY (1 << 1)
#define CL_MEM_READ_ONLY (1 << 2)
#define CL_MEM_USE_HOST_PTR (1 << 3)
#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
#define CL_MEM_COPY_HOST_PTR (1 << 5)
/* cl_channel_order */
#define CL_R 0x10B0
#define CL_A 0x10B1
#define CL_RG 0x10B2
#define CL_RA 0x10B3
#define CL_RGB 0x10B4
#define CL_RGBA 0x10B5
#define CL_BGRA 0x10B6
#define CL_ARGB 0x10B7
#define CL_INTENSITY 0x10B8
#define CL_LUMINANCE 0x10B9
#define CL_Rx 0x10BA
#define CL_RGx 0x10BB
#define CL_RGBx 0x10BC
/* cl_channel_type */
#define CL_SNORM_INT8 0x10D0
#define CL_SNORM_INT16 0x10D1
#define CL_UNORM_INT8 0x10D2
#define CL_UNORM_INT16 0x10D3
#define CL_UNORM_SHORT_565 0x10D4
#define CL_UNORM_SHORT_555 0x10D5
#define CL_UNORM_INT_101010 0x10D6
#define CL_SIGNED_INT8 0x10D7
#define CL_SIGNED_INT16 0x10D8
#define CL_SIGNED_INT32 0x10D9
#define CL_UNSIGNED_INT8 0x10DA
#define CL_UNSIGNED_INT16 0x10DB
#define CL_UNSIGNED_INT32 0x10DC
#define CL_HALF_FLOAT 0x10DD
#define CL_FLOAT 0x10DE
/* cl_mem_object_type */
#define CL_MEM_OBJECT_BUFFER 0x10F0
#define CL_MEM_OBJECT_IMAGE2D 0x10F1
#define CL_MEM_OBJECT_IMAGE3D 0x10F2
/* cl_mem_info */
#define CL_MEM_TYPE 0x1100
#define CL_MEM_FLAGS 0x1101
#define CL_MEM_SIZE 0x1102
#define CL_MEM_HOST_PTR 0x1103
#define CL_MEM_MAP_COUNT 0x1104
#define CL_MEM_REFERENCE_COUNT 0x1105
#define CL_MEM_CONTEXT 0x1106
#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
#define CL_MEM_OFFSET 0x1108
/* cl_image_info */
#define CL_IMAGE_FORMAT 0x1110
#define CL_IMAGE_ELEMENT_SIZE 0x1111
#define CL_IMAGE_ROW_PITCH 0x1112
#define CL_IMAGE_SLICE_PITCH 0x1113
#define CL_IMAGE_WIDTH 0x1114
#define CL_IMAGE_HEIGHT 0x1115
#define CL_IMAGE_DEPTH 0x1116
/* cl_addressing_mode */
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
/* cl_filter_mode */
#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141
/* cl_sampler_info */
#define CL_SAMPLER_REFERENCE_COUNT 0x1150
#define CL_SAMPLER_CONTEXT 0x1151
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154
/* cl_map_flags - bitfield */
#define CL_MAP_READ (1 << 0)
#define CL_MAP_WRITE (1 << 1)
/* cl_program_info */
#define CL_PROGRAM_REFERENCE_COUNT 0x1160
#define CL_PROGRAM_CONTEXT 0x1161
#define CL_PROGRAM_NUM_DEVICES 0x1162
#define CL_PROGRAM_DEVICES 0x1163
#define CL_PROGRAM_SOURCE 0x1164
#define CL_PROGRAM_BINARY_SIZES 0x1165
#define CL_PROGRAM_BINARIES 0x1166
/* cl_program_build_info */
#define CL_PROGRAM_BUILD_STATUS 0x1181
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
/* cl_build_status */
#define CL_BUILD_SUCCESS 0
#define CL_BUILD_NONE -1
#define CL_BUILD_ERROR -2
#define CL_BUILD_IN_PROGRESS -3
/* cl_kernel_info */
#define CL_KERNEL_FUNCTION_NAME 0x1190
#define CL_KERNEL_NUM_ARGS 0x1191
#define CL_KERNEL_REFERENCE_COUNT 0x1192
#define CL_KERNEL_CONTEXT 0x1193
#define CL_KERNEL_PROGRAM 0x1194
/* cl_kernel_work_group_info */
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
/* cl_event_info */
#define CL_EVENT_COMMAND_QUEUE 0x11D0
#define CL_EVENT_COMMAND_TYPE 0x11D1
#define CL_EVENT_REFERENCE_COUNT 0x11D2
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
#define CL_EVENT_CONTEXT 0x11D4
/* cl_command_type */
#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
#define CL_COMMAND_TASK 0x11F1
#define CL_COMMAND_NATIVE_KERNEL 0x11F2
#define CL_COMMAND_READ_BUFFER 0x11F3
#define CL_COMMAND_WRITE_BUFFER 0x11F4
#define CL_COMMAND_COPY_BUFFER 0x11F5
#define CL_COMMAND_READ_IMAGE 0x11F6
#define CL_COMMAND_WRITE_IMAGE 0x11F7
#define CL_COMMAND_COPY_IMAGE 0x11F8
#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
#define CL_COMMAND_MAP_BUFFER 0x11FB
#define CL_COMMAND_MAP_IMAGE 0x11FC
#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
#define CL_COMMAND_MARKER 0x11FE
#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
#define CL_COMMAND_READ_BUFFER_RECT 0x1201
#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
#define CL_COMMAND_USER 0x1204
/* command execution status */
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
/* cl_buffer_create_type */
#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
/* cl_profiling_info */
#define CL_PROFILING_COMMAND_QUEUED 0x1280
#define CL_PROFILING_COMMAND_SUBMIT 0x1281
#define CL_PROFILING_COMMAND_START 0x1282
#define CL_PROFILING_COMMAND_END 0x1283
/********************************************************************************************************/
/* Platform API */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformIDs(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformInfo(cl_platform_id /* platform */,
cl_platform_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
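/* Illustrative sketch (not part of the original header): the two-call idiom
 * used by most query functions in this API -- pass NULL storage first to
 * learn the count, then call again with storage of that size:
 *
 *   cl_uint n;
 *   clGetPlatformIDs(0, NULL, &n);
 *   cl_platform_id *plats = (cl_platform_id *) malloc(n * sizeof(*plats));
 *   clGetPlatformIDs(n, plats, NULL);
 */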
/* Device APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDs(cl_platform_id /* platform */,
cl_device_type /* device_type */,
cl_uint /* num_entries */,
cl_device_id * /* devices */,
cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceInfo(cl_device_id /* device */,
cl_device_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Context APIs */
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContext(const cl_context_properties * /* properties */,
cl_uint /* num_devices */,
const cl_device_id * /* devices */,
void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
void * /* user_data */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContextFromType(const cl_context_properties * /* properties */,
cl_device_type /* device_type */,
void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *),
void * /* user_data */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetContextInfo(cl_context /* context */,
cl_context_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Command Queue APIs */
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueue(cl_context /* context */,
cl_device_id /* device */,
cl_command_queue_properties /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetCommandQueueInfo(cl_command_queue /* command_queue */,
cl_command_queue_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
#warning CL_USE_DEPRECATED_OPENCL_1_0_APIS is defined. These APIs are unsupported and untested in OpenCL 1.1!
/*
* WARNING:
* This API introduces mutable state into the OpenCL implementation. It has been REMOVED
* to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the
* OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.
* It is likely to be non-performant. Use of this API is not advised. Use at your own risk.
*
* Software developers previously relying on this API are instructed to set the command queue
* properties when creating the queue, instead.
*/
extern CL_API_ENTRY cl_int CL_API_CALL
clSetCommandQueueProperty(cl_command_queue /* command_queue */,
cl_command_queue_properties /* properties */,
cl_bool /* enable */,
cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
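/* Illustrative sketch (not part of the original header): the replacement
 * pattern the deprecation note above describes -- request queue properties
 * at creation time instead of toggling them later.  `context` and `device`
 * are assumed to already exist:
 *
 *   cl_int err;
 *   cl_command_queue queue =
 *       clCreateCommandQueue(context, device,
 *                            CL_QUEUE_PROFILING_ENABLE, &err);
 */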
/* Memory Object APIs */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBuffer(cl_context /* context */,
cl_mem_flags /* flags */,
size_t /* size */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateSubBuffer(cl_mem /* buffer */,
cl_mem_flags /* flags */,
cl_buffer_create_type /* buffer_create_type */,
const void * /* buffer_create_info */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
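/* Illustrative sketch (not part of the original header): carving a 1 KiB
 * sub-buffer out of an existing buffer using the cl_buffer_region type
 * defined above.  `parent` is an assumed, already-created cl_mem:
 *
 *   cl_buffer_region region = { 0, 1024 };  // origin and size in bytes
 *   cl_int err;
 *   cl_mem sub = clCreateSubBuffer(parent, CL_MEM_READ_WRITE,
 *                                  CL_BUFFER_CREATE_TYPE_REGION,
 *                                  &region, &err);
 */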
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage2D(cl_context /* context */,
cl_mem_flags /* flags */,
const cl_image_format * /* image_format */,
size_t /* image_width */,
size_t /* image_height */,
size_t /* image_row_pitch */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage3D(cl_context /* context */,
cl_mem_flags /* flags */,
const cl_image_format * /* image_format */,
size_t /* image_width */,
size_t /* image_height */,
size_t /* image_depth */,
size_t /* image_row_pitch */,
size_t /* image_slice_pitch */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedImageFormats(cl_context /* context */,
cl_mem_flags /* flags */,
cl_mem_object_type /* image_type */,
cl_uint /* num_entries */,
cl_image_format * /* image_formats */,
cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetMemObjectInfo(cl_mem /* memobj */,
cl_mem_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetImageInfo(cl_mem /* image */,
cl_image_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetMemObjectDestructorCallback( cl_mem /* memobj */,
void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1;
/* Sampler APIs */
extern CL_API_ENTRY cl_sampler CL_API_CALL
clCreateSampler(cl_context /* context */,
cl_bool /* normalized_coords */,
cl_addressing_mode /* addressing_mode */,
cl_filter_mode /* filter_mode */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSamplerInfo(cl_sampler /* sampler */,
cl_sampler_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Program Object APIs */
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithSource(cl_context /* context */,
cl_uint /* count */,
const char ** /* strings */,
const size_t * /* lengths */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithBinary(cl_context /* context */,
cl_uint /* num_devices */,
const cl_device_id * /* device_list */,
const size_t * /* lengths */,
const unsigned char ** /* binaries */,
cl_int * /* binary_status */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clBuildProgram(cl_program /* program */,
cl_uint /* num_devices */,
const cl_device_id * /* device_list */,
const char * /* options */,
void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramInfo(cl_program /* program */,
cl_program_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramBuildInfo(cl_program /* program */,
cl_device_id /* device */,
cl_program_build_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Kernel Object APIs */
extern CL_API_ENTRY cl_kernel CL_API_CALL
clCreateKernel(cl_program /* program */,
const char * /* kernel_name */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateKernelsInProgram(cl_program /* program */,
cl_uint /* num_kernels */,
cl_kernel * /* kernels */,
cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArg(cl_kernel /* kernel */,
cl_uint /* arg_index */,
size_t /* arg_size */,
const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelInfo(cl_kernel /* kernel */,
cl_kernel_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelWorkGroupInfo(cl_kernel /* kernel */,
cl_device_id /* device */,
cl_kernel_work_group_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Event Object APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clWaitForEvents(cl_uint /* num_events */,
const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventInfo(cl_event /* event */,
cl_event_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateUserEvent(cl_context /* context */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetUserEventStatus(cl_event /* event */,
cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetEventCallback( cl_event /* event */,
cl_int /* command_exec_callback_type */,
void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
/* Profiling APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventProfilingInfo(cl_event /* event */,
cl_profiling_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Flush and Finish APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
/* Enqueued Commands APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_read */,
size_t /* offset */,
size_t /* cb */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBufferRect(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_read */,
const size_t * /* buffer_offset */,
const size_t * /* host_offset */,
const size_t * /* region */,
size_t /* buffer_row_pitch */,
size_t /* buffer_slice_pitch */,
size_t /* host_row_pitch */,
size_t /* host_slice_pitch */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_write */,
size_t /* offset */,
size_t /* cb */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBufferRect(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_read */,
const size_t * /* buffer_offset */,
const size_t * /* host_offset */,
const size_t * /* region */,
size_t /* buffer_row_pitch */,
size_t /* buffer_slice_pitch */,
size_t /* host_row_pitch */,
size_t /* host_slice_pitch */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBuffer(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_buffer */,
size_t /* src_offset */,
size_t /* dst_offset */,
size_t /* cb */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferRect(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_buffer */,
const size_t * /* src_origin */,
const size_t * /* dst_origin */,
const size_t * /* region */,
size_t /* src_row_pitch */,
size_t /* src_slice_pitch */,
size_t /* dst_row_pitch */,
size_t /* dst_slice_pitch */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_read */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t /* row_pitch */,
size_t /* slice_pitch */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_write */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t /* input_row_pitch */,
size_t /* input_slice_pitch */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImage(cl_command_queue /* command_queue */,
cl_mem /* src_image */,
cl_mem /* dst_image */,
const size_t * /* src_origin[3] */,
const size_t * /* dst_origin[3] */,
const size_t * /* region[3] */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
cl_mem /* src_image */,
cl_mem /* dst_buffer */,
const size_t * /* src_origin[3] */,
const size_t * /* region[3] */,
size_t /* dst_offset */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_image */,
size_t /* src_offset */,
const size_t * /* dst_origin[3] */,
const size_t * /* region[3] */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_map */,
cl_map_flags /* map_flags */,
size_t /* offset */,
size_t /* cb */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_map */,
cl_map_flags /* map_flags */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t * /* image_row_pitch */,
size_t * /* image_slice_pitch */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
cl_mem /* memobj */,
void * /* mapped_ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* work_dim */,
const size_t * /* global_work_offset */,
const size_t * /* global_work_size */,
const size_t * /* local_work_size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueTask(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNativeKernel(cl_command_queue /* command_queue */,
void (*user_func)(void *),
void * /* args */,
size_t /* cb_args */,
cl_uint /* num_mem_objects */,
const cl_mem * /* mem_list */,
const void ** /* args_mem_loc */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMarker(cl_command_queue /* command_queue */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
cl_uint /* num_events */,
const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
/* Extension function access
*
* Returns the extension function address for the given function name,
* or NULL if a valid function can not be found. The client must
* check to make sure the address is not NULL, before using or
* calling the returned function address.
*/
extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0;
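/* Illustrative sketch (not part of the original header): the NULL check the
 * comment above requires, using the clIcdGetPlatformIDsKHR_fn pointer type
 * that cl_ext.h declares:
 *
 *   clIcdGetPlatformIDsKHR_fn pfn = (clIcdGetPlatformIDsKHR_fn)
 *       clGetExtensionFunctionAddress("clIcdGetPlatformIDsKHR");
 *   if (pfn != NULL) {
 *     cl_uint n;
 *     pfn(0, NULL, &n);
 *   }
 */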
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_H */

View File

@@ -1,213 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11687 $ on $Date: 2010-06-12 03:47:22 +0530 (Sat, 12 Jun 2010) $ */
/* cl_ext.h contains OpenCL extensions which don't have external */
/* (OpenGL, D3D) dependencies. */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <AvailabilityMacros.h>
#else
#include <CL/cl.h>
#endif
/* cl_khr_fp64 extension - no extension #define since it has no functions */
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* Memory object destruction
*
* Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
*
* Registers a user callback function that will be called when the memory object is deleted and its resources
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
* stack associated with memobj. The registered user callback functions are called in the reverse order in
* which they were registered. The user callback functions are called and then the memory object is deleted
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
* the storage bits for the memory object, can be reused or freed.
*
 * The application may not call CL APIs with the cl_mem object passed to the pfn_notify.
*
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*/
#define cl_APPLE_SetMemObjectDestructor 1
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
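/* Illustrative sketch (not part of the original header): registering a
 * destructor so externally allocated host memory can be reclaimed once the
 * cl_mem is deleted.  `buf` and `host_ptr` are assumed to already exist:
 *
 *   static void notify(cl_mem memobj, void *user_data) {
 *     free(user_data);               // the host_ptr registered below
 *   }
 *   ...
 *   clSetMemObjectDestructorAPPLE(buf, notify, host_ptr);
 */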
/* Context Logging Functions
*
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*
 * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
*/
#define cl_APPLE_ContextLoggingFunctions 1
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
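/* Illustrative sketch (not part of the original header): these loggers share
 * clCreateContext's pfn_notify signature, so they can be passed directly.
 * `props` and `devices` are assumed to already exist:
 *
 *   cl_int err;
 *   cl_context ctx = clCreateContext(props, 1, devices,
 *                                    clLogMessagesToStderrAPPLE,
 *                                    NULL, &err);
 */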
/************************
* cl_khr_icd extension *
************************/
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
/******************************************
* cl_nv_device_attribute_query extension *
******************************************/
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
/*********************************
* cl_amd_device_attribute_query *
*********************************/
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
#ifdef CL_VERSION_1_1
/***********************************
* cl_ext_device_fission extension *
***********************************/
#define cl_ext_device_fission 1
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef cl_ulong cl_device_partition_property_ext;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevicesEXT( cl_device_id /*in_device*/,
const cl_device_partition_property_ext * /* properties */,
cl_uint /*num_entries*/,
cl_device_id * /*out_devices*/,
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
const cl_device_partition_property_ext * /* properties */,
cl_uint /*num_entries*/,
cl_device_id * /*out_devices*/,
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
/* cl_device_partition_property_ext */
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
/* clDeviceGetInfo selectors */
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
/* error codes */
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
#define CL_INVALID_PARTITION_COUNT_EXT -1058
#define CL_INVALID_PARTITION_NAME_EXT -1059
/* CL_AFFINITY_DOMAINs */
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
/* cl_device_partition_property_ext list terminators */
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
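/* Illustrative sketch (not part of the original header): splitting a parent
 * device into two equal sub-devices with the terminated property-list
 * format above.  `dev` is an assumed parent cl_device_id:
 *
 *   cl_device_partition_property_ext props[] =
 *       { CL_DEVICE_PARTITION_EQUALLY_EXT, 2, CL_PROPERTIES_LIST_END_EXT };
 *   cl_device_id sub[2];
 *   cl_uint n;
 *   clCreateSubDevicesEXT(dev, props, 2, sub, &n);
 */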
#endif /* CL_VERSION_1_1 */
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */

View File

@@ -1,155 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-14 12:06:24 +0530 (Mon, 14 Jun 2010) $ */
/*
* cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have
* OpenGL dependencies. The application is responsible for #including
* OpenGL or OpenGL ES headers before #including cl_gl.h.
*/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <OpenGL/CGLDevice.h>
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
typedef struct __GLsync *cl_GLsync;
/* cl_gl_object_type */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* bufobj */,
int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* renderbuffer */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem /* memobj */,
cl_gl_object_type * /* gl_object_type */,
cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem /* memobj */,
cl_gl_texture_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
cl_gl_context_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
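/* Illustrative sketch (not part of the original header): as a KHR extension
 * this entry point is normally fetched by name at run time.  `props` is an
 * assumed cl_context_properties list naming the GL context and display:
 *
 *   clGetGLContextInfoKHR_fn pfn = (clGetGLContextInfoKHR_fn)
 *       clGetExtensionFunctionAddress("clGetGLContextInfoKHR");
 *   cl_device_id dev;
 *   if (pfn != NULL)
 *     pfn(props, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR,
 *         sizeof(dev), &dev, NULL);
 */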
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */

View File

@@ -1,69 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-14 12:06:24 +0530 (Mon, 14 Jun 2010) $ */
/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
/* OpenGL dependencies. */
#ifndef __OPENCL_CL_GL_EXT_H
#define __OPENCL_CL_GL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl_gl.h>
#else
#include <CL/cl_gl.h>
#endif
/*
 * For each extension, follow this template:
 *
 *   // cl_VEN_extname extension
 *   #define cl_VEN_extname 1
 *   ... define new types, if any
 *   ... define new tokens, if any
 *   ... define new APIs, if any
 *
 * If you need GLtypes here, mirror them with a cl_GLtype, rather than
 * including a GL header.  This allows us to avoid having to decide whether
 * to include GL headers or GLES here.
 */
/*
* cl_khr_gl_event extension
* See section 9.9 in the OpenCL 1.1 spec for more information
*/
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(cl_context /* context */,
cl_GLsync /* cl_GLsync */,
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
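/* Illustrative sketch (not part of the original header): wrapping a GL fence
 * in a CL event so an enqueue can wait on pending GL work.  `context`,
 * `queue`, `mem`, and the GL fence `sync` are assumed to already exist:
 *
 *   cl_int err;
 *   cl_event ev = clCreateEventFromGLsyncKHR(context, (cl_GLsync) sync, &err);
 *   clEnqueueAcquireGLObjects(queue, 1, &mem, 1, &ev, NULL);
 */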
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_EXT_H */

File diff suppressed because it is too large

View File

@@ -1,22 +0,0 @@
/*
** Copyright 1998-2002, NVIDIA Corporation.
** All Rights Reserved.
**
** THE INFORMATION CONTAINED HEREIN IS PROPRIETARY AND CONFIDENTIAL TO
** NVIDIA, CORPORATION. USE, REPRODUCTION OR DISCLOSURE TO ANY THIRD PARTY
** IS SUBJECT TO WRITTEN PRE-APPROVAL BY NVIDIA, CORPORATION.
**
**
*/
#ifndef __CLEXT_H
#define __CLEXT_H
#define CL_NV_DEVICE_COMPUTE_CAPABILITY_MAJOR 0x4000
#define CL_NV_DEVICE_COMPUTE_CAPABILITY_MINOR 0x4001
#define CL_NV_DEVICE_REGISTERS_PER_BLOCK 0x4002
#define CL_NV_DEVICE_WARP_SIZE 0x4003
#define CL_NV_DEVICE_GPU_OVERLAP 0x4004
#define CL_NV_DEVICE_KERNEL_EXEC_TIMEOUT 0x4005
#define CL_NV_DEVICE_INTEGRATED_MEMORY 0x4006
#endif

View File

@@ -1,54 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2010 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-14 12:06:24 +0530 (Mon, 14 Jun 2010) $ */
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <OpenCL/cl_gl.h>
#include <OpenCL/cl_gl_ext.h>
#include <OpenCL/cl_ext.h>
#else
#include <CL/cl.h>
#include <CL/cl_gl.h>
#include <CL/cl_gl_ext.h>
#include <CL/cl_ext.h>
#endif
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */

View File

@@ -1,876 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 10424 $ on $Date: 2010-02-17 14:34:49 -0800 (Wed, 17 Feb 2010) $ */
#ifndef __OPENCL_CL_H
#define __OPENCL_CL_H
#ifdef __APPLE__
#include <OpenCL/cl_platform.h>
#else
#include <CL/cl_platform.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
typedef struct _cl_platform_id * cl_platform_id;
typedef struct _cl_device_id * cl_device_id;
typedef struct _cl_context * cl_context;
typedef struct _cl_command_queue * cl_command_queue;
typedef struct _cl_mem * cl_mem;
typedef struct _cl_program * cl_program;
typedef struct _cl_kernel * cl_kernel;
typedef struct _cl_event * cl_event;
typedef struct _cl_sampler * cl_sampler;
typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
typedef cl_ulong cl_bitfield;
typedef cl_bitfield cl_device_type;
typedef cl_uint cl_platform_info;
typedef cl_uint cl_device_info;
typedef cl_bitfield cl_device_address_info;
typedef cl_bitfield cl_device_fp_config;
typedef cl_uint cl_device_mem_cache_type;
typedef cl_uint cl_device_local_mem_type;
typedef cl_bitfield cl_device_exec_capabilities;
typedef cl_bitfield cl_command_queue_properties;
typedef intptr_t cl_context_properties;
typedef cl_uint cl_context_info;
typedef cl_uint cl_command_queue_info;
typedef cl_uint cl_channel_order;
typedef cl_uint cl_channel_type;
typedef cl_bitfield cl_mem_flags;
typedef cl_uint cl_mem_object_type;
typedef cl_uint cl_mem_info;
typedef cl_uint cl_image_info;
typedef cl_uint cl_addressing_mode;
typedef cl_uint cl_filter_mode;
typedef cl_uint cl_sampler_info;
typedef cl_bitfield cl_map_flags;
typedef cl_uint cl_program_info;
typedef cl_uint cl_program_build_info;
typedef cl_int cl_build_status;
typedef cl_uint cl_kernel_info;
typedef cl_uint cl_kernel_work_group_info;
typedef cl_uint cl_event_info;
typedef cl_uint cl_command_type;
typedef cl_uint cl_profiling_info;
typedef struct _cl_image_format {
cl_channel_order image_channel_order;
cl_channel_type image_channel_data_type;
} cl_image_format;
/******************************************************************************/
/* Error Codes */
#define CL_SUCCESS 0
#define CL_DEVICE_NOT_FOUND -1
#define CL_DEVICE_NOT_AVAILABLE -2
#define CL_COMPILER_NOT_AVAILABLE -3
#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
#define CL_OUT_OF_RESOURCES -5
#define CL_OUT_OF_HOST_MEMORY -6
#define CL_PROFILING_INFO_NOT_AVAILABLE -7
#define CL_MEM_COPY_OVERLAP -8
#define CL_IMAGE_FORMAT_MISMATCH -9
#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
#define CL_BUILD_PROGRAM_FAILURE -11
#define CL_MAP_FAILURE -12
#define CL_INVALID_VALUE -30
#define CL_INVALID_DEVICE_TYPE -31
#define CL_INVALID_PLATFORM -32
#define CL_INVALID_DEVICE -33
#define CL_INVALID_CONTEXT -34
#define CL_INVALID_QUEUE_PROPERTIES -35
#define CL_INVALID_COMMAND_QUEUE -36
#define CL_INVALID_HOST_PTR -37
#define CL_INVALID_MEM_OBJECT -38
#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
#define CL_INVALID_IMAGE_SIZE -40
#define CL_INVALID_SAMPLER -41
#define CL_INVALID_BINARY -42
#define CL_INVALID_BUILD_OPTIONS -43
#define CL_INVALID_PROGRAM -44
#define CL_INVALID_PROGRAM_EXECUTABLE -45
#define CL_INVALID_KERNEL_NAME -46
#define CL_INVALID_KERNEL_DEFINITION -47
#define CL_INVALID_KERNEL -48
#define CL_INVALID_ARG_INDEX -49
#define CL_INVALID_ARG_VALUE -50
#define CL_INVALID_ARG_SIZE -51
#define CL_INVALID_KERNEL_ARGS -52
#define CL_INVALID_WORK_DIMENSION -53
#define CL_INVALID_WORK_GROUP_SIZE -54
#define CL_INVALID_WORK_ITEM_SIZE -55
#define CL_INVALID_GLOBAL_OFFSET -56
#define CL_INVALID_EVENT_WAIT_LIST -57
#define CL_INVALID_EVENT -58
#define CL_INVALID_OPERATION -59
#define CL_INVALID_GL_OBJECT -60
#define CL_INVALID_BUFFER_SIZE -61
#define CL_INVALID_MIP_LEVEL -62
#define CL_INVALID_GLOBAL_WORK_SIZE -63
/* OpenCL Version */
#define CL_VERSION_1_0 1
/* cl_bool */
#define CL_FALSE 0
#define CL_TRUE 1
/* cl_platform_info */
#define CL_PLATFORM_PROFILE 0x0900
#define CL_PLATFORM_VERSION 0x0901
#define CL_PLATFORM_NAME 0x0902
#define CL_PLATFORM_VENDOR 0x0903
#define CL_PLATFORM_EXTENSIONS 0x0904
/* cl_device_type - bitfield */
#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CL_DEVICE_TYPE_CPU (1 << 1)
#define CL_DEVICE_TYPE_GPU (1 << 2)
#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
/* cl_device_info */
#define CL_DEVICE_TYPE 0x1000
#define CL_DEVICE_VENDOR_ID 0x1001
#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
#define CL_DEVICE_ADDRESS_BITS 0x100D
#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
#define CL_DEVICE_IMAGE_SUPPORT 0x1016
#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
#define CL_DEVICE_MAX_SAMPLERS 0x1018
#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
#define CL_DEVICE_ENDIAN_LITTLE 0x1026
#define CL_DEVICE_AVAILABLE 0x1027
#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
#define CL_DEVICE_NAME 0x102B
#define CL_DEVICE_VENDOR 0x102C
#define CL_DRIVER_VERSION 0x102D
#define CL_DEVICE_PROFILE 0x102E
#define CL_DEVICE_VERSION 0x102F
#define CL_DEVICE_EXTENSIONS 0x1030
#define CL_DEVICE_PLATFORM 0x1031
/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */
/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
/* cl_device_fp_config - bitfield */
#define CL_FP_DENORM (1 << 0)
#define CL_FP_INF_NAN (1 << 1)
#define CL_FP_ROUND_TO_NEAREST (1 << 2)
#define CL_FP_ROUND_TO_ZERO (1 << 3)
#define CL_FP_ROUND_TO_INF (1 << 4)
#define CL_FP_FMA (1 << 5)
/* cl_device_mem_cache_type */
#define CL_NONE 0x0
#define CL_READ_ONLY_CACHE 0x1
#define CL_READ_WRITE_CACHE 0x2
/* cl_device_local_mem_type */
#define CL_LOCAL 0x1
#define CL_GLOBAL 0x2
/* cl_device_exec_capabilities - bitfield */
#define CL_EXEC_KERNEL (1 << 0)
#define CL_EXEC_NATIVE_KERNEL (1 << 1)
/* cl_command_queue_properties - bitfield */
#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
/* cl_context_info */
#define CL_CONTEXT_REFERENCE_COUNT 0x1080
#define CL_CONTEXT_DEVICES 0x1081
#define CL_CONTEXT_PROPERTIES 0x1082
/* cl_context_info + cl_context_properties */
#define CL_CONTEXT_PLATFORM 0x1084
/* cl_command_queue_info */
#define CL_QUEUE_CONTEXT 0x1090
#define CL_QUEUE_DEVICE 0x1091
#define CL_QUEUE_REFERENCE_COUNT 0x1092
#define CL_QUEUE_PROPERTIES 0x1093
/* cl_mem_flags - bitfield */
#define CL_MEM_READ_WRITE (1 << 0)
#define CL_MEM_WRITE_ONLY (1 << 1)
#define CL_MEM_READ_ONLY (1 << 2)
#define CL_MEM_USE_HOST_PTR (1 << 3)
#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
#define CL_MEM_COPY_HOST_PTR (1 << 5)
/* cl_channel_order */
#define CL_R 0x10B0
#define CL_A 0x10B1
#define CL_RG 0x10B2
#define CL_RA 0x10B3
#define CL_RGB 0x10B4
#define CL_RGBA 0x10B5
#define CL_BGRA 0x10B6
#define CL_ARGB 0x10B7
#define CL_INTENSITY 0x10B8
#define CL_LUMINANCE 0x10B9
/* cl_channel_type */
#define CL_SNORM_INT8 0x10D0
#define CL_SNORM_INT16 0x10D1
#define CL_UNORM_INT8 0x10D2
#define CL_UNORM_INT16 0x10D3
#define CL_UNORM_SHORT_565 0x10D4
#define CL_UNORM_SHORT_555 0x10D5
#define CL_UNORM_INT_101010 0x10D6
#define CL_SIGNED_INT8 0x10D7
#define CL_SIGNED_INT16 0x10D8
#define CL_SIGNED_INT32 0x10D9
#define CL_UNSIGNED_INT8 0x10DA
#define CL_UNSIGNED_INT16 0x10DB
#define CL_UNSIGNED_INT32 0x10DC
#define CL_HALF_FLOAT 0x10DD
#define CL_FLOAT 0x10DE
/* cl_mem_object_type */
#define CL_MEM_OBJECT_BUFFER 0x10F0
#define CL_MEM_OBJECT_IMAGE2D 0x10F1
#define CL_MEM_OBJECT_IMAGE3D 0x10F2
/* cl_mem_info */
#define CL_MEM_TYPE 0x1100
#define CL_MEM_FLAGS 0x1101
#define CL_MEM_SIZE 0x1102
#define CL_MEM_HOST_PTR 0x1103
#define CL_MEM_MAP_COUNT 0x1104
#define CL_MEM_REFERENCE_COUNT 0x1105
#define CL_MEM_CONTEXT 0x1106
/* cl_image_info */
#define CL_IMAGE_FORMAT 0x1110
#define CL_IMAGE_ELEMENT_SIZE 0x1111
#define CL_IMAGE_ROW_PITCH 0x1112
#define CL_IMAGE_SLICE_PITCH 0x1113
#define CL_IMAGE_WIDTH 0x1114
#define CL_IMAGE_HEIGHT 0x1115
#define CL_IMAGE_DEPTH 0x1116
/* cl_addressing_mode */
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
/* cl_filter_mode */
#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141
/* cl_sampler_info */
#define CL_SAMPLER_REFERENCE_COUNT 0x1150
#define CL_SAMPLER_CONTEXT 0x1151
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154
/* cl_map_flags - bitfield */
#define CL_MAP_READ (1 << 0)
#define CL_MAP_WRITE (1 << 1)
/* cl_program_info */
#define CL_PROGRAM_REFERENCE_COUNT 0x1160
#define CL_PROGRAM_CONTEXT 0x1161
#define CL_PROGRAM_NUM_DEVICES 0x1162
#define CL_PROGRAM_DEVICES 0x1163
#define CL_PROGRAM_SOURCE 0x1164
#define CL_PROGRAM_BINARY_SIZES 0x1165
#define CL_PROGRAM_BINARIES 0x1166
/* cl_program_build_info */
#define CL_PROGRAM_BUILD_STATUS 0x1181
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
/* cl_build_status */
#define CL_BUILD_SUCCESS 0
#define CL_BUILD_NONE -1
#define CL_BUILD_ERROR -2
#define CL_BUILD_IN_PROGRESS -3
/* cl_kernel_info */
#define CL_KERNEL_FUNCTION_NAME 0x1190
#define CL_KERNEL_NUM_ARGS 0x1191
#define CL_KERNEL_REFERENCE_COUNT 0x1192
#define CL_KERNEL_CONTEXT 0x1193
#define CL_KERNEL_PROGRAM 0x1194
/* cl_kernel_work_group_info */
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
/* cl_event_info */
#define CL_EVENT_COMMAND_QUEUE 0x11D0
#define CL_EVENT_COMMAND_TYPE 0x11D1
#define CL_EVENT_REFERENCE_COUNT 0x11D2
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
/* cl_command_type */
#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
#define CL_COMMAND_TASK 0x11F1
#define CL_COMMAND_NATIVE_KERNEL 0x11F2
#define CL_COMMAND_READ_BUFFER 0x11F3
#define CL_COMMAND_WRITE_BUFFER 0x11F4
#define CL_COMMAND_COPY_BUFFER 0x11F5
#define CL_COMMAND_READ_IMAGE 0x11F6
#define CL_COMMAND_WRITE_IMAGE 0x11F7
#define CL_COMMAND_COPY_IMAGE 0x11F8
#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
#define CL_COMMAND_MAP_BUFFER 0x11FB
#define CL_COMMAND_MAP_IMAGE 0x11FC
#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
#define CL_COMMAND_MARKER 0x11FE
#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
/* command execution status */
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
/* cl_profiling_info */
#define CL_PROFILING_COMMAND_QUEUED 0x1280
#define CL_PROFILING_COMMAND_SUBMIT 0x1281
#define CL_PROFILING_COMMAND_START 0x1282
#define CL_PROFILING_COMMAND_END 0x1283
/********************************************************************************************************/
/* Platform API */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformIDs(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformInfo(cl_platform_id /* platform */,
cl_platform_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Device APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDs(cl_platform_id /* platform */,
cl_device_type /* device_type */,
cl_uint /* num_entries */,
cl_device_id * /* devices */,
cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceInfo(cl_device_id /* device */,
cl_device_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Context APIs */
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContext(const cl_context_properties * /* properties */,
cl_uint /* num_devices */,
const cl_device_id * /* devices */,
void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
void * /* user_data */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContextFromType(const cl_context_properties * /* properties */,
cl_device_type /* device_type */,
void (*pfn_notify)(const char *, const void *, size_t, void *) /* pfn_notify */,
void * /* user_data */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetContextInfo(cl_context /* context */,
cl_context_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Command Queue APIs */
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueue(cl_context /* context */,
cl_device_id /* device */,
cl_command_queue_properties /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetCommandQueueInfo(cl_command_queue /* command_queue */,
cl_command_queue_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetCommandQueueProperty(cl_command_queue /* command_queue */,
cl_command_queue_properties /* properties */,
cl_bool /* enable */,
cl_command_queue_properties * /* old_properties */) CL_API_SUFFIX__VERSION_1_0;
/* Memory Object APIs */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBuffer(cl_context /* context */,
cl_mem_flags /* flags */,
size_t /* size */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage2D(cl_context /* context */,
cl_mem_flags /* flags */,
const cl_image_format * /* image_format */,
size_t /* image_width */,
size_t /* image_height */,
size_t /* image_row_pitch */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage3D(cl_context /* context */,
cl_mem_flags /* flags */,
const cl_image_format * /* image_format */,
size_t /* image_width */,
size_t /* image_height */,
size_t /* image_depth */,
size_t /* image_row_pitch */,
size_t /* image_slice_pitch */,
void * /* host_ptr */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedImageFormats(cl_context /* context */,
cl_mem_flags /* flags */,
cl_mem_object_type /* image_type */,
cl_uint /* num_entries */,
cl_image_format * /* image_formats */,
cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetMemObjectInfo(cl_mem /* memobj */,
cl_mem_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetImageInfo(cl_mem /* image */,
cl_image_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Sampler APIs */
extern CL_API_ENTRY cl_sampler CL_API_CALL
clCreateSampler(cl_context /* context */,
cl_bool /* normalized_coords */,
cl_addressing_mode /* addressing_mode */,
cl_filter_mode /* filter_mode */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSamplerInfo(cl_sampler /* sampler */,
cl_sampler_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Program Object APIs */
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithSource(cl_context /* context */,
cl_uint /* count */,
const char ** /* strings */,
const size_t * /* lengths */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithBinary(cl_context /* context */,
cl_uint /* num_devices */,
const cl_device_id * /* device_list */,
const size_t * /* lengths */,
const unsigned char ** /* binaries */,
cl_int * /* binary_status */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clBuildProgram(cl_program /* program */,
cl_uint /* num_devices */,
const cl_device_id * /* device_list */,
const char * /* options */,
void (*pfn_notify)(cl_program /* program */, void * /* user_data */),
void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramInfo(cl_program /* program */,
cl_program_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramBuildInfo(cl_program /* program */,
cl_device_id /* device */,
cl_program_build_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Kernel Object APIs */
extern CL_API_ENTRY cl_kernel CL_API_CALL
clCreateKernel(cl_program /* program */,
const char * /* kernel_name */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateKernelsInProgram(cl_program /* program */,
cl_uint /* num_kernels */,
cl_kernel * /* kernels */,
cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArg(cl_kernel /* kernel */,
cl_uint /* arg_index */,
size_t /* arg_size */,
const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelInfo(cl_kernel /* kernel */,
cl_kernel_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelWorkGroupInfo(cl_kernel /* kernel */,
cl_device_id /* device */,
cl_kernel_work_group_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Event Object APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clWaitForEvents(cl_uint /* num_events */,
const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventInfo(cl_event /* event */,
cl_event_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
/* Profiling APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventProfilingInfo(cl_event /* event */,
cl_profiling_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
/* Flush and Finish APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
/* Enqueued Commands APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_read */,
size_t /* offset */,
size_t /* cb */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_write */,
size_t /* offset */,
size_t /* cb */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBuffer(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_buffer */,
size_t /* src_offset */,
size_t /* dst_offset */,
size_t /* cb */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_read */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t /* row_pitch */,
size_t /* slice_pitch */,
void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_write */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t /* input_row_pitch */,
size_t /* input_slice_pitch */,
const void * /* ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImage(cl_command_queue /* command_queue */,
cl_mem /* src_image */,
cl_mem /* dst_image */,
const size_t * /* src_origin[3] */,
const size_t * /* dst_origin[3] */,
const size_t * /* region[3] */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
cl_mem /* src_image */,
cl_mem /* dst_buffer */,
const size_t * /* src_origin[3] */,
const size_t * /* region[3] */,
size_t /* dst_offset */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
cl_mem /* src_buffer */,
cl_mem /* dst_image */,
size_t /* src_offset */,
const size_t * /* dst_origin[3] */,
const size_t * /* region[3] */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapBuffer(cl_command_queue /* command_queue */,
cl_mem /* buffer */,
cl_bool /* blocking_map */,
cl_map_flags /* map_flags */,
size_t /* offset */,
size_t /* cb */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapImage(cl_command_queue /* command_queue */,
cl_mem /* image */,
cl_bool /* blocking_map */,
cl_map_flags /* map_flags */,
const size_t * /* origin[3] */,
const size_t * /* region[3] */,
size_t * /* image_row_pitch */,
size_t * /* image_slice_pitch */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
cl_mem /* memobj */,
void * /* mapped_ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* work_dim */,
const size_t * /* global_work_offset */,
const size_t * /* global_work_size */,
const size_t * /* local_work_size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueTask(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNativeKernel(cl_command_queue /* command_queue */,
void (*user_func)(void *),
void * /* args */,
size_t /* cb_args */,
cl_uint /* num_mem_objects */,
const cl_mem * /* mem_list */,
const void ** /* args_mem_loc */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMarker(cl_command_queue /* command_queue */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
cl_uint /* num_events */,
const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
/* Extension function access
*
* Returns the extension function address for the given function name,
 * or NULL if a valid function cannot be found. The client must check
 * that the address is not NULL before using or calling the returned
 * function address (see the sketch after this header).
*/
extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_H */
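The Platform API declared above, together with the extension-function lookup just described, is enough to bootstrap OpenCL. A minimal, self-contained sketch (not part of the header; error handling trimmed, file name hypothetical) that lists the first platform and NULL-checks an extension entry point:

/* build with e.g.: cc probe_cl.c -lOpenCL */
#include <stdio.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif

int main(void) {
  cl_platform_id platform;
  cl_uint num_platforms;
  char name[256];
  if (clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS)
    return 1;
  clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(name), name, NULL);
  printf("platform: %s (%u total)\n", name, num_platforms);
  /* Extension entry points must be looked up and NULL-checked before use. */
  void *fp = clGetExtensionFunctionAddress("clIcdGetPlatformIDsKHR");
  if (fp == NULL)
    printf("clIcdGetPlatformIDsKHR not exported by this implementation\n");
  return 0;
}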

View File

@ -1,60 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 10424 $ on $Date: 2010-02-17 14:34:49 -0800 (Wed, 17 Feb 2010) $ */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
/* cl_khr_fp64 extension - no extension #define since it has no functions */
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* cl_khr_icd extension */
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */

View File

@ -1,146 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2009 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 10424 $ on $Date: 2010-02-17 14:34:49 -0800 (Wed, 17 Feb 2010) $ */
/*
* cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have
* OpenGL dependencies. The application is responsible for #including
 * OpenGL or OpenGL ES headers before #including cl_gl.h (see the
 * include-order sketch after this header).
*/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
/* cl_gl_object_type */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* bufobj */,
int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* renderbuffer */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem /* memobj */,
cl_gl_object_type * /* gl_object_type */,
cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem /* memobj */,
cl_gl_texture_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
cl_gl_context_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */
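Per the note at the top of this header, the application must bring in the OpenGL (or OpenGL ES) headers itself before cl_gl.h; a minimal sketch of the expected include order:

/* GL headers first, then the CL/GL interop header. */
#ifdef __APPLE__
#include <OpenGL/gl.h>
#include <OpenCL/cl_gl.h>
#else
#include <GL/gl.h>
#include <CL/cl_gl.h>
#endif
/* clCreateFromGLBuffer(), clEnqueueAcquireGLObjects(), etc. are now
   declared and can be used against an existing GL context. */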

File diff suppressed because it is too large

View File

@ -1,22 +0,0 @@
/*
** Copyright 1998-2002, NVIDIA Corporation.
** All Rights Reserved.
**
** THE INFORMATION CONTAINED HEREIN IS PROPRIETARY AND CONFIDENTIAL TO
** NVIDIA, CORPORATION. USE, REPRODUCTION OR DISCLOSURE TO ANY THIRD PARTY
** IS SUBJECT TO WRITTEN PRE-APPROVAL BY NVIDIA, CORPORATION.
**
**
*/
#ifndef __CLEXT_H
#define __CLEXT_H
#define CL_NV_DEVICE_COMPUTE_CAPABILITY_MAJOR 0x4000
#define CL_NV_DEVICE_COMPUTE_CAPABILITY_MINOR 0x4001
#define CL_NV_DEVICE_REGISTERS_PER_BLOCK 0x4002
#define CL_NV_DEVICE_WARP_SIZE 0x4003
#define CL_NV_DEVICE_GPU_OVERLAP 0x4004
#define CL_NV_DEVICE_KERNEL_EXEC_TIMEOUT 0x4005
#define CL_NV_DEVICE_INTEGRATED_MEMORY 0x4006
#endif
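These tokens are meant to be passed as the param_name argument of clGetDeviceInfo() on devices exposing NVIDIA's cl_nv_device_attribute_query extension, which reports each value as a cl_uint. A minimal sketch, assuming an NVIDIA GPU is the first device found:

#include <stdio.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
#include "clext.h"   /* the header above */

int main(void) {
  cl_platform_id platform;
  cl_device_id device;
  cl_uint major, minor, warp;
  if (clGetPlatformIDs(1, &platform, NULL) != CL_SUCCESS ||
      clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL)
        != CL_SUCCESS)
    return 1;
  clGetDeviceInfo(device, CL_NV_DEVICE_COMPUTE_CAPABILITY_MAJOR,
                  sizeof(major), &major, NULL);
  clGetDeviceInfo(device, CL_NV_DEVICE_COMPUTE_CAPABILITY_MINOR,
                  sizeof(minor), &minor, NULL);
  clGetDeviceInfo(device, CL_NV_DEVICE_WARP_SIZE, sizeof(warp), &warp, NULL);
  printf("compute capability %u.%u, warp size %u\n", major, minor, warp);
  return 0;
}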

View File

@ -1,121 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <cmath>
#include "lj96_cut_gpu_memory.h"
using namespace std;
static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJ96MF.clear();
gpu_mode=LJ96MF.device->gpu_mode();
double gpu_split=LJ96MF.device->particle_split();
int first_gpu=LJ96MF.device->first_device();
int last_gpu=LJ96MF.device->last_device();
int world_me=LJ96MF.device->world_me();
int gpu_rank=LJ96MF.device->gpu_rank();
int procs_per_gpu=LJ96MF.device->procs_per_gpu();
LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu);
bool message=false;
if (LJ96MF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJ96MF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJ96MF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJ96MF.estimate_gpu_overhead();
return init_ok;
}
void lj96_gpu_clear() {
LJ96MF.clear();
}
int** lj96_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void lj96_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lj96_gpu_bytes() {
return LJ96MF.host_memory_usage();
}

View File

@ -1,387 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ96_GPU_KERNEL
#define LJ96_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
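// Positions are fetched through the texture cache in single precision;
// the double-precision build falls back to a direct global load because
// tex1Dfetch() has no double4 variant.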
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
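// Neighbor indices pack the special-bond type into their top two bits:
// sbmask() extracts those bits (an index into sp_lj) and NEIGHMASK
// clears them to recover the actual neighbor index.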
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
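// If the list is unpacked (dev_nbor==dev_packed), neighbors are stored
// strided by nbor_pitch in dev_nbor itself; otherwise the next entry read
// is an offset into the densely packed dev_packed array.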
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = sqrt(r6inv);
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
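// Tree reduction in shared memory across the t_per_atom threads assigned
// to atom ii; no barrier is used, so this relies on t_per_atom not
// exceeding the warp/wavefront size.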
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = sqrt(r6inv);
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif

View File

@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj96_cut_gpu_cl.h"
#else
#include "lj96_cut_gpu_ptx.h"
#endif
#include "lj96_cut_gpu_memory.h"
#include <cassert>
#define LJ96_GPU_MemoryT LJ96_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::LJ96_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::~LJ96_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJ96_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJ96_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJ96_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
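// One block of BX threads handles BX/t_per_atom atoms, so
// ceil(inum/(BX/t_per_atom)) blocks cover all atoms in the answer list.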
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJ96_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ96_GPU_MEMORY_H
#define LJ96_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJ96_GPU_Memory();
~LJ96_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card
* (a sketch translating these codes into messages follows this header) **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif
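The init() return codes documented above lend themselves to a simple host-side check. check_gpu_init() below is a hypothetical helper, not part of the library; it only translates the documented codes into messages:

#include <cstdio>

// Hypothetical helper mapping the documented init() return codes to text.
inline void check_gpu_init(int flag) {
  switch (flag) {
    case  0: return;                                              // success
    case -1: std::fprintf(stderr, "fix gpu not found\n"); break;
    case -3: std::fprintf(stderr, "out of memory on device\n"); break;
    case -4: std::fprintf(stderr, "library not compiled for GPU\n"); break;
    case -5: std::fprintf(stderr, "double precision unsupported\n"); break;
    default: std::fprintf(stderr, "unknown GPU init error %d\n", flag);
  }
}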

View File

@ -1,168 +0,0 @@
/***************************************************************************
lj_class2_long.cpp
-------------------
W. Michael Brown
Host code for COMPASS LJ long potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Mon May 16 2011
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "lj_class2_long_cl.h"
#else
#include "lj_class2_long_ptx.h"
#endif
#include "lj_class2_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJClass2LongT LJClass2Long<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJClass2LongT::LJClass2Long() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJClass2LongT::~LJClass2Long() {
clear();
}
template <class numtyp, class acctyp>
int LJClass2LongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJClass2LongT::init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_class2_long);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJClass2LongT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJClass2LongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJClass2Long<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
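// Example: with BX=64 and _threads_per_atom=4, each block covers
// 64/4=16 atoms, so inum=1000 atoms launch GX=ceil(1000/16)=63 blocks.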
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJClass2Long<PRECISION,ACC_PRECISION>;


@ -1,470 +0,0 @@
// **************************************************************************
// lj_class2_long.cu
// -------------------
// W. Michael Brown
//
// Device code for COMPASS LJ long acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Mon May 16 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef LJCL_GPU_KERNEL
#define LJCL_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
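/* The constants above implement the Abramowitz & Stegun 7.1.26 polynomial
   approximation of the complementary error function used in the real-space
   Ewald term:
     erfc(x) ~= t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5)))) * exp(-x*x),
     with t = 1/(1 + EWALD_P*x), and EWALD_F = 2/sqrt(pi).
   SBBITS/NEIGHMASK decode packed neighbor entries: the top two bits of a
   neighbor index select the special-bond scaling factor (sbmask), and
   NEIGHMASK strips those bits to recover the atom index j. */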
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
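    /* Two neighbor layouts are handled above: if the list is unpacked
       (dev_nbor==dev_packed), the t_per_atom threads assigned to atom i
       walk column-strided entries nbor_pitch apart; otherwise they walk a
       packed per-atom sublist with stride t_per_atom, each thread starting
       at its own offset. */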
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
numtyp rinv=sqrt(r2inv);
r3inv=r2inv*rinv;
r6inv = r3inv*r3inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
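      /* The branch above is the class2 (COMPASS) 9-6 Lennard-Jones form,
         E = eps*(2*(sigma/r)^9 - 3*(sigma/r)^6); lj1.x and lj1.y are
         assumed to hold the precomputed 9- and 6-term force prefactors
         (18*eps*sigma^9 and 18*eps*sigma^6 in the CPU pair style). */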
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
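  /* Note: this tree reduction uses no barrier between steps, which appears
     to rely on t_per_atom never exceeding the warp/wavefront size so the
     cooperating threads execute in lockstep. */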
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
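  // The barrier above ensures the shared lj1/lj3/sp_lj tables loaded by
  // the first threads are visible to the whole work-group before use.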
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
numtyp rinv=sqrt(r2inv);
r3inv=r2inv*rinv;
r6inv = r3inv*r3inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,84 +0,0 @@
/***************************************************************************
lj_class2_long.h
-------------------
W. Michael Brown
Host code for COMPASS LJ long potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Mon May 16 2011
email : brownw@ornl.gov
***************************************************************************/
#ifndef LJ_CLASS2_LONG_H
#define LJ_CLASS2_LONG_H
#include "charge_gpu_memory.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJClass2Long : public ChargeGPUMemory<numtyp, acctyp> {
public:
LJClass2Long();
~LJClass2Long();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif


@ -1,129 +0,0 @@
/***************************************************************************
lj_class2_long_ext.cpp
-------------------
W. Michael Brown
LAMMPS Wrappers for COMPASS LJ long Acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Mon May 16 2011
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj_class2_long.h"
using namespace std;
using namespace LAMMPS_AL;
static LJClass2Long<PRECISION,ACC_PRECISION> C2CLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
C2CLMF.clear();
gpu_mode=C2CLMF.device->gpu_mode();
double gpu_split=C2CLMF.device->particle_split();
int first_gpu=C2CLMF.device->first_device();
int last_gpu=C2CLMF.device->last_device();
int world_me=C2CLMF.device->world_me();
int gpu_rank=C2CLMF.device->gpu_rank();
int procs_per_gpu=C2CLMF.device->procs_per_gpu();
C2CLMF.device->init_message(screen,"lj/class2/coul/long",first_gpu,last_gpu);
bool message=false;
if (C2CLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
C2CLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
C2CLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
C2CLMF.estimate_gpu_overhead();
return init_ok;
}
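/* Initialization above is staged: process 0 initializes and compiles the
   kernels first (presumably so the build happens once and can be reused),
   then one rank per GPU attaches in turn behind the barriers. */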
void c2cl_gpu_clear() {
C2CLMF.clear();
}
int** c2cl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return C2CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void c2cl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
C2CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double c2cl_gpu_bytes() {
return C2CLMF.host_memory_usage();
}
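A hedged sketch (hypothetical driver code, not part of the library) of the
call order these wrappers expect: initialize once, call a compute wrapper
every timestep, then release device data:

// Hypothetical driver for the c2cl_gpu_* wrappers above; all variables
// are assumed to be set up by the calling pair style.
int gpu_mode;
int ok = c2cl_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                       inum, nall, 300, maxspecial, cell_size, gpu_mode,
                       screen, cut_ljsq, cut_coulsq, special_coul, qqrd2e,
                       g_ewald);
if (ok == 0) {
  // per timestep: c2cl_gpu_compute_n() when the device builds the
  // neighbor list, or c2cl_gpu_compute() to reuse a host-built list
  c2cl_gpu_clear();   // free host and device data at the end of the run
}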


@ -1,121 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj_cut_gpu_memory.h"
using namespace std;
static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJLMF.clear();
gpu_mode=LJLMF.device->gpu_mode();
double gpu_split=LJLMF.device->particle_split();
int first_gpu=LJLMF.device->first_device();
int last_gpu=LJLMF.device->last_device();
int world_me=LJLMF.device->world_me();
int gpu_rank=LJLMF.device->gpu_rank();
int procs_per_gpu=LJLMF.device->procs_per_gpu();
LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu);
bool message=false;
if (LJLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJLMF.estimate_gpu_overhead();
return init_ok;
}
void ljl_gpu_clear() {
LJLMF.clear();
}
int ** ljl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double ljl_gpu_bytes() {
return LJLMF.host_memory_usage();
}


@ -1,385 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
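        /* Standard 12-6 LJ: in the usual LAMMPS convention lj1.x=48*eps*
           sigma^12 and lj1.y=24*eps*sigma^6, so force is F/r scaled by
           factor_lj. Note r2inv held r^2 until the inversion above. */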
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj_cut_gpu_cl.h"
#else
#include "lj_cut_gpu_ptx.h"
#endif
#include "lj_cut_gpu_memory.h"
#include <cassert>
#define LJL_GPU_MemoryT LJL_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJL_GPU_MemoryT::LJL_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJL_GPU_MemoryT::~LJL_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJL_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJL_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJL_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJL_GPU_MEMORY_H
#define LJL_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJL_GPU_Memory();
~LJL_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -1,122 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj_expand_gpu_memory.h"
using namespace std;
static LJE_GPU_Memory<PRECISION,ACC_PRECISION> LJEMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double **shift, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
LJEMF.clear();
gpu_mode=LJEMF.device->gpu_mode();
double gpu_split=LJEMF.device->particle_split();
int first_gpu=LJEMF.device->first_device();
int last_gpu=LJEMF.device->last_device();
int world_me=LJEMF.device->world_me();
int gpu_rank=LJEMF.device->gpu_rank();
int procs_per_gpu=LJEMF.device->procs_per_gpu();
LJEMF.device->init_message(screen,"lj/expand",first_gpu,last_gpu);
bool message=false;
if (LJEMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, shift, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJEMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, shift, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split,screen);
LJEMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJEMF.estimate_gpu_overhead();
return init_ok;
}
void lje_gpu_clear() {
LJEMF.clear();
}
int** lje_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJEMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void lje_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJEMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lje_gpu_bytes() {
return LJEMF.host_memory_usage();
}


@ -1,392 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
------------------------------------------------------------------------- */
#ifndef LJE_GPU_KERNEL
#define LJE_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
numtyp r = sqrt(r2inv);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = (numtyp) 1.0/rshiftsq;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj/rshift/r;
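        /* lj/expand evaluates the 12-6 interaction at the shifted
           separation rshift = r - delta (lj1.w holds the shift delta);
           the extra 1/rshift and 1/r factors convert dE/drshift into
           Cartesian force components along delx, dely, delz. */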
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
numtyp r = sqrt(r2inv);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = (numtyp)1.0/rshiftsq;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj/rshift/r;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj_expand_gpu_cl.h"
#else
#include "lj_expand_gpu_ptx.h"
#endif
#include "lj_expand_gpu_memory.h"
#include <cassert>
#define LJE_GPU_MemoryT LJE_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJE_GPU_MemoryT::LJE_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJE_GPU_MemoryT::~LJE_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJE_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJE_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, double **host_shift,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_expand_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_shift);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJE_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJE_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJE_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJE_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
------------------------------------------------------------------------- */
#ifndef LJE_GPU_MEMORY_H
#define LJE_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJE_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJE_GPU_Memory();
~LJE_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double **host_shift, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = shift
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -1,129 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "ljc_cut_gpu_memory.h"
using namespace std;
static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
LJCMF.clear();
gpu_mode=LJCMF.device->gpu_mode();
double gpu_split=LJCMF.device->particle_split();
int first_gpu=LJCMF.device->first_device();
int last_gpu=LJCMF.device->last_device();
int world_me=LJCMF.device->world_me();
int gpu_rank=LJCMF.device->gpu_rank();
int procs_per_gpu=LJCMF.device->procs_per_gpu();
LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu);
bool message=false;
if (LJCMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJCMF.estimate_gpu_overhead();
return init_ok;
}
void ljc_gpu_clear() {
LJCMF.clear();
}
int** ljc_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double ljc_gpu_bytes() {
return LJCMF.host_memory_usage();
}
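/* Usage sketch (annotation, not part of the original file): a host pair
   style would drive this C interface roughly as below. Every argument name
   is a placeholder for data LAMMPS already maintains on the host; only the
   call order is the point.

   int gpu_mode; bool success; int host_start;
   int err = ljc_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset,
                          special_lj, inum, nall, max_nbors, maxspecial,
                          cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
                          special_coul, qqrd2e);     // once per run, 0 on success
   // Each timestep, either let the library build the neighbor list:
   int **firstneigh = ljc_gpu_compute_n(ago, inum, nall, x, type, sublo,
                                        subhi, tag, nspecial, special, eflag,
                                        vflag, eatom, vatom, host_start,
                                        &ilist, &jnum, cpu_time, success, q,
                                        boxlo, prd);
   // ...or pass a host-built list through ljc_gpu_compute(); then once at
   // the end of the run:
   ljc_gpu_clear();
*/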


@ -1,448 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJC_GPU_KERNEL
#define LJC_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
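// Annotation: neighbor indices arrive with the LAMMPS special-bond class
// packed into their top two bits (SBBITS=30): 0 for a plain pair, 1-3 for
// 1-2/1-3/1-4 neighbors. sbmask() extracts that class to select the sp_lj
// scale factor; NEIGHMASK strips the bits to recover the atom index.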
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
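// Annotation: two neighbor layouts are handled below. When
// dev_nbor==dev_packed the indices are stored column-wise with row pitch
// nbor_pitch, so each of the t_per_atom threads sharing atom i starts
// `offset` rows in and strides by t_per_atom rows; otherwise dev_nbor holds
// an offset into the dense dev_packed array and threads stride by
// t_per_atom elements.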
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype]) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*sqrt(r2inv)*factor_coul;
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += forcecoul;
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
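// Annotation: the t_per_atom threads sharing one atom fold their partial
// force/energy/virial sums together with a shared-memory tree reduction,
// halving the stride s each pass until thread offset 0 holds the total.
// No barrier is issued between passes; the code relies on the cooperating
// threads residing in the same warp (warp-synchronous execution).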
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
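// Annotation: the first MAX_SHARED_TYPES*MAX_SHARED_TYPES threads stage the
// padded coefficient tables into __local memory; the __syncthreads() below
// publishes them to the whole work-group before the force loop reads them.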
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*sqrt(r2inv)*factor_coul;
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += forcecoul;
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,170 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "ljc_cut_gpu_cl.h"
#else
#include "ljc_cut_gpu_ptx.h"
#endif
#include "ljc_cut_gpu_memory.h"
#include <cassert>
#define LJC_GPU_MemoryT LJC_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJC_GPU_MemoryT::LJC_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJC_GPU_MemoryT::~LJC_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJC_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljc_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
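// Annotation: padding lj_types up to max_shared_types lets kernel_pair_fast
// cache the entire coefficient table in its fixed-size __local arrays.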
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJC_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJC_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJC_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJC_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -1,84 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJC_GPU_MEMORY_H
#define LJC_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class LJC_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
LJC_GPU_Memory();
~LJC_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// cutsq
UCL_D_Vec<numtyp> cutsq;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -1,130 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "ljcl_cut_gpu_memory.h"
using namespace std;
static LJCL_GPU_Memory<PRECISION,ACC_PRECISION> LJCLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
LJCLMF.clear();
gpu_mode=LJCLMF.device->gpu_mode();
double gpu_split=LJCLMF.device->particle_split();
int first_gpu=LJCLMF.device->first_device();
int last_gpu=LJCLMF.device->last_device();
int world_me=LJCLMF.device->world_me();
int gpu_rank=LJCLMF.device->gpu_rank();
int procs_per_gpu=LJCLMF.device->procs_per_gpu();
LJCLMF.device->init_message(screen,"lj/cut/coul/long",first_gpu,last_gpu);
bool message=false;
if (LJCLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJCLMF.estimate_gpu_overhead();
return init_ok;
}
void ljcl_gpu_clear() {
LJCLMF.clear();
}
int** ljcl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double ljcl_gpu_bytes() {
return LJCLMF.host_memory_usage();
}


@ -1,468 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJCL_GPU_KERNEL
#define LJCL_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
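// Annotation: erfc is evaluated with the Abramowitz & Stegun 7.1.26 fit,
//   erfc(x) ~= t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5)))) * exp(-x*x),
//   t = 1/(1 + EWALD_P*x),
// with EWALD_F = 2/sqrt(pi). The kernels below use it for the real-space
// Ewald force
//   F(r) = qqrd2e*qi*qj*(erfc(g*r) + EWALD_F*g*r*exp(-(g*r)^2))/r^3,
// applied to each displacement component.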
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
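// Annotation: sp_lj[4..7] hold special_coul, so factor_coul here is
// 1-special_coul. It is subtracted inside the damped term below because
// the k-space sum already includes the full 1/r interaction; only the
// scaled real-space portion of an excluded pair must be removed.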
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -1,168 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "ljcl_cut_gpu_cl.h"
#else
#include "ljcl_cut_gpu_ptx.h"
#endif
#include "ljcl_cut_gpu_memory.h"
#include <cassert>
#define LJCL_GPU_MemoryT LJCL_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJCL_GPU_MemoryT::LJCL_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJCL_GPU_MemoryT::~LJCL_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJCL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljcl_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJCL_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJCL_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJCL_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virials
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJCL_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -1,82 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJCL_GPU_MEMORY_H
#define LJCL_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class LJCL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
LJCL_GPU_Memory();
~LJCL_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -1,122 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "morse_gpu_memory.h"
using namespace std;
static MOR_GPU_Memory<PRECISION,ACC_PRECISION> MORMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int mor_gpu_init(const int ntypes, double **cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
MORMF.clear();
gpu_mode=MORMF.device->gpu_mode();
double gpu_split=MORMF.device->particle_split();
int first_gpu=MORMF.device->first_device();
int last_gpu=MORMF.device->last_device();
int world_me=MORMF.device->world_me();
int gpu_rank=MORMF.device->gpu_rank();
int procs_per_gpu=MORMF.device->procs_per_gpu();
MORMF.device->init_message(screen,"morse",first_gpu,last_gpu);
bool message=false;
if (MORMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
MORMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
MORMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
MORMF.estimate_gpu_overhead();
return init_ok;
}
void mor_gpu_clear() {
MORMF.clear();
}
int** mor_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void mor_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
MORMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double mor_gpu_bytes() {
return MORMF.host_memory_usage();
}


@ -1,388 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef MORSE_GPU_KERNEL
#define MORSE_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
__global numtyp2* mor2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r<mor1[mtype].x) {
r=sqrt(r);
numtyp dexp=r-mor1[mtype].z;
dexp=exp(-mor1[mtype].w*dexp);
numtyp dm=dexp*dexp-dexp;
numtyp force = mor1[mtype].y*dm/r*factor_lj;
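// Annotation: mor1 packs {cutsq, morse1, r0, alpha} per type pair, where
// morse1 is precomputed on the host as 2*D0*alpha (as in the CPU pair
// style). With dexp = exp(-alpha*(r-r0)), dm = dexp*dexp-dexp and
// force = morse1*dm/r is -dE/dr * 1/r for E = D0*(dexp*dexp-2*dexp),
// ready to scale delx/dely/delz. Note r holds the squared distance until
// the sqrt above.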
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y;
energy+=e*factor_lj;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
__global numtyp2* mor2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
mor1[tid]=mor1_in[tid];
if (eflag>0)
mor2[tid]=mor2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r = delx*delx+dely*dely+delz*delz;
if (r<mor1[mtype].x) {
r=sqrt(r);
numtyp dexp=r-mor1[mtype].z;
dexp=exp(-mor1[mtype].w*dexp);
numtyp dm=dexp*dexp-dexp;
numtyp force = mor1[mtype].y*dm/r*factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y;
energy+=e*factor_lj;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif

View File

@ -1,155 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "morse_gpu_cl.h"
#else
#include "morse_gpu_ptx.h"
#endif
#include "morse_gpu_memory.h"
#include <cassert>
#define MOR_GPU_MemoryT MOR_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
MOR_GPU_MemoryT::MOR_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
MOR_GPU_MemoryT::~MOR_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int MOR_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int MOR_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_morse1,
double **host_r0, double **host_alpha,
double **host_d0, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,morse_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
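// Pad the type count up to max_shared_types so the fast kernel can
// index the type-pair tables with a fixed MAX_SHARED_TYPES row stride.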
if (types<=max_shared_types && this->_block_size>=max_shared_types) {
types=max_shared_types;
shared_types=true;
}
_types=types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(types*types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<types*types; i++)
host_write[i]=0.0;
mor1.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,types,mor1,host_write,host_cutsq,host_morse1,
host_r0,host_alpha);
mor2.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,types,mor2,host_write,host_d0,host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=mor1.row_bytes()+mor2.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void MOR_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
mor1.clear();
mor2.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double MOR_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(MOR_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
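// One block of BX threads covers BX/t_per_atom atoms, so the grid is
// sized to span all inum atoms assigned to the device.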
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(),
&mor2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
&_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class MOR_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef MOR_GPU_MEMORY_H
#define MOR_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class MOR_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
MOR_GPU_Memory();
~MOR_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the device
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_morse1, double **host_r0, double **host_alpha,
double **host_d0, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// mor1.x = cutsq, mor1.y = morse1, mor1.z = r0, mor1.w = alpha
UCL_D_Vec<numtyp4> mor1;
/// mor2.x = d0, mor2.y = offset
UCL_D_Vec<numtyp2> mor2;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -1,55 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
/*************************************************************************
See pair_gpu_dev_kernel.cu for definitions
of preprocessor constants
*************************************************************************/
#ifndef NV_KERNEL_DEF
#define NV_KERNEL_DEF
#include "geryon/ucl_nv_kernel.h"
#ifdef __CUDA_ARCH__
#define ARCH __CUDA_ARCH__
#else
#define ARCH 100
#endif
#if (ARCH < 200)
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_PAIR 64
#define BLOCK_BIO_PAIR 64
#define MAX_SHARED_TYPES 8
#else
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 128
#define BLOCK_BIO_PAIR 128
#define MAX_SHARED_TYPES 11
#endif
#define WARP_SIZE 32
#endif

View File

@ -1,407 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "pair_gpu_ans.h"
#define PairGPUAnsT PairGPUAns<numtyp,acctyp>
template <class numtyp, class acctyp>
PairGPUAnsT::PairGPUAns() : _allocated(false),_eflag(false),_vflag(false),
_inum(0),_ilist(NULL),_newton(false) {
}
template <class numtyp, class acctyp>
int PairGPUAnsT::bytes_per_atom() const {
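// 4 accumulators for force (x, y, z plus padding) and 7 for
// energy/virial (1 energy + 6 virial components) = 11 per atom.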
int bytes=11*sizeof(acctyp);
if (_rot)
bytes+=4*sizeof(acctyp);
if (_charge)
bytes+=sizeof(acctyp);
return bytes;
}
template <class numtyp, class acctyp>
bool PairGPUAnsT::alloc(const int inum) {
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
bool success=true;
int ans_elements=4;
if (_rot)
ans_elements+=4;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
// -------------------------- Host allocations
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
// --------------------------- Device allocations
if (cpuview) {
dev_engv.view(host_engv);
dev_ans.view(host_ans);
} else {
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_ans.alloc(ans_elements*_max_local,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
}
_gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool PairGPUAnsT::init(const int inum, const bool charge, const bool rot,
UCL_Device &devi) {
clear();
bool success=true;
_charge=charge;
_rot=rot;
_other=_charge || _rot;
dev=&devi;
_e_fields=1;
if (_charge)
_e_fields++;
_ev_fields=6+_e_fields;
// Initialize atom and nbor data
int ef_inum=inum;
if (ef_inum==0)
ef_inum=1000;
// Initialize timers for the selected device
time_answer.init(*dev);
time_answer.zero();
_time_cast=0.0;
_time_cpu_idle=0.0;
return success && alloc(ef_inum);
}
template <class numtyp, class acctyp>
bool PairGPUAnsT::add_fields(const bool charge, const bool rot) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
_e_fields++;
_ev_fields++;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int inum=_max_local;
clear_resize();
return alloc(inum);
}
return true;
}
template <class numtyp, class acctyp>
void PairGPUAnsT::clear_resize() {
if (!_allocated)
return;
_allocated=false;
dev_ans.clear();
dev_engv.clear();
host_ans.clear();
host_engv.clear();
}
template <class numtyp, class acctyp>
void PairGPUAnsT::clear() {
_gpu_bytes=0;
if (!_allocated)
return;
time_answer.clear();
clear_resize();
_inum=0;
_ilist=NULL;
_eflag=false;
_vflag=false;
}
template <class numtyp, class acctyp>
double PairGPUAnsT::host_memory_usage() const {
int atom_bytes=4;
if (_charge)
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
int ans_bytes=atom_bytes+_ev_fields;
return ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(PairGPUAns<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom) {
time_answer.start();
_eflag=eflag;
_vflag=vflag;
_ef_atom=ef_atom;
_vf_atom=vf_atom;
int csize=_ev_fields;
if (!eflag)
csize-=_e_fields;
if (!vflag)
csize-=6;
if (csize>0)
ucl_copy(host_engv,dev_engv,_inum*csize,true);
if (_rot)
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
time_answer.stop();
}
template <class numtyp, class acctyp>
void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom,
int *ilist) {
_ilist=ilist;
copy_answers(eflag,vflag,ef_atom,vf_atom);
}
template <class numtyp, class acctyp>
double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
double *virial) {
if (_eflag==false && _vflag==false)
return 0.0;
double evdwl=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
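// host_engv is laid out field-major: all energies first, then each
// virial component in turn, so ap advances by _inum to step between
// fields. The 0.5 factors below account for each pair interaction
// being accumulated once from each atom of the pair.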
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
return evdwl;
}
template <class numtyp, class acctyp>
double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
double *virial, double &ecoul) {
if (_eflag==false && _vflag==false)
return 0.0;
if (_charge==false)
return energy_virial(eatom,vatom,virial);
double evdwl=0.0;
double _ecoul=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
ecoul+=_ecoul*0.5;
return evdwl;
}
template <class numtyp, class acctyp>
void PairGPUAnsT::get_answers(double **f, double **tor) {
acctyp *ap=host_ans.begin();
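// Answers are packed 4 acctyps per atom (x, y, z plus padding), so
// after reading the z component the pointer advances by 2 to skip the
// pad and land on the next atom.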
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;
f[i][1]+=*ap;
ap++;
f[i][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
tor[i][0]+=*ap;
ap++;
tor[i][1]+=*ap;
ap++;
tor[i][2]+=*ap;
ap+=2;
}
}
} else {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap+=2;
}
}
}
}
template class PairGPUAns<PRECISION,ACC_PRECISION>;

View File

@ -1,170 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_ANS_H
#define PAIR_GPU_ANS_H
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
#include "pair_gpu_precision.h"
template <class numtyp, class acctyp>
class PairGPUAns {
public:
PairGPUAns();
~PairGPUAns() { clear(); }
/// Current number of local atoms stored
inline int inum() const { return _inum; }
/// Set number of local atoms for future copy operations
inline void inum(const int n) { _inum=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
/// Check if we have enough device storage and realloc if not
inline void resize(const int inum, bool &success) {
_inum=inum;
if (inum>_max_local) {
clear_resize();
success = success && alloc(inum);
}
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool add_fields(const bool charge, const bool rot);
/// Free all memory on host and device needed to realloc for more atoms
void clear_resize();
/// Free all memory on host and device
void clear();
/// Return the total amount of host memory used by class in bytes
double host_memory_usage() const;
/// Add copy times to timers
inline void acc_timers() {
time_answer.add_to_total();
}
/// Add copy times to timers
inline void zero_timers() {
time_answer.zero();
}
/// Return the total time for host/device data transfer
inline double transfer_time() {
return time_answer.total_seconds();
}
/// Return the total time for data cast/pack
inline double cast_time() { return _time_cast; }
/// Return number of bytes used on device
inline double gpu_bytes() { return _gpu_bytes; }
// -------------------------COPY FROM GPU -------------------------------
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom);
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom, int *ilist);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial,
double &ecoul);
/// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor);
inline double get_answers(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
double ta=MPI_Wtime();
time_answer.sync_stop();
_time_cpu_idle+=MPI_Wtime()-ta;
double ts=MPI_Wtime();
double evdw=energy_virial(eatom,vatom,virial,ecoul);
get_answers(f,tor);
_time_cast+=MPI_Wtime()-ts;
return evdw;
}
/// Return the time the CPU was idle waiting for GPU
inline double cpu_idle_time() { return _time_cpu_idle; }
// ------------------------------ DATA ----------------------------------
/// Force and possibly torque
UCL_D_Vec<acctyp> dev_ans;
/// Energy and virial per-atom storage
UCL_D_Vec<acctyp> dev_engv;
/// Force and possibly torque data on host
UCL_H_Vec<acctyp> host_ans;
/// Energy/virial data on host
UCL_H_Vec<acctyp> host_engv;
/// Device timers
UCL_Timer time_answer;
/// Geryon device
UCL_Device *dev;
private:
bool alloc(const int inum);
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_local, _inum, _e_fields, _ev_fields;
int *_ilist;
double _time_cast, _time_cpu_idle;
double _gpu_bytes;
bool _newton;
};
#endif

View File

@ -1,335 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "pair_gpu_atom.h"
#define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
#ifdef WINDLL
#include <windows.h>
typedef bool (*__win_sort_alloc)(const int max_atoms);
typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin,
int *particle_begin);
__win_sort_alloc _win_sort_alloc;
__win_sort _win_sort;
#endif
template <class numtyp, class acctyp>
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
_max_gpu_bytes(0) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;
sort_config.algorithm = CUDPP_SORT_RADIX;
sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
#ifdef WINDLL
HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
if (hinstLib == NULL) {
printf("\nUnable to load gpu.dll\n");
exit(1);
}
_win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc");
_win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort");
#endif
#endif
}
template <class numtyp, class acctyp>
int PairGPUAtomT::bytes_per_atom() const {
int id_space=0;
if (_gpu_nbor)
id_space=2;
int bytes=4*sizeof(numtyp)+id_space;
if (_rot)
bytes+=4*sizeof(numtyp);
if (_charge)
bytes+=sizeof(numtyp);
return bytes;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::alloc(const int nall) {
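// Allocate with 10% headroom so modest growth in the atom count does
// not trigger an immediate reallocation in resize().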
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
bool success=true;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
// Allocate storage for CUDPP sort
#ifndef USE_OPENCL
#ifdef WINDLL
_win_sort_alloc(_max_atoms);
#else
if (_gpu_nbor) {
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
if (CUDPP_SUCCESS != result)
return false;
}
#endif
#endif
// -------------------------- Host allocations
// Get a host write only buffer
#ifdef GPU_CAST
success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
success=success && (host_type_cast.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#else
success=success && (host_x.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#endif
// Buffer for casting only if different precisions
if (_charge)
success=success && (host_q.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// Buffer for casting only if different precisions
if (_rot)
success=success && (host_quat.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// --------------------------- Device allocations
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1);
#else
dev_x.view(host_x);
#endif
if (_rot)
dev_quat.view(host_quat);
if (_charge)
dev_q.view(host_q);
} else {
#ifdef GPU_CAST
success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
success=success && (UCL_SUCCESS==
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
#endif
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_quat.row_bytes();
}
}
if (_gpu_nbor) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
}
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::add_fields(const bool charge, const bool rot,
const bool gpu_nbor, const bool bonds) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (gpu_nbor && _gpu_nbor==false) {
_gpu_nbor=true;
realloc=true;
}
if (bonds && _bonds==false) {
_bonds=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int max_atoms=_max_atoms;
clear_resize();
return alloc(max_atoms);
}
return true;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const bool gpu_nbor,
const bool bonds) {
clear();
bool success=true;
_x_avail=false;
_q_avail=false;
_quat_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
_rot=rot;
_other=_charge || _rot;
dev=&devi;
// Initialize atom and nbor data
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
// Initialize timers for the selected device
time_pos.init(*dev);
time_q.init(*dev);
time_quat.init(*dev);
time_pos.zero();
time_q.zero();
time_quat.zero();
_time_cast=0.0;
#ifdef GPU_CAST
compile_kernels(*dev);
#endif
return success && alloc(ef_nall);
}
template <class numtyp, class acctyp>
void PairGPUAtomT::clear_resize() {
if (!_allocated)
return;
_allocated=false;
dev_x.clear();
if (_charge) {
dev_q.clear();
host_q.clear();
}
if (_rot) {
dev_quat.clear();
host_quat.clear();
}
#ifndef GPU_CAST
host_x.clear();
#else
host_x_cast.clear();
host_type_cast.clear();
#endif
dev_cell_id.clear();
dev_particle_id.clear();
dev_tag.clear();
#ifdef GPU_CAST
dev_x_cast.clear();
dev_type_cast.clear();
#endif
#ifndef USE_OPENCL
#ifndef WINDLL
if (_gpu_nbor) cudppDestroyPlan(sort_plan);
#endif
#endif
}
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
_max_gpu_bytes=0;
if (!_allocated)
return;
time_pos.clear();
time_q.clear();
time_quat.clear();
clear_resize();
#ifdef GPU_CAST
if (_compiled) {
k_cast_x.clear();
delete atom_program;
_compiled=false;
}
#endif
}
template <class numtyp, class acctyp>
double PairGPUAtomT::host_memory_usage() const {
int atom_bytes=4;
if (_charge)
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
return _max_atoms*atom_bytes*sizeof(numtyp)+
sizeof(PairGPUAtom<numtyp,acctyp>);
}
// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void PairGPUAtomT::sort_neighbor(const int num_atoms) {
#ifndef USE_OPENCL
#ifdef WINDLL
_win_sort(num_atoms,(unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin());
#else
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin(),
8*sizeof(unsigned), num_atoms);
if (CUDPP_SUCCESS != result) {
printf("Error in cudppSort\n");
NVD_GERYON_EXIT;
}
#endif
#endif
}
#ifdef GPU_CAST
#ifdef USE_OPENCL
#include "pair_gpu_atom_cl.h"
#else
#include "pair_gpu_atom_ptx.h"
#endif
template <class numtyp, class acctyp>
void PairGPUAtomT::compile_kernels(UCL_Device &dev) {
atom_program=new UCL_Program(dev);
atom_program->load_string(pair_gpu_atom_kernel,"");
k_cast_x.set_function(*atom_program,"kernel_cast_x");
_compiled=true;
}
#endif
template class PairGPUAtom<PRECISION,ACC_PRECISION>;

View File

@ -1,417 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#else
#include "cudpp.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif
#include "pair_gpu_precision.h"
template <class numtyp, class acctyp>
class PairGPUAtom {
public:
PairGPUAtom();
~PairGPUAtom() { clear(); }
/// Maximum number of atoms that can be stored with current allocation
inline int max_atoms() const { return _max_atoms; }
/// Current number of local+ghost atoms stored
inline int nall() const { return _nall; }
/// Set number of local+ghost atoms for future copy operations
inline void nall(const int n) { _nall=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) {
_nall=nall;
if (nall>_max_atoms) {
clear_resize();
success = success && alloc(nall);
_resized=true;
}
return _resized;
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
const bool bonds);
/// Returns true if GPU is using charges
bool charge() { return _charge; }
/// Returns true if GPU is using quaternions
bool quat() { return _rot; }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
/// Free all memory on host and device
void clear();
/// Return the total amount of host memory used by class in bytes
double host_memory_usage() const;
/// Sort arrays for neighbor list calculation on device
void sort_neighbor(const int num_atoms);
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
if (_charge)
time_q.add_to_total();
if (_rot)
time_quat.add_to_total();
}
/// Add copy times to timers
inline void zero_timers() {
time_pos.zero();
if (_charge)
time_q.zero();
if (_rot)
time_quat.zero();
}
/// Return the total time for host/device data transfer
/** Zeros the total so that the atom times are only included once **/
inline double transfer_time() {
double total=time_pos.total_seconds();
time_pos.zero_total();
if (_charge) {
total+=time_q.total_seconds();
time_q.zero_total();
}
if (_rot) {
total+=time_quat.total_seconds();
time_quat.zero_total();
}
return total;
}
/// Return the total time for data cast/pack
/** Zeros the time so that atom times are only included once **/
inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; }
/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>
inline void type_pack1(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one) {
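// Copy the n x n host matrix into a buffer with row stride m_size;
// the ii+=m_size-n step skips the padding entries added when the type
// count is padded up to the shared-memory type count.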
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii]=static_cast<numtyp>(one[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2>
inline void type_pack2(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*2]=static_cast<numtyp>(one[i][j]);
buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3, class t4>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three, t4 **four) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2>
inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
for (int i=0; i<n; i++) {
buffer[i*2]=static_cast<numtyp>(one[i][i]);
buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),n,*dev);
ucl_copy(dev_v,view,false);
}
// -------------------------COPY TO GPU ----------------------------------
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
/// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) {
if (_x_avail==false) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
}
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
time_pos.start();
if (_x_avail==false) {
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
_x_avail=true;
}
time_pos.stop();
}
/// Calls cast_x_data and add_x_data and times the routines
inline void cast_copy_x(double **host_ptr, int *host_type) {
cast_x_data(host_ptr,host_type);
add_x_data(host_ptr,host_type);
}
// Cast charges to write buffer
template<class cpytyp>
inline void cast_q_data(cpytyp *host_ptr) {
if (_q_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
}
// Copy charges to device asynchronously
inline void add_q_data() {
if (_q_avail==false) {
ucl_copy(dev_q,host_q,_nall,true);
_q_avail=true;
}
}
// Cast quaternions to write buffer
template<class cpytyp>
inline void cast_quat_data(cpytyp *host_ptr) {
if (_quat_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
}
// Copy quaternions to device
/** Copies nall()*4 elements **/
inline void add_quat_data() {
if (_quat_avail==false) {
ucl_copy(dev_quat,host_quat,_nall*4,true);
_quat_avail=true;
}
}
/// Return number of bytes used on device
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
// ------------------------------ DATA ----------------------------------
/// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
UCL_D_Vec<numtyp> dev_x;
/// Charges
UCL_D_Vec<numtyp> dev_q;
/// Quaternions
UCL_D_Vec<numtyp> dev_quat;
#ifdef GPU_CAST
UCL_D_Vec<double> dev_x_cast;
UCL_D_Vec<int> dev_type_cast;
UCL_H_Vec<double> host_x_cast;
UCL_H_Vec<int> host_type_cast;
#endif
/// Buffer for moving positions to device
UCL_H_Vec<numtyp> host_x;
/// Buffer for moving charge data to GPU
UCL_H_Vec<numtyp> host_q;
/// Buffer for moving quat data to GPU
UCL_H_Vec<numtyp> host_quat;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<unsigned> dev_cell_id;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<int> dev_particle_id;
/// Atom tag information for device nbor builds
UCL_D_Vec<int> dev_tag;
/// Device timers
UCL_Timer time_pos, time_q, time_quat;
/// Geryon device
UCL_Device *dev;
private:
#ifdef GPU_CAST
UCL_Program *atom_program;
UCL_Kernel k_cast_x;
void compile_kernels(UCL_Device &dev);
#endif
bool _compiled;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _resized;
bool alloc(const int nall);
bool _allocated, _rot, _charge, _other;
int _max_atoms, _nall;
bool _gpu_nbor, _bonds;
double _time_cast;
double _max_gpu_bytes;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
};
#endif

View File

@ -1,46 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp4 double4
#else
#define numtyp float
#define numtyp4 float4
#endif
__kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x,
__global int *type, const int nall) {
int ii=GLOBAL_ID_X;
if (ii<nall) {
numtyp4 xt;
xt.w=type[ii];
int i=ii*3;
xt.x=x[i];
xt.y=x[i+1];
xt.z=x[i+2];
x_type[ii]=xt;
} // if ii
}

View File

@ -1,206 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_BALANCE_H
#define PAIR_GPU_BALANCE_H
#include "pair_gpu_device.h"
#include <math.h>
#define _HD_BALANCE_EVERY 25
#define _HD_BALANCE_WEIGHT 0.5
#define _HD_BALANCE_GAP 1.10
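// Rebalance every _HD_BALANCE_EVERY timesteps (plus the first few);
// _HD_BALANCE_GAP biases the desired split 10% toward the device so
// the GPU stays saturated rather than exactly balanced.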
/// Host/device load balancer
template<class numtyp, class acctyp>
class PairGPUBalance {
public:
inline PairGPUBalance() : _init_done(false), _measure_this_step(false) {}
inline ~PairGPUBalance() { clear(); }
/// Clear any old data and setup for new LAMMPS run
inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const bool gpu_nbor,
const double split);
/// Clear all host and device data
inline void clear() {
if (_init_done) {
_device_time.clear();
_measure_this_step=false;
_init_done=false;
}
}
/// Return the timestep since initialization
inline int timestep() { return _timestep; }
/// Get a count of the number of particles host will handle for initial alloc
inline int first_host_count(const int nlocal, const double gpu_split,
const bool gpu_nbor) const {
int host_nlocal=0;
if (gpu_nbor && gpu_split!=1.0) {
if (gpu_split>0)
host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
else
host_nlocal=static_cast<int>(ceil(0.05*nlocal));
}
return host_nlocal;
}
/// Return the number of particles the device will handle this timestep
inline int get_gpu_count(const int ago, const int inum_full);
/// Return the average fraction of particles handled by device on all procs
inline double all_avg_split() {
if (_load_balance) {
double _all_avg_split=0.0;
MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0,
_device->replica());
_all_avg_split/=_device->replica_size();
return _all_avg_split/_avg_count;
} else
return _actual_split;
}
/// If CPU neighboring, allow the device fraction to increase on 2nd timestep
inline int ago_first(int ago) const
{ if (_avg_count==1 && _actual_split<_desired_split) ago=0; return ago; }
/// Start the timer for asynchronous device execution
inline void start_timer() {
if (_measure_this_step) {
_device->gpu->sync();
_device->gpu_barrier();
_device->start_host_timer();
_device_time.start();
_device->gpu->sync();
_device->gpu_barrier();
}
}
/// Stop the timer for asynchronous device execution
inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } }
/// Calculate the new host/device split based on the cpu and device times
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
(and first 10) **/
inline void balance(const double cpu_time);
/// Calls balance() and then get_gpu_count()
inline int balance(const int ago,const int inum_full,const double cpu_time) {
balance(cpu_time);
return get_gpu_count(ago,inum_full);
}
private:
PairGPUDevice<numtyp,acctyp> *_device;
UCL_Timer _device_time;
bool _init_done, _gpu_nbor;
bool _load_balance;
double _actual_split, _avg_split, _desired_split, _max_split;
int _avg_count;
bool _measure_this_step;
int _inum, _inum_full, _timestep;
};
#define PairGPUBalanceT PairGPUBalance<numtyp,acctyp>
template <class numtyp, class acctyp>
void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
const bool gpu_nbor, const double split) {
clear();
_gpu_nbor=gpu_nbor;
_init_done=true;
_device=gpu;
_device_time.init(*gpu->gpu);
if (split<0.0) {
_load_balance=true;
_desired_split=0.90;
} else {
_load_balance=false;
_desired_split=split;
}
_actual_split=_desired_split;
_avg_split=0.0;
_avg_count=0;
_timestep=0;
}
template <class numtyp, class acctyp>
int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) {
_measure_this_step=false;
if (_load_balance) {
if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
_measure_this_step=true;
_inum_full=inum_full;
}
if (ago==0) {
_actual_split=_desired_split;
_max_split=_desired_split;
}
}
_inum=static_cast<int>(floor(_actual_split*inum_full));
if (_inum==0) _inum++;
_timestep++;
return _inum;
}
template <class numtyp, class acctyp>
void PairGPUBalanceT::balance(const double cpu_time) {
if (_measure_this_step) {
_measure_this_step=false;
double gpu_time=_device_time.seconds();
double max_gpu_time;
MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
_device->gpu_comm());
if (_inum_full==_inum) {
_desired_split=1.0;
return;
}
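// Estimate how many atoms the host can process in the wall time the
// device needs for its share: divide the device time left after the
// host's other (non-pair) work by the host's time per atom.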
double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
double cpu_other_time=_device->host_time()-cpu_time;
int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
cpu_time_per_atom);
double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
_desired_split=split*_HD_BALANCE_GAP;
if (_desired_split>1.0)
_desired_split=1.0;
if (_desired_split<0.0)
_desired_split=0.0;
if (!_gpu_nbor) {
if (_desired_split<_max_split)
_actual_split=_desired_split;
else
_actual_split=_max_split;
}
//std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl;
}
_avg_split+=_desired_split;
_avg_count++;
}
#endif

View File

@ -1,300 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Peng Wang (Nvidia), penwang@nvidia.com
Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> neigh_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(neigh_tex, i);
}
#endif
#else
#define fetch_pos(i,y) x_[i]
#define BLOCK_NBOR_BUILD 64
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp4 float4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp4 float4
#endif
#define BLOCK_CELL_2D 8
#define SBBITS 30
__kernel void transpose(int *out, int *in, int columns_in, int rows_in)
{
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
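// The +1 column pad places consecutive rows on different shared-memory
// banks, avoiding bank conflicts on the transposed reads below.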
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
unsigned bi=BLOCK_ID_X;
unsigned bj=BLOCK_ID_Y;
unsigned i=bi*BLOCK_CELL_2D+ti;
unsigned j=bj*BLOCK_CELL_2D+tj;
if ((i<columns_in) && (j<rows_in))
block[tj][ti]=in[j*columns_in+i];
__syncthreads();
i=bj*BLOCK_CELL_2D+ti;
j=bi*BLOCK_CELL_2D+tj;
if ((i<rows_in) && (j<columns_in))
out[j*rows_in+i] = block[ti][tj];
}
__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
numtyp boxlo0,
numtyp boxlo1, numtyp boxlo2, numtyp boxhi0,
numtyp boxhi1, numtyp boxhi2, numtyp cell_size,
int ncellx, int ncelly, int nall) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < nall) {
numtyp4 p = fetch_pos(i,pos); //pos[i];
p.x -= boxlo0;
p.y -= boxlo1;
p.z -= boxlo2;
p.x = fmaxf(p.x, -cell_size);
p.x = fminf(p.x, boxhi0-boxlo0+cell_size);
p.y = fmaxf(p.y, -cell_size);
p.y = fminf(p.y, boxhi1-boxlo1+cell_size);
p.z = fmaxf(p.z, -cell_size);
p.z = fminf(p.z, boxhi2-boxlo2+cell_size);
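// Positions are clamped to one cell beyond the box so ghost atoms fall
// in the boundary layer of cells; the +1.0 shift below reserves index
// 0 in each dimension for that ghost layer.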
unsigned int id = (unsigned int)(p.x/cell_size + 1.0)
+ (unsigned int)(p.y/cell_size + 1.0) * ncellx
+ (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly;
cell_id[i] = id;
particle_id[i] = i;
}
}
__kernel void kernel_calc_cell_counts(unsigned *cell_id,
int *cell_counts, int nall, int ncell) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
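// cell_id is sorted, so each thread checks whether its id differs from
// its left neighbor and, if so, records idx as the starting offset for
// every cell index in between; the first and last threads fill the
// leading and trailing boundary entries.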
if (idx < nall) {
int id = cell_id[idx];
// handle boundary cases
if (idx == 0) {
for (int i = 0; i < id + 1; i++)
cell_counts[i] = 0;
}
if (idx == nall - 1) {
for (int i = id+1; i <= ncell; i++)
cell_counts[i] = nall;
}
if (idx > 0 && idx < nall) {
int id_l = cell_id[idx-1];
if (id != id_l) {
for (int i = id_l+1; i <= id; i++)
cell_counts[i] = idx;
}
}
}
}
__kernel void calc_neigh_list_cell(numtyp4 *pos,
int *cell_particle_id,
int *cell_counts,
int *nbor_list,
int *host_nbor_list,
int *host_numj,
int neigh_bin_size,
numtyp cell_size,
int ncellx, int ncelly, int ncellz,
int inum, int nt, int nall)
{
int tid = threadIdx.x;
int ix = blockIdx.x;
int iy = blockIdx.y % ncelly;
int iz = blockIdx.y / ncelly;
int icell = ix + iy*ncellx + iz*ncellx*ncelly;
__shared__ int cell_list_sh[BLOCK_NBOR_BUILD];
__shared__ numtyp4 pos_sh[BLOCK_NBOR_BUILD];
int icell_begin = cell_counts[icell];
int icell_end = cell_counts[icell+1];
int nborz0 = max(iz-1,0), nborz1 = min(iz+1, ncellz-1),
nbory0 = max(iy-1,0), nbory1 = min(iy+1, ncelly-1),
nborx0 = max(ix-1,0), nborx1 = min(ix+1, ncellx-1);
numtyp4 diff;
numtyp r2;
for (int ii = 0; ii < ceil((numtyp)(icell_end - icell_begin)/blockDim.x); ii++) {
int i = icell_begin + tid + ii*blockDim.x;
int pid_i = nall, pid_j, stride;
numtyp4 atom_i, atom_j;
int cnt = 0;
int *neigh_counts, *neigh_list;
if (i < icell_end)
pid_i = cell_particle_id[i];
if (pid_i < nt) {
atom_i = fetch_pos(pid_i,pos); //pos[pid_i];
}
if (pid_i < inum) {
stride=inum;
neigh_counts=nbor_list+stride+pid_i;
neigh_list=neigh_counts+stride;
nbor_list[pid_i]=pid_i;
} else {
stride=1;
neigh_counts=host_numj+pid_i-inum;
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
}
// loop through neighbors
for (int nborz = nborz0; nborz <= nborz1; nborz++) {
for (int nbory = nbory0; nbory <= nbory1; nbory++) {
for (int nborx = nborx0; nborx <= nborx1; nborx++) {
int jcell = nborx + nbory*ncellx + nborz*ncellx*ncelly;
int jcell_begin = cell_counts[jcell];
int jcell_end = cell_counts[jcell+1];
int num_atom_cell = jcell_end - jcell_begin;
// load jcell to shared memory
int num_iter = (int)ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
for (int k = 0; k < num_iter; k++) {
int end_idx = min(BLOCK_NBOR_BUILD, num_atom_cell-k*BLOCK_NBOR_BUILD);
if (tid < end_idx) {
pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
cell_list_sh[tid] = pid_j;
atom_j = fetch_pos(pid_j,pos); //[pid_j];
pos_sh[tid].x = atom_j.x;
pos_sh[tid].y = atom_j.y;
pos_sh[tid].z = atom_j.z;
}
__syncthreads();
if (pid_i < nt) {
for (int j = 0; j < end_idx; j++) {
int pid_j = cell_list_sh[j]; // gather from shared memory
diff.x = atom_i.x - pos_sh[j].x;
diff.y = atom_i.y - pos_sh[j].y;
diff.z = atom_i.z - pos_sh[j].z;
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
if (r2 < cell_size*cell_size && r2 > 1e-5) {
if (cnt < neigh_bin_size) {
*neigh_list = pid_j;
neigh_list+=stride;
}
cnt++;
}
}
}
__syncthreads();
} // for (k)
}
}
}
if (pid_i < nt)
*neigh_counts = cnt;
} // for (i)
}
__kernel void kernel_special(__global int *dev_nbor,
__global int *host_nbor_list,
__global int *host_numj, __global int *tag,
__global int *nspecial, __global int *special,
int inum, int nt, int max_nbors) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
if (ii<nt) {
int stride;
__global int *list, *list_end;
int n1=nspecial[ii*3];
int n2=nspecial[ii*3+1];
int n3=nspecial[ii*3+2];
int numj;
if (ii < inum) {
stride=inum;
list=dev_nbor+stride+ii;
numj=*list;
list+=stride;
} else {
stride=1;
list=host_nbor_list+(ii-inum)*max_nbors;
numj=host_numj[ii-inum];
}
list_end=list+numj*stride;
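// Each neighbor whose tag matches a special-bond partner gets its 1-2,
// 1-3, or 1-4 class (which = 1, 2, or 3) packed into the top bits of
// the stored index via SBBITS; pair kernels later unpack it with
// sbmask() to apply the special_lj scaling.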
for ( ; list<list_end; list+=stride) {
int nbor=*list;
int jtag=tag[nbor];
int offset=ii;
for (int i=0; i<n3; i++) {
if (special[offset]==jtag) {
int which = 1;
if (i>=n1)
which++;
if (i>=n2)
which++;
nbor=nbor ^ (which << SBBITS);
*list=nbor;
}
offset+=nt;
}
}
} // if ii
}

View File

@ -1,120 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
/*************************************************************************
Preprocessor Definitions
Note: It is assumed that constants with the same names are defined with
the same values in all files.
ARCH
Definition: Architecture number for accelerator
MEM_THREADS
Definition: Number of threads with sequential ids accessing memory
simultaneously on a multiprocessor
WARP_SIZE
Definition: Number of threads guaranteed to be on the same instruction
THREADS_PER_ATOM
Definition: Default number of threads assigned per atom for pair styles
Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE
THREADS_PER_CHARGE
Definition: Default number of threads assigned per atom for pair styles
with charge
Restrictions: Must be power of 2; THREADS_PER_CHARGE<=WARP_SIZE
PPPM_MAX_SPLINE
Definition: Maximum order for splines in PPPM
PPPM_BLOCK_1D
Definition: Thread block size for PPPM kernels
Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
PPPM_BLOCK_1D%32==0
BLOCK_PAIR
Definition: Default thread block size for pair styles
Restrictions:
MAX_SHARED_TYPES
Definition: Max number of atom type params that can be stored in shared memory
Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR
BLOCK_CELL_2D
Definition: Default block size in each dimension for cell list builds
and matrix transpose
BLOCK_CELL_ID
Definition: Default block size for binning atoms in cell list builds
BLOCK_NBOR_BUILD
Definition: Default block size for neighbor list builds
BLOCK_BIO_PAIR
Definition: Default thread block size for "bio" pair styles
MAX_BIO_SHARED_TYPES
Definition: Max number of atom type params that can be stored in shared memory
Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 &&
MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR
*************************************************************************/
#ifndef PAIR_GPU_DEV_KERNEL
#define PAIR_GPU_DEV_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#define GLOBAL_ID_X get_global_id(0)
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 16
#define WARP_SIZE 1
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#endif
#define PPPM_MAX_SPLINE 8
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
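/* Hedged compile-time sanity checks (not in the original source): these
   mirror the restrictions documented above; flip the "#if 0" to enable. */
#if 0
#if PPPM_BLOCK_1D < PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
#error "PPPM_BLOCK_1D must be >= PPPM_MAX_SPLINE*PPPM_MAX_SPLINE"
#endif
#if MAX_SHARED_TYPES*MAX_SHARED_TYPES > BLOCK_PAIR
#error "MAX_SHARED_TYPES*MAX_SHARED_TYPES must be <= BLOCK_PAIR"
#endif
#if MAX_BIO_SHARED_TYPES > BLOCK_BIO_PAIR*2 || MAX_BIO_SHARED_TYPES < BLOCK_BIO_PAIR
#error "MAX_BIO_SHARED_TYPES must lie in [BLOCK_BIO_PAIR, 2*BLOCK_BIO_PAIR]"
#endif
#endif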
__kernel void kernel_zero(__global int *mem, int numel) {
int ii=GLOBAL_ID_X;
if (ii<numel)
mem[ii]=0;
}
__kernel void kernel_info(__global int *info) {
info[0]=ARCH;
info[1]=MEM_THREADS;
info[2]=WARP_SIZE;
info[3]=THREADS_PER_ATOM;
info[4]=PPPM_MAX_SPLINE;
info[5]=PPPM_BLOCK_1D;
info[6]=BLOCK_PAIR;
info[7]=MAX_SHARED_TYPES;
info[8]=BLOCK_CELL_2D;
info[9]=BLOCK_CELL_ID;
info[10]=BLOCK_NBOR_BUILD;
info[11]=BLOCK_BIO_PAIR;
info[12]=MAX_BIO_SHARED_TYPES;
info[13]=THREADS_PER_CHARGE;
}
#endif

View File

@ -1,614 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "pair_gpu_device.h"
#include "pair_gpu_precision.h"
#include <map>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_OPENCL
#include "pair_gpu_dev_cl.h"
#else
#include "pair_gpu_dev_ptx.h"
#endif
#define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
template <class numtyp, class acctyp>
PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
_gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0), _compiled(false) {
}
template <class numtyp, class acctyp>
PairGPUDeviceT::~PairGPUDevice() {
clear_device();
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads, const int t_per_atom) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
#endif
_threads_per_atom=t_per_atom;
_threads_per_charge=t_per_atom;
if (_device_init)
return 0;
_device_init=true;
_comm_world=world;
_comm_replica=replica;
_first_device=first_gpu;
_last_device=last_gpu;
_gpu_mode=gpu_mode;
_particle_split=p_split;
// Get the rank/size within the world
MPI_Comm_rank(_comm_world,&_world_me);
MPI_Comm_size(_comm_world,&_world_size);
// Get the rank/size within the replica
MPI_Comm_rank(_comm_replica,&_replica_me);
MPI_Comm_size(_comm_replica,&_replica_size);
// Get the names of all nodes
int name_length;
char node_name[MPI_MAX_PROCESSOR_NAME];
char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
MPI_Get_processor_name(node_name,&name_length);
MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
std::string node_string=std::string(node_name);
// Get the number of procs per node
std::map<std::string,int> name_map;
std::map<std::string,int>::iterator np;
for (int i=0; i<_world_size; i++) {
std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
np=name_map.find(i_string);
if (np==name_map.end())
name_map[i_string]=1;
else
np->second++;
}
int procs_per_node=name_map.begin()->second;
// Assign a unique id to each node
int split_num=0, split_id=0;
for (np=name_map.begin(); np!=name_map.end(); ++np) {
if (np->first==node_string)
split_id=split_num;
split_num++;
}
// Set up a per node communicator and find rank within
MPI_Comm node_comm;
MPI_Comm_split(_comm_world, split_id, 0, &node_comm);
int node_rank;
MPI_Comm_rank(node_comm,&node_rank);
// set the device ID
_procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
(last_gpu-first_gpu+1)));
int my_gpu=node_rank/_procs_per_gpu+first_gpu;
// Time on the device only if 1 proc per gpu
_time_device=true;
if (_procs_per_gpu>1)
_time_device=false;
// Set up a per device communicator
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
MPI_Comm_rank(_comm_gpu,&_gpu_rank);
gpu=new UCL_Device();
if (my_gpu>=gpu->num_devices())
return -2;
gpu->set(my_gpu);
_long_range_precompute=0;
int flag=compile_kernels();
return flag;
}
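// Worked assignment example (illustrative): with 8 procs on a node and
// devices 0-1 requested (first_gpu=0, last_gpu=1), _procs_per_gpu =
// ceil(8/2) = 4, so node ranks 0-3 share device 0 and ranks 4-7 share
// device 1; device timing is then disabled because _procs_per_gpu > 1.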
template <class numtyp, class acctyp>
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
// Counts of data transfers for timing overhead estimates
_data_in_estimate=0;
_data_out_estimate=1;
// Initial number of local particles
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);
bool gpu_nbor=false;
if (_gpu_mode==GPU_NEIGH)
gpu_nbor=true;
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
return -3;
_data_in_estimate++;
if (charge)
_data_in_estimate++;
if (rot)
_data_in_estimate++;
} else {
if (atom.charge()==false && charge)
_data_in_estimate++;
if (atom.quat()==false && rot)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial))
return -3;
}
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return -3;
if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build))
return -3;
nbor->cell_size(cell_size);
_init_count++;
return 0;
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
const int nall) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,true,false,*gpu,false,false))
return -3;
} else
if (!atom.add_fields(true,false,false,false))
return -3;
if (!ans.init(nlocal,true,false,*gpu))
return -3;
_init_count++;
return 0;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::set_single_precompute
(PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm) {
_long_range_precompute=1;
pppm_single=pppm;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::set_double_precompute
(PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm) {
_long_range_precompute=2;
pppm_double=pppm;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu) {
#ifdef USE_OPENCL
std::string fs="";
#else
std::string fs=toa(gpu->free_gigabytes())+"/";
#endif
if (_replica_me == 0 && screen) {
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"-------------------------------------\n");
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
#ifdef _OPENMP
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
#endif
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n");
int last=last_gpu+1;
if (last>gpu->num_devices())
last=gpu->num_devices();
for (int i=first_gpu; i<last; i++) {
std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
" GHZ (";
if (sizeof(PRECISION)==4) {
if (sizeof(ACC_PRECISION)==4)
sname+="Single Precision)";
else
sname+="Mixed Precision)";
} else
sname+="Double Precision)";
fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
}
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls,
double &gpu_overhead,
double &gpu_driver_overhead) {
UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
UCL_Timer over_timer(*gpu);
if (_data_in_estimate>0) {
host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
timers_in=new UCL_Timer[_data_in_estimate];
}
if (_data_out_estimate>0) {
host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
timers_out=new UCL_Timer[_data_out_estimate];
}
if (kernel_calls>0) {
kernel_data=new UCL_D_Vec<int>[kernel_calls];
timers_kernel=new UCL_Timer[kernel_calls];
}
for (int i=0; i<_data_in_estimate; i++) {
host_data_in[i].alloc(1,*gpu);
dev_data_in[i].alloc(1,*gpu);
timers_in[i].init(*gpu);
}
for (int i=0; i<_data_out_estimate; i++) {
host_data_out[i].alloc(1,*gpu);
dev_data_out[i].alloc(1,*gpu);
timers_out[i].init(*gpu);
}
for (int i=0; i<kernel_calls; i++) {
kernel_data[i].alloc(1,*gpu);
timers_kernel[i].init(*gpu);
}
gpu_overhead=0.0;
gpu_driver_overhead=0.0;
for (int i=0; i<10; i++) {
gpu->sync();
gpu_barrier();
over_timer.start();
gpu->sync();
gpu_barrier();
double driver_time=MPI_Wtime();
for (int i=0; i<_data_in_estimate; i++) {
timers_in[i].start();
ucl_copy(dev_data_in[i],host_data_in[i],true);
timers_in[i].stop();
}
for (int i=0; i<kernel_calls; i++) {
timers_kernel[i].start();
zero(kernel_data[i],1);
timers_kernel[i].stop();
}
for (int i=0; i<_data_out_estimate; i++) {
timers_out[i].start();
ucl_copy(host_data_out[i],dev_data_out[i],true);
timers_out[i].stop();
}
over_timer.stop();
double time=over_timer.seconds();
driver_time=MPI_Wtime()-driver_time;
if (time_device()) {
for (int i=0; i<_data_in_estimate; i++)
timers_in[i].add_to_total();
for (int i=0; i<kernel_calls; i++)
timers_kernel[i].add_to_total();
for (int i=0; i<_data_out_estimate; i++)
timers_out[i].add_to_total();
}
double mpi_time, mpi_driver_time;
MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
gpu_overhead+=mpi_time;
gpu_driver_overhead+=mpi_driver_time;
}
gpu_overhead/=10.0;
gpu_driver_overhead/=10.0;
if (_data_in_estimate>0) {
delete [] host_data_in;
delete [] dev_data_in;
delete [] timers_in;
}
if (_data_out_estimate>0) {
delete [] host_data_out;
delete [] dev_data_out;
delete [] timers_out;
}
if (kernel_calls>0) {
delete [] kernel_data;
delete [] timers_kernel;
}
}
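// Estimation note (illustrative): each of the 10 trials times a full
// round trip (async host-to-device copies, one zero kernel per expected
// kernel call, device-to-host copies); the MPI_MAX reduction over the
// procs sharing a device lets the slowest proc define the per-step
// overhead, and the final division by 10.0 reports the trial average.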
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes,
const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen) {
double single[8], times[8];
single[0]=atom.transfer_time()+ans.transfer_time();
single[1]=nbor.time_nbor.total_seconds();
single[2]=nbor.time_kernel.total_seconds();
single[3]=time_pair.total_seconds();
single[4]=atom.cast_time()+ans.cast_time();
single[5]=gpu_overhead;
single[6]=driver_overhead;
single[7]=ans.cpu_idle_time();
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[5]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
if (nbor.gpu_nbor())
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
UCL_Timer &time_out,
UCL_Timer &time_map,
UCL_Timer &time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes,
const double cpu_time,
const double idle_time, FILE *screen) {
double single[8], times[8];
single[0]=time_out.total_seconds();
single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time();
single[2]=time_map.total_seconds();
single[3]=time_rho.total_seconds();
single[4]=time_interp.total_seconds();
single[5]=ans.transfer_time()+ans.cast_time();
single[6]=cpu_time;
single[7]=idle_time;
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size);
fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Total rho: %.4f s.\n",
(times[0]+times[2]+times[3])/_replica_size);
fprintf(screen,"Total interp: %.4f s.\n",
(times[1]+times[4])/_replica_size);
fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Total: %.4f s.\n",
(times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
_replica_size);
}
fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::clear() {
if (_init_count>0) {
_long_range_precompute=0;
_init_count--;
if (_init_count==0) {
atom.clear();
_nbor_shared.clear();
if (_compiled) {
k_zero.clear();
k_info.clear();
delete dev_program;
_compiled=false;
}
}
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::clear_device() {
while (_init_count>0)
clear();
if (_device_init) {
delete gpu;
_device_init=false;
}
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::compile_kernels() {
int flag=0;
if (_compiled)
return flag;
std::string flags="-cl-mad-enable";
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str());
if (success!=UCL_SUCCESS)
return -4;
k_zero.set_function(*dev_program,"kernel_zero");
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;
UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
k_info.set_size(1,1);
k_info.run(&d_gpu_lib_data.begin());
ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
_ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
if (_ptx_arch>gpu->arch())
return -4;
#endif
_num_mem_threads=h_gpu_lib_data[1];
_warp_size=h_gpu_lib_data[2];
if (_threads_per_atom<1)
_threads_per_atom=h_gpu_lib_data[3];
if (_threads_per_charge<1)
_threads_per_charge=h_gpu_lib_data[13];
_pppm_max_spline=h_gpu_lib_data[4];
_pppm_block=h_gpu_lib_data[5];
_block_pair=h_gpu_lib_data[6];
_max_shared_types=h_gpu_lib_data[7];
_block_cell_2d=h_gpu_lib_data[8];
_block_cell_id=h_gpu_lib_data[9];
_block_nbor_build=h_gpu_lib_data[10];
_block_bio_pair=h_gpu_lib_data[11];
_max_bio_shared_types=h_gpu_lib_data[12];
if (static_cast<size_t>(_block_pair)>gpu->group_size())
_block_pair=gpu->group_size();
if (static_cast<size_t>(_block_bio_pair)>gpu->group_size())
_block_bio_pair=gpu->group_size();
if (_threads_per_atom>_warp_size)
_threads_per_atom=_warp_size;
if (_warp_size%_threads_per_atom!=0)
_threads_per_atom=1;
if (_threads_per_charge>_warp_size)
_threads_per_charge=_warp_size;
if (_warp_size%_threads_per_charge!=0)
_threads_per_charge=1;
return flag;
}
template <class numtyp, class acctyp>
double PairGPUDeviceT::host_memory_usage() const {
return atom.host_memory_usage()+4*sizeof(numtyp)+
sizeof(PairGPUDevice<numtyp,acctyp>);
}
template class PairGPUDevice<PRECISION,ACC_PRECISION>;
PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom) {
return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads,t_per_atom);
}
void lmp_clear_device() {
pair_gpu_device.clear_device();
}
double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}
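/* Hedged usage sketch (assumed caller, not part of this file): a LAMMPS
   fix would drive these C bindings roughly as

     int err = lmp_init_device(world, replica, 0, ngpu-1, gpu_mode,
                               1.0, nthreads, 1);
     if (err != 0) ...           // map the error code to a message
     ...                         // per-step pair computations
     double evdwl = lmp_gpu_forces(f, torque, eatom, vatom, virial, ecoul);
     lmp_clear_device();

   where world, replica, ngpu, gpu_mode, nthreads, and the force arrays
   stand in for the real fix-gpu plumbing. */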

View File

@ -1,311 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_DEVICE_H
#define PAIR_GPU_DEVICE_H
#include "pair_gpu_atom.h"
#include "pair_gpu_ans.h"
#include "pair_gpu_nbor.h"
#include "pppm_gpu_memory.h"
#include "mpi.h"
#include <sstream>
#include "stdio.h"
#include <string>
#include <queue>
template <class numtyp, class acctyp,
class grdtyp, class grdtyp4> class PPPMGPUMemory;
template <class numtyp, class acctyp>
class PairGPUDevice {
public:
PairGPUDevice();
~PairGPUDevice();
/// Initialize the device for use by this process
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successful
* - -2 if GPU not found
* - -4 if the GPU library was not compiled for this GPU **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
* \param nlocal Total number of local particles to allocate memory for
* \param host_nlocal Initial number of host particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param gpu_nbor True if neighboring is performed on device
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);
/// Perform charge assignment asynchronously for PPPM
void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
float,_lgpu_float4> *pppm);
/// Perform charge assignment asynchronously for PPPM
void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
double,_lgpu_double4> *pppm);
/// Estimate the overhead from GPU calls from multiple procs
/** \param kernel_calls Number of kernel calls/timestep for timing estimated
* overhead
* \param gpu_overhead Estimated gpu overhead per timestep (sec)
* \param driver_overhead Estimated overhead from driver per timestep (s) **/
void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
double &gpu_driver_overhead);
/// Returns true if double precision is supported on card
inline bool double_precision() { return gpu->double_precision(); }
/// Output a message with timing information
void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen);
/// Output a message with timing information
void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
UCL_Timer & time_map, UCL_Timer & time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes, const double cpu_time,
const double cpu_idle_time, FILE *screen);
/// Clear all memory on host and device associated with atom and nbor data
void clear();
/// Clear all memory on host and device
void clear_device();
/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
{ ans_queue.push(ans); }
/// Add "answers" (force,energies,etc.) into LAMMPS structures
inline double fix_gpu(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
atom.data_unavail();
if (ans_queue.empty()==false) {
stop_host_timer();
double evdw=0.0;
while (ans_queue.empty()==false) {
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
ans_queue.pop();
}
return evdw;
}
return 0.0;
}
/// Start timer on host
inline void start_host_timer()
{ _cpu_full=MPI_Wtime(); _host_timer_started=true; }
/// Stop timer on host
inline void stop_host_timer() {
if (_host_timer_started) {
_cpu_full=MPI_Wtime()-_cpu_full;
_host_timer_started=false;
}
}
/// Return host time
inline double host_time() { return _cpu_full; }
/// Return host memory usage in bytes
double host_memory_usage() const;
/// Return the number of procs sharing a device (size of device communicator)
inline int procs_per_gpu() const { return _procs_per_gpu; }
/// Return the number of threads per proc
inline int num_threads() const { return _nthreads; }
/// My rank within all processes
inline int world_me() const { return _world_me; }
/// Total number of processes
inline int world_size() const { return _world_size; }
/// MPI Barrier for world
inline void world_barrier() { MPI_Barrier(_comm_world); }
/// Return the replica MPI communicator
inline MPI_Comm & replica() { return _comm_replica; }
/// My rank within replica communicator
inline int replica_me() const { return _replica_me; }
/// Number of procs in replica communicator
inline int replica_size() const { return _replica_size; }
/// Return the per-GPU MPI communicator
inline MPI_Comm & gpu_comm() { return _comm_gpu; }
/// Return my rank in the device communicator
inline int gpu_rank() const { return _gpu_rank; }
/// MPI Barrier for gpu
inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
/// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
inline int gpu_mode() const { return _gpu_mode; }
/// Index of first device used by a node
inline int first_device() const { return _first_device; }
/// Index of last device used by a node
inline int last_device() const { return _last_device; }
/// Particle split defined in fix
inline double particle_split() const { return _particle_split; }
/// Return the initialization count for the device
inline int init_count() const { return _init_count; }
/// True if device is being timed
inline bool time_device() const { return _time_device; }
/// Return the number of threads accessing memory simultaneously
inline int num_mem_threads() const { return _num_mem_threads; }
/// Return the number of threads per atom for pair styles
inline int threads_per_atom() const { return _threads_per_atom; }
/// Return the number of threads per atom for pair styles using charge
inline int threads_per_charge() const { return _threads_per_charge; }
/// Return the min of the pair block size or the device max block size
inline int pair_block_size() const { return _block_pair; }
/// Return the maximum number of atom types that can be used with shared mem
inline int max_shared_types() const { return _max_shared_types; }
/// Return the maximum order for PPPM splines
inline int pppm_max_spline() const { return _pppm_max_spline; }
/// Return the block size for PPPM kernels
inline int pppm_block() const { return _pppm_block; }
/// Return the block size for neighbor binning
inline int block_cell_2d() const { return _block_cell_2d; }
/// Return the block size for atom mapping for neighbor builds
inline int block_cell_id() const { return _block_cell_id; }
/// Return the block size for neighbor build kernel
inline int block_nbor_build() const { return _block_nbor_build; }
/// Return the block size for "bio" pair styles
inline int block_bio_pair() const { return _block_bio_pair; }
/// Return the maximum number of atom types for shared mem with "bio" styles
inline int max_bio_shared_types() const { return _max_bio_shared_types; }
/// Architecture gpu code compiled for (returns 0 for OpenCL)
inline double ptx_arch() const { return _ptx_arch; }
// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
void zero(UCL_D_Vec<int> &mem, const int numel) {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
k_zero.set_size(num_blocks,_block_pair);
k_zero.run(&mem.begin(),&numel);
}
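// Launch-geometry example (illustrative): zeroing numel=1000 ints with
// _block_pair=64 gives ceil(1000/64)=16 blocks of 64 threads; the
// ii<numel guard in kernel_zero discards the 24 surplus threads in the
// last block.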
// -------------------------- DEVICE DATA -------------------------
/// Geryon Device
UCL_Device *gpu;
enum{GPU_FORCE, GPU_NEIGH};
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> atom;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor Data
PairGPUNborShared _nbor_shared;
// ------------------------ LONG RANGE DATA -------------------------
// Long Range Data
int _long_range_precompute;
PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
/// Precomputations for long range charge assignment (asynchronously)
inline void precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *charge, double *boxlo, double *prd) {
if (_long_range_precompute==1)
pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
else if (_long_range_precompute==2)
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
}
private:
std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
int _init_count;
bool _device_init, _host_timer_started, _time_device;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size;
int _gpu_mode, _first_device, _last_device, _nthreads;
double _particle_split;
double _cpu_full;
double _ptx_arch;
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;
int _block_pair, _max_shared_types;
int _block_cell_2d, _block_cell_id, _block_nbor_build;
int _block_bio_pair, _max_bio_shared_types;
UCL_Program *dev_program;
UCL_Kernel k_zero, k_info;
bool _compiled;
int compile_kernels();
int _data_in_estimate, _data_out_estimate;
template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;
o.precision(2);
o << in;
return o.str();
}
};
#endif

View File

@ -1,406 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
Peng Wang (Nvidia), penwang@nvidia.com
------------------------------------------------------------------------- */
#include "pair_gpu_precision.h"
#include "pair_gpu_nbor.h"
#include "pair_gpu_device.h"
#include "math.h"
int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
if (_gpu_nbor)
return (max_nbors+2)*sizeof(int);
else if (_use_packing)
return ((max_nbors+2)*2)*sizeof(int);
else
return (max_nbors+3)*sizeof(int);
}
bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum,
const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &devi,
const bool gpu_nbor, const int gpu_host,
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build) {
clear();
_block_cell_2d=block_cell_2d;
_block_cell_id=block_cell_id;
_block_nbor_build=block_nbor_build;
_shared=shared;
dev=&devi;
_gpu_nbor=gpu_nbor;
if (gpu_host==0)
_gpu_host=false;
else if (gpu_host==1)
_gpu_host=true;
else
// Not yet implemented
assert(0==1);
if (pre_cut || gpu_nbor==false)
_alloc_packed=true;
else
_alloc_packed=false;
bool success=true;
// Initialize timers for the selected GPU
time_nbor.init(*dev);
time_kernel.init(*dev);
time_nbor.zero();
time_kernel.zero();
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
if (_max_atoms==0)
_max_atoms=1000;
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
_max_nbors=max_nbors;
_maxspecial=maxspecial;
if (gpu_nbor==false)
_maxspecial=0;
if (gpu_nbor==false)
success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
alloc(success);
if (!success)
return false;
if (_use_packing==false)
_shared->compile_kernels(devi,gpu_nbor);
return success;
}
void PairGPUNbor::alloc(bool &success) {
dev_nbor.clear();
host_acc.clear();
int nt=_max_atoms+_max_host;
if (_use_packing==false || _gpu_nbor)
success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
else
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc(nt*2,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
_c_bytes=dev_nbor.row_bytes();
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes();
}
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_host_numj.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
for (int i=0; i<nt; i++)
host_ilist[i]=i;
success=success && (host_jlist.alloc(_max_host,*dev,
UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
}
if (_maxspecial>0) {
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
int at=_max_atoms+_max_host;
success=success && (dev_nspecial.alloc(3*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_special.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
dev_special_t.row_bytes();
}
_allocated=true;
}
void PairGPUNbor::clear() {
_gpu_bytes=0.0;
_cell_bytes=0.0;
_c_bytes=0.0;
if (_allocated) {
_allocated=false;
host_packed.clear();
host_acc.clear();
dev_nbor.clear();
dev_host_nbor.clear();
dev_packed.clear();
host_nbor.clear();
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
time_kernel.clear();
time_nbor.clear();
}
}
double PairGPUNbor::host_memory_usage() const {
if (_gpu_nbor) {
if (_gpu_host)
return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
host_jlist.row_bytes();
else
return 0;
} else
return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+
sizeof(PairGPUNbor);
}
void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size) {
time_nbor.start();
UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,false);
UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;
int copy_count=0;
int ij_count=0;
int acc_count=0;
int dev_count=0;
int *h_ptr=host_packed.begin();
_nbor_pitch=inum;
for (int ii=0; ii<inum; ii++) {
int i=ilist[ii];
int nj=numj[i];
host_acc[ii]=nj;
host_acc[ii+inum]=acc_count;
acc_count+=nj;
int *jlist=firstneigh[i];
for (int jj=0; jj<nj; jj++) {
*h_ptr=jlist[jj];
h_ptr++;
ij_count++;
if (ij_count==IJ_SIZE) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
ucl_copy(nbor_offset,host_offset,true);
copy_count++;
ij_count=0;
dev_count+=IJ_SIZE;
h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
}
}
}
if (ij_count!=0) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
nbor_offset.view_offset(dev_count,dev_packed,ij_count);
ucl_copy(nbor_offset,host_offset,true);
}
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
time_nbor.stop();
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)/block_size));
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
time_kernel.stop();
}
}
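/* Double-buffering note (illustrative): host_packed was allocated with
   2*IJ_SIZE elements, and copy_count%2 alternates between its halves, so
   the asynchronous ucl_copy of one full half can overlap the CPU packing
   of the other; the partial copy above flushes whatever remains. */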
template <class numtyp, class acctyp>
void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
const int nall,
PairGPUAtom<numtyp,acctyp> &atom,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, bool &success,
int &mn) {
const int nt=inum+host_inum;
if (_maxspecial>0) {
time_nbor.start();
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
view_nspecial.view(nspecial[0],nt*3,*dev);
view_special.view(special[0],nt*_maxspecial,*dev);
view_tag.view(tag,nall,*dev);
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
ucl_copy(atom.dev_tag,view_tag,nall,false);
time_nbor.stop();
time_nbor.add_to_total();
time_kernel.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
} else
time_kernel.start();
_nbor_pitch=inum;
_shared->neigh_tex.bind_float(atom.dev_x,4);
int ncellx, ncelly, ncellz, ncell_3d;
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
2.0*_cell_size)/_cell_size));
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
2.0*_cell_size)/_cell_size));
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
2.0*_cell_size)/_cell_size));
ncell_3d = ncellx * ncelly * ncellz;
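// Worked example (illustrative): for a subdomain 10.0 wide with
// _cell_size=2.5, ncellx = ceil((10.0 + 2*2.5)/2.5) = 6, i.e. four
// interior cells plus one ghost-cell layer on each side.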
UCL_D_Vec<int> cell_counts;
cell_counts.alloc(ncell_3d+1,dev_nbor);
_cell_bytes=cell_counts.row_bytes();
/* build cell list on GPU */
const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block);
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
const numtyp subhi0=static_cast<numtyp>(subhi[0]);
const numtyp subhi1=static_cast<numtyp>(subhi[1]);
const numtyp subhi2=static_cast<numtyp>(subhi[2]);
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
&subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
atom.sort_neighbor(nall);
/* calculate cell count */
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(),
&nall, &ncell_3d);
/* build the neighbor list */
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
_shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &dev_host_numj.begin(),
&_max_nbors,&cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall);
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;
numj.view_offset(inum,dev_nbor,inum);
ucl_copy(host_acc,numj,inum,false);
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_host_numj,nt-inum,false);
}
mn=host_acc[0];
for (int i=1; i<nt; i++)
mn=std::max(mn,host_acc[i]);
if (mn>_max_nbors) {
mn=static_cast<int>(static_cast<double>(mn)*1.10);
dev_nbor.clear();
success=success && (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_cell_id,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes=dev_nbor.row_bytes();
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(mn*_max_host,
dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
}
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_packed.row_bytes();
}
if (!success)
return;
_max_nbors=mn;
time_kernel.stop();
time_kernel.add_to_total();
build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
special, success, mn);
return;
}
if (_maxspecial>0) {
const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&dev_host_numj.begin(), &atom.dev_tag.begin(),
&dev_nspecial.begin(), &dev_special.begin(),
&inum, &nt, &_max_nbors);
}
time_kernel.stop();
time_nbor.start();
if (_gpu_host)
ucl_copy(host_nbor,dev_host_nbor,false);
time_nbor.stop();
}
template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>
(const int inum, const int host_inum, const int nall,
PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
int *, int **, int **, bool &success, int &mn);

View File

@ -1,204 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_NBOR_H
#define PAIR_GPU_NBOR_H
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor_shared.h"
#define IJ_SIZE 131072
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
class PairGPUNbor {
public:
PairGPUNbor() : _allocated(false), _use_packing(false) {}
~PairGPUNbor() { clear(); }
/// Determine whether neighbor unpacking should be used
/** If false, twice as much memory is reserved to allow unpacking neighbors by
* atom for coalesced access. **/
void packing(const bool use_packing) { _use_packing=use_packing; }
/// Clear any old data and setup for new LAMMPS run
/** \param inum Initial number of particles whose neighbors stored on device
* \param host_inum Initial number of particles whose nbors copied to host
* \param max_nbors Initial number of rows in the neighbor matrix
* \param gpu_nbor True if device will perform neighboring
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel **/
bool init(PairGPUNborShared *shared, const int inum, const int host_inum,
const int max_nbors, const int maxspecial, UCL_Device &dev,
const bool gpu_nbor, const int gpu_host, const bool pre_cut,
const int block_cell_2d, const int block_cell_id,
const int block_nbor_build);
/// Set the size of the cutoff+skin
inline void cell_size(const double size) { _cell_size=size; }
/// Get the size of the cutoff+skin
inline double cell_size() const { return _cell_size; }
/// Check if there is enough memory for neighbor data and realloc if not
/** \param inum Number of particles whose nbors will be stored on device
* \param max_nbor Current max number of neighbors for a particle
* \param success False if insufficient memory **/
inline void resize(const int inum, const int max_nbor, bool &success) {
if (inum>_max_atoms || max_nbor>_max_nbors) {
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
if (max_nbor>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10);
alloc(success);
}
}
/// Check if there is enough memory for neighbor data and realloc if not
/** \param inum Number of particles whose nbors will be stored on device
* \param host_inum Number of particles whose nbors will be copied to host
* \param max_nbor Current max number of neighbors for a particle
* \param success False if insufficient memory **/
inline void resize(const int inum, const int host_inum, const int max_nbor,
bool &success) {
if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) {
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
if (max_nbor>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10);
alloc(success);
}
}
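// Growth note (illustrative): both resize() overloads keep 10% slack
// (the 1.10 factors, matching init()) so small fluctuations in atom or
// neighbor counts between reneighborings do not force a reallocation
// every step.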
/// Free all memory on host and device
void clear();
/// Bytes per atom used on device
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by class
double host_memory_usage() const;
/// True if neighboring performed on GPU
inline bool gpu_nbor() const { return _gpu_nbor; }
/// Make a copy of unpacked nbor lists in the packed storage area (for gb)
inline void copy_unpacked(const int inum, const int maxj)
{ ucl_copy(dev_packed,dev_nbor,inum*(maxj+2),true); }
/// Copy neighbor list from host (first time or from a rebuild)
void get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size);
/// Return the stride in elements for each nbor row
inline int nbor_pitch() const { return _nbor_pitch; }
/// Return the maximum number of atoms that can currently be stored
inline int max_atoms() const { return _max_atoms; }
/// Return the maximum number of nbors for a particle based on current alloc
inline int max_nbors() const { return _max_nbors; }
/// Loop through neighbor count array and return maximum nbors for a particle
inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[ilist[i]]);
return mn;
}
/// Build nbor list on the device
template <class numtyp, class acctyp>
void build_nbor_list(const int inum, const int host_inum, const int nall,
PairGPUAtom<numtyp,acctyp> &atom, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
bool &success, int &max_nbors);
/// Return the number of bytes used on device
inline double gpu_bytes() {
double res = _gpu_bytes + _c_bytes + _cell_bytes;
if (_gpu_nbor==false)
res += 2*IJ_SIZE*sizeof(int);
return res;
}
// ------------------------------- Data -------------------------------
/// Device neighbor matrix
/** - 1st row is i (index into atom data)
* - 2nd row is numj (number of neighbors)
* - 3rd row is starting location in packed nbors
* - Remaining rows are the neighbors arranged for coalesced access **/
UCL_D_Vec<int> dev_nbor;
/// Packed storage for neighbor lists copied from host
UCL_D_Vec<int> dev_packed;
/// Host buffer for copying neighbor lists
UCL_H_Vec<int> host_packed;
/// Host storage for nbor counts (row 1) & accumulated neighbor counts (row 2)
UCL_H_Vec<int> host_acc;
// ----------------- Data for GPU Neighbor Calculation ---------------
/// Host storage for device calculated neighbor lists
/** Same storage format as device matrix **/
UCL_H_Vec<int> host_nbor;
/// Device storage for neighbor list matrix that will be copied to host
/** - 1st row is numj
* - Remaining rows are by atom, columns are nbors **/
UCL_D_Vec<int> dev_host_nbor;
UCL_D_Vec<int> dev_host_numj;
UCL_H_Vec<int> host_ilist;
UCL_H_Vec<int*> host_jlist;
/// Device storage for special neighbor counts
UCL_D_Vec<int> dev_nspecial;
/// Device storage for special neighbors
UCL_D_Vec<int> dev_special, dev_special_t;
/// Device timers
UCL_Timer time_nbor, time_kernel;
private:
PairGPUNborShared *_shared;
UCL_Device *dev;
bool _allocated, _use_packing;
int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
bool _gpu_nbor, _gpu_host, _alloc_packed;
double _cell_size;
double _gpu_bytes, _c_bytes, _cell_bytes;
void alloc(bool &success);
int _block_cell_2d, _block_cell_id, _block_nbor_build;
};
#endif

View File

@ -1,46 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#else
#define GLOBAL_ID_X get_global_id(0)
#endif
__kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
const int inum) {
// ii indexes the particle whose neighbor list is being unpacked
int ii=GLOBAL_ID_X;
if (ii<inum) {
__global int *nbor=dev_nbor+ii+inum;
int numj=*nbor;
nbor+=inum;
__global int *list=dev_ij+*nbor;
__global int *list_end=list+numj;
for ( ; list<list_end; list++) {
*nbor=*list;
nbor+=inum;
}
} // if ii
}
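/* Worked layout example (illustrative): with inum=2, the unpacked
   neighbors of particle ii=0 are written to dev_nbor[2*inum+0],
   dev_nbor[3*inum+0], ..., one column per particle, so threads handling
   consecutive ii read neighbor k from consecutive addresses (coalesced
   access). */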

View File

@ -1,71 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "pair_gpu_nbor_shared.h"
#ifdef USE_OPENCL
#include "pair_gpu_nbor_cl.h"
#else
#include "pair_gpu_nbor_ptx.h"
#include "pair_gpu_build_ptx.h"
#endif
void PairGPUNborShared::clear() {
if (_compiled) {
if (_gpu_nbor) {
k_cell_id.clear();
k_cell_counts.clear();
k_build_nbor.clear();
k_transpose.clear();
k_special.clear();
delete build_program;
} else {
k_nbor.clear();
delete nbor_program;
}
_compiled=false;
}
}
void PairGPUNborShared::compile_kernels(UCL_Device &dev, const bool gpu_nbor) {
if (_compiled)
return;
_gpu_nbor=gpu_nbor;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable";
if (gpu_nbor==false) {
nbor_program=new UCL_Program(dev);
nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());
k_nbor.set_function(*nbor_program,"kernel_unpack");
} else {
build_program=new UCL_Program(dev);
#ifdef USE_OPENCL
std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
exit(1);
#else
build_program->load_string(pair_gpu_build_kernel,flags.c_str());
#endif
k_cell_id.set_function(*build_program,"calc_cell_id");
k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
k_transpose.set_function(*build_program,"transpose");
k_special.set_function(*build_program,"kernel_special");
neigh_tex.get_texture(*build_program,"neigh_tex");
}
_compiled=true;
}

View File

@ -1,58 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_NBOR_SHARED_H
#define PAIR_GPU_NBOR_SHARED_H
#ifdef USE_OPENCL
#include "geryon/ocl_kernel.h"
#include "geryon/ocl_texture.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"
using namespace ucl_cudadr;
#endif
class PairGPUNborShared {
public:
PairGPUNborShared() : _compiled(false) {}
~PairGPUNborShared() { clear(); }
/// Free all memory on host and device
void clear();
/// Texture for cached position/type access with CUDA
UCL_Texture neigh_tex;
/// Compile kernels for neighbor lists
void compile_kernels(UCL_Device &dev, const bool gpu_nbor);
// ----------------------------- Kernels
UCL_Program *nbor_program, *build_program;
UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
UCL_Kernel k_transpose, k_special;
private:
bool _compiled, _gpu_nbor;
};
#endif

View File

@ -1,90 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_PRECISION_H
#define PAIR_PRECISION_H
struct _lgpu_float2 {
float x; float y;
};
struct _lgpu_float4 {
float x; float y; float z; float w;
};
struct _lgpu_double2 {
double x; double y;
};
struct _lgpu_double4 {
double x; double y; double z; double w;
};
#include <iostream>
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) {
out << v.x << " " << v.y;
return out;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) {
out << v.x << " " << v.y << " " << v.z;
return out;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) {
out << v.x << " " << v.y;
return out;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
out << v.x << " " << v.y << " " << v.z;
return out;
}
// PRECISION - Precision for rsq, energy, force, and torque calculation
// ACC_PRECISION - Precision for accumulation of energies, forces, and torques
#ifdef _SINGLE_DOUBLE
#define OCL_PRECISION_COMPILE "-D_SINGLE_DOUBLE"
#define PRECISION float
#define ACC_PRECISION double
#define numtyp2 _lgpu_float2
#define numtyp4 _lgpu_float4
#define acctyp4 _lgpu_double4
#endif
#ifdef _DOUBLE_DOUBLE
#define OCL_PRECISION_COMPILE "-D_DOUBLE_DOUBLE"
#define PRECISION double
#define ACC_PRECISION double
#define numtyp2 _lgpu_double2
#define numtyp4 _lgpu_double4
#define acctyp4 _lgpu_double4
#endif
#ifndef PRECISION
#define OCL_PRECISION_COMPILE "-D_SINGLE_SINGLE"
#define PRECISION float
#define ACC_PRECISION float
#define numtyp2 _lgpu_float2
#define numtyp4 _lgpu_float4
#define acctyp4 _lgpu_float4
#endif
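// Selection example (illustrative): building with -D_SINGLE_DOUBLE gives
// PRECISION=float for the pairwise math and ACC_PRECISION=double for the
// accumulators ("Mixed Precision" in the device init message); with no
// flag the fallback below is all-float ("Single Precision").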
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#endif

View File

@ -1,82 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <cstdio>
#include <cassert>
#ifndef USE_OPENCL
#include "cudpp.h"
#endif
class PairWinSort {
public:
inline PairWinSort() : _allocated(false) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;
sort_config.algorithm = CUDPP_SORT_RADIX;
sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
#endif
}
inline ~PairWinSort() { clear(); }
/// Free all memory on host and device
inline void clear() {
#ifndef USE_OPENCL
if (_allocated) { cudppDestroyPlan(sort_plan); _allocated=false; }
#endif
}
inline bool alloc(const int max_atoms) {
#ifndef USE_OPENCL
clear();
CUDPPResult result = cudppPlan(&sort_plan, sort_config, max_atoms, 1, 0);
if (CUDPP_SUCCESS != result)
return false;
#endif
return true;
}
/// Sort arrays for neighbor list calculation
void sort_neighbor(const int num_atoms, unsigned *cell_begin, int *particle_begin) {
#ifndef USE_OPENCL
CUDPPResult result = cudppSort(sort_plan, cell_begin, particle_begin,
8*sizeof(unsigned), num_atoms);
if (CUDPP_SUCCESS != result) {
printf("Error in cudppSort\n");
assert(1==0);
}
#endif
}
private:
bool _allocated;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
};
static PairWinSort win_sort;
extern "C" __declspec(dllexport) bool _win_sort_alloc(const int max_atoms) {
win_sort.alloc(max_atoms);
}
extern "C" __declspec(dllexport) bool _win_sort(const int max_atoms, unsigned *cell_begin,
int *particle_begin) {
win_sort.sort(num_atoms,cell_begin,particle_begin);
}

Some files were not shown because too many files have changed in this diff.