Added GPU library and package directories.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3036 f3b2605a-c512-4ea7-a41b-209d697bcdaa
wmbrown 2009-08-10 20:03:47 +00:00
parent 15b6eeb011
commit 2c6dc60a30
28 changed files with 5296 additions and 0 deletions

lib/gpu/Makefile (new file)
@@ -0,0 +1,75 @@
#***************************************************************************
# Makefile
# -------------------
# W. Michael Brown
#
# _________________________________________________________________________
# Build for the LAMMPS GPU Force Library
#
# _________________________________________________________________________
#
# begin : Tue June 23 2009
# copyright : (C) 2009 by W. Michael Brown
# email : wmbrown@sandia.gov
# ***************************************************************************/
BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math
CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB)
CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)
CUDA_LIB = $(OBJ_DIR)/pair_gpu_lib.a
# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h
ALL_H = $(NVC_H) $(PAIR_H)
EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/gb_gpu.cu_o \
$(OBJ_DIR)/gb_gpu_memory.cu_o $(OBJ_DIR)/lj_gpu.cu_o \
$(OBJ_DIR)/lj_gpu_memory.cu_o $(OBJ_DIR)/pair_gpu_atom.cu_o \
$(OBJ_DIR)/pair_gpu_nbor.cu_o
all: $(CUDA_LIB) $(EXECS)

$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H)
	$(CUDA) -o $@ -c nvc_device.cu

$(OBJ_DIR)/pair_gpu_atom.cu_o: pair_gpu_atom.cu pair_gpu_texture.h pair_gpu_atom.h $(NVC_H)
	$(CUDA) -o $@ -c pair_gpu_atom.cu

$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
	$(CUDA) -o $@ -c pair_gpu_nbor.cu

$(OBJ_DIR)/lj_gpu_memory.cu_o: lj_gpu_memory.cu lj_gpu_memory.h $(ALL_H)
	$(CUDA) -o $@ -c lj_gpu_memory.cu

$(OBJ_DIR)/lj_gpu.cu_o: lj_gpu.cu $(ALL_H) lj_gpu_memory.h lj_gpu_kernel.h
	$(CUDA) -o $@ -c lj_gpu.cu

$(OBJ_DIR)/gb_gpu_memory.cu_o: gb_gpu_memory.cu gb_gpu_memory.h $(ALL_H)
	$(CUDA) -o $@ -c gb_gpu_memory.cu

$(OBJ_DIR)/gb_gpu.cu_o: gb_gpu.cu $(ALL_H) gb_gpu_memory.h gb_gpu_kernel.h gb_gpu_extra.h
	$(CUDA) -o $@ -c gb_gpu.cu

$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o
	$(CUDA) -o $@ nvc_get_devices.cu $(CUDA_LINK) $(OBJ_DIR)/nvc_device.o

$(CUDA_LIB): $(OBJS)
	$(AR) -crusv $(CUDA_LIB) $(OBJS)

clean:
	rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo

veryclean: clean
	rm -rf *~ *.linkinfo

lib/gpu/README (new file)
@@ -0,0 +1,94 @@
/***************************************************************************
README
-------------------
W. Michael Brown
README for building LAMMPS GPU Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
GENERAL NOTES
This library, pair_gpu_lib.a, provides routines for GPGPU acceleration of
LAMMPS pair styles. Currently, only CUDA-enabled GPUs are supported.
Compilation of this library requires installing the CUDA GPU driver and
CUDA toolkit for your operating system. In addition to the LAMMPS library,
the binary nvc_get_devices will also be built. This can be used to query
the names and properties of GPU devices on your system.
NOTE: Installation of the CUDA SDK is not required.
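For example, after the build completes, running

  ./nvc_get_devices

from lib/gpu lists each CUDA device found on the node along with its
properties.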
Current pair styles supporting GPU acceleration:
1. lj/cut/gpu
2. gayberne/gpu
MULTIPLE LAMMPS PROCESSES
When using GPGPU acceleration, you are restricted to one physical GPU per
LAMMPS process. The GPUs can reside on a single node or be spread across
multiple nodes. Instructions on GPU assignment can be found in the LAMMPS
documentation.
SPEEDUPS
The speedups that can be obtained using this library are highly
dependent on the GPU architecture and the computational expense of
the pair potential. When comparing a single-precision Tesla C1060 run to a
serial run on an Intel Xeon 5140 at 2.33 GHz, the speedup is ~4.42x for lj/cut
with a cutoff of 2.5. For gayberne with a cutoff of 7, the speedup is >103x
for 8000 particles. The speedup will improve with an increase in the
number of particles or an increase in the cutoff.
BUILDING AND PRECISION MODES
To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK variables in
the Makefile for your machine, then type make. Additionally, the GPU package
must be installed and
compiled for LAMMPS. The library supports 3 precision modes as determined by
the CUDA_PREC variable:
CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations
CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations
CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
NOTE: Double precision is only supported on certain GPUs (compute
capability >= 1.3).
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
package has been installed before installing the GPU package in LAMMPS.
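For example, a double-precision build for a compute capability 1.3 card with
the toolkit installed under /opt/cuda (paths and flags here are illustrative,
machine-specific assumptions) might set:

  CUDA_CPP  = nvcc -I/opt/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
  CUDA_ARCH = -maxrregcount 128 -arch=sm_13
  CUDA_PREC = -D_DOUBLE_DOUBLE
  CUDA_LINK = -L/opt/cuda/lib64 -lcudart $(CUDA_LIB)

The same choice can also be made on the make command line, e.g.
'make CUDA_PREC=-D_DOUBLE_DOUBLE'.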
GPU MEMORY
Upon initialization of the pair style, the library will reserve memory for
64K atoms per GPU or 70% of each card's GPU memory, whichever is more
limiting. The 70% value can be changed by editing the PERCENT_GPU_MEMORY
definition in the source file. The 64K value cannot be increased and is the
maximum number of atoms allowed per GPU. Using the 'neigh_modify one'
modifier in your LAMMPS input script can help to increase the maximum number
of atoms per GPU for cards with limited memory.
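For example, adding the line below (an illustrative value) to a LAMMPS input
script caps the neighbor count of any one atom at 1000, reducing the per-atom
neighbor storage that must be reserved on the GPU:

  neigh_modify one 1000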

lib/gpu/gb_gpu.cu (new file)
@@ -0,0 +1,518 @@
/***************************************************************************
gb_gpu.cu
-------------------
W. Michael Brown
Gay-Berne anisotropic potential GPU calculation
*** Force decomposition by Atom Version ***
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "gb_gpu_memory.h"
#include "gb_gpu_kernel.h"
using namespace std;
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
#define GBMT GB_GPU_Memory<numtyp,acctyp>
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
// ii indexes the atom (i) whose neighbor list this thread packs
int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;
if (ii<inum) {
int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
nbor+=nbor_pitch;
int *nbor_newj=nbor;
nbor+=nbor_pitch;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
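    // Indices >= nall mark special-bond neighbors; only the real atom
    // index is needed here, so strip the encoding with j %= nall.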
if (j>=nall)
j%=nall;
int jtype=_x_<numtyp>(j,7);
if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
// Compute rsq = |r_j - r_i|^2
numtyp rsq=_x_<numtyp>(j,0)-ix;
rsq*=rsq;
numtyp t=_x_<numtyp>(j,1)-iy;
rsq+=t*t;
t=_x_<numtyp>(j,2)-iz;
rsq+=t*t;
if (rsq< _cutsq_<numtyp>(itype,jtype)) {
*nbor=j;
nbor+=nbor_pitch;
newj++;
}
}
}
*nbor_newj=newj;
}
}
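// ---------------------------------------------------------------------------
// Illustrative host-side sketch (not used by the library): traversal of the
// packed matrix written above. One column per atom slot, consecutive rows
// nbor_pitch elements apart; rows hold i, the original count, the dev_ij
// offset, the packed count, then the packed neighbor indices. nbor_matrix
// is assumed to be a host copy of dev_nbor.
// ---------------------------------------------------------------------------
inline void walk_packed_nbors(const int *nbor_matrix, const int nbor_pitch,
                              const int inum) {
  for (int ii=0; ii<inum; ii++) {
    const int *col=nbor_matrix+ii;
    int i=col[0];                         // row 0: atom index
    int newj=col[3*nbor_pitch];           // row 3: packed neighbor count
    for (int jj=0; jj<newj; jj++) {
      int j=col[(4+jj)*nbor_pitch];       // rows 4..: packed neighbors
      (void)i; (void)j;                   // ... consume the pair (i,j)
    }
  }
}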
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
int ii=threadIdx.x;
__shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=ii/MAX_SHARED_TYPES;
int jtype=ii%MAX_SHARED_TYPES;
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
form[ii]=_form_(itype,jtype);
  }
  __syncthreads(); // ensure shared cutsq/form tables are loaded block-wide
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
if (ii<inum) {
int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
nbor+=nbor_pitch;
int *nbor_newj=nbor;
nbor+=nbor_pitch;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
if (j>=nall)
j%=nall;
int mtype=itype+_x_<numtyp>(j,7);
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute rsq = |r_j - r_i|^2
numtyp rsq=_x_<numtyp>(j,0)-ix;
rsq*=rsq;
numtyp t=_x_<numtyp>(j,1)-iy;
rsq+=t*t;
t=_x_<numtyp>(j,2)-iz;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*nbor=j;
nbor+=nbor_pitch;
newj++;
}
}
}
*nbor_newj=newj;
}
}
template<class numtyp, class acctyp>
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high) {
if (gbm.shared_types)
kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
else
kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
}
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
template <class t>
inline string gb_gpu_toa(const t& in) {
ostringstream o;
o.precision(2);
o << in;
return o.str();
}
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
string gb_gpu_name(const int id, const int max_nbors) {
string name=GBMF[0].gpu.name(id)+", "+
gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHz, "+
gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
max_nbors))+" Atoms";
return name;
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int max_nbors, const int thread, const int gpu_id) {
assert(thread<MAX_GPU_THREADS);
if (GBMF[thread].gpu.num_devices()==0)
return 0;
ij_size=IJ_SIZE;
return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
well, cutsq, sigma, epsilon, host_lshape, form,
host_lj1, host_lj2, host_lj3, host_lj4, offset,
special_lj, max_nbors, false, gpu_id);
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear(const int thread) {
GBMF[thread].clear();
}
// ---------------------------------------------------------------------------
// copy atom positions, quaternions, and optionally types to device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
double **host_quat, const int *host_type,
const bool rebuild, cudaStream_t &stream) {
atom.time_atom.start();
atom.reset_write_buffer();
// Rows 1-3 of dev_x are position; rows 4-7 are quaternion
atom.add_atom_data(host_x[0],3);
atom.add_atom_data(host_x[0]+1,3);
atom.add_atom_data(host_x[0]+2,3);
atom.add_atom_data(host_quat[0],4);
atom.add_atom_data(host_quat[0]+1,4);
atom.add_atom_data(host_quat[0]+2,4);
atom.add_atom_data(host_quat[0]+3,4);
int csize=7;
// If a rebuild occurred, copy type data
if (rebuild) {
atom.add_atom_data(host_type);
csize++;
}
atom.copy_atom_data(csize,stream);
atom.time_atom.stop();
}
void gb_gpu_atom(double **host_x, double **host_quat,
const int *host_type, const bool rebuild, const int thread) {
_gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
GBMF[thread].pair_stream);
}
// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// ---------------------------------------------------------------------------
template <class gbmtyp>
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
const int inum, int *ilist, const int *numj,
const int *type, bool &success) {
if (nall>gbm.max_atoms) {
success=false;
return 0;
}
success=true;
gbm.nbor.time_nbor.start();
gbm.atom.nall(nall);
gbm.atom.inum(inum);
if (gbm.multiple_forms) {
int p=0, acc=0;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
gbm.nbor.ij_total=0;
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
gbm.nbor.dev_nbor.row_size(),
gbm.nbor.dev_nbor.row_size(),2,inum,
gbm.pair_stream);
} else {
gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
gbm.last_ellipse=inum;
}
gbm.nbor.time_nbor.stop();
if (gbm.multiple_forms)
return gbm.host_olist.begin();
return ilist;
}
int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
int *ilist, const int *numj, const int *type,
const int thread, bool &success) {
return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
success);
}
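// ---------------------------------------------------------------------------
// Illustrative note (not the library's code path): the two passes above are
// equivalent to a stable partition of ilist by particle form -- ellipsoids
// first, relative order preserved. A sketch assuming the same host_form and
// type layouts:
// ---------------------------------------------------------------------------
#include <algorithm>
inline void order_ellipses_first(int *olist, const int inum, int **host_form,
                                 const int *type) {
  std::stable_partition(olist, olist+inum, [=](int i)
    { return host_form[type[i]][type[i]]==ELLIPSE_ELLIPSE; });
}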
// ---------------------------------------------------------------------------
// Copy a set of ij_size ij interactions to device and compute energies,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
gbm.nbor.time_nbor.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
gbm.nbor.time_nbor.start();
gbm.nbor.add(num_ij,gbm.pair_stream);
gbm.nbor.time_nbor.stop();
}
void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
_gb_gpu_nbors(GBMF[thread],num_ij,eflag);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques for all ij interactions
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
const bool rebuild) {
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
if (gbm.multiple_forms) {
gbm.time_kernel.start();
if (gbm.last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
static_cast<double>(BX)));
pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.atom.inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
gbm.time_pair.start();
gbm.time_pair.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
gbm.last_ellipse)/BX));
pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
ELLIPSE_SPHERE);
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne2.stop();
} else {
gbm.atom.ans.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
}
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.atom.inum()) {
if (gbm.shared_types)
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
gbm.atom.inum(), gbm.atom.nall());
else
kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne.stop();
}
}
void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
const int thread) {
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
}
// ---------------------------------------------------------------------------
// Get energies, forces, and torques to host
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial) {
double evdw;
gbm.atom.time_answer.start();
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
gbm.atom.time_atom.add_to_total();
gbm.nbor.time_nbor.add_to_total();
gbm.time_kernel.add_to_total();
gbm.time_gayberne.add_to_total();
if (gbm.multiple_forms) {
gbm.time_kernel2.add_to_total();
gbm.time_gayberne2.add_to_total();
gbm.time_pair.add_to_total();
}
// CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
gbm.atom.add_forces(ilist,f);
gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
gbm.atom.time_answer.stop();
gbm.atom.time_answer.add_to_total();
return evdw;
}
double gb_gpu_forces(double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial, const int thread) {
return _gb_gpu_forces<PRECISION,ACC_PRECISION>
(GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
vflag_atom,eatom,vatom,virial);
}
void gb_gpu_time(const int i) {
cout.precision(4);
cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds()
<< " s.\n"
<< "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds()
<< " s.\n"
<< "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
GBMF[i].time_kernel2.total_seconds() << " s.\n"
<< "Force calc: " << GBMF[i].time_gayberne.total_seconds()+
GBMF[i].time_gayberne2.total_seconds()<< " s.\n";
if (GBMF[i].multiple_forms)
cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n";
cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds()
<< " s.\n";
}
int gb_gpu_num_devices() {
return GBMF[0].gpu.num_devices();
}
double gb_gpu_bytes() {
return GBMF[0].host_memory_usage();
}

lib/gpu/gb_gpu_extra.h (new file)
@@ -0,0 +1,337 @@
/***************************************************************************
gb_gpu_extra.h
-------------------
W. Michael Brown
Inline GPU kernel routines, a la math_extra on the CPU.
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H
#include "math.h"
#include "stdio.h"
#include "string.h"
/* ----------------------------------------------------------------------
Atomic update of global memory
------------------------------------------------------------------------- */
/*
template <class numtyp> __device__
inline void atomicAdd(numtyp *address, numtyp val);
template <>
__device__ inline void atomicAdd<float>(float *address, float val)
{
int i_val = __float_as_int(val);
int tmp0 = 0;
int tmp1;
while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) {
tmp0 = tmp1;
i_val = __float_as_int(val + __int_as_float(tmp1));
}
}*/
/* ----------------------------------------------------------------------
dot product of 2 vectors
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}
/* ----------------------------------------------------------------------
cross product of 2 vectors
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_cross3(const numtyp *v1,
const numtyp *v2, numtyp *ans)
{
ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
}
/* ----------------------------------------------------------------------
determinant of a matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
{
numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
return ans;
}
/* ----------------------------------------------------------------------
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9],
numtyp ans[9])
{
ans[0] = _well_<numtyp>(i,0)*m[0];
ans[1] = _well_<numtyp>(i,0)*m[1];
ans[2] = _well_<numtyp>(i,0)*m[2];
ans[3] = _well_<numtyp>(i,1)*m[3];
ans[4] = _well_<numtyp>(i,1)*m[4];
ans[5] = _well_<numtyp>(i,1)*m[5];
ans[6] = _well_<numtyp>(i,2)*m[6];
ans[7] = _well_<numtyp>(i,2)*m[7];
ans[8] = _well_<numtyp>(i,2)*m[8];
}
/* ----------------------------------------------------------------------
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9],
numtyp ans[9])
{
ans[0] = _shape_<numtyp>(i,0)*m[0];
ans[1] = _shape_<numtyp>(i,0)*m[1];
ans[2] = _shape_<numtyp>(i,0)*m[2];
ans[3] = _shape_<numtyp>(i,1)*m[3];
ans[4] = _shape_<numtyp>(i,1)*m[4];
ans[5] = _shape_<numtyp>(i,1)*m[5];
ans[6] = _shape_<numtyp>(i,2)*m[6];
ans[7] = _shape_<numtyp>(i,2)*m[7];
ans[8] = _shape_<numtyp>(i,2)*m[8];
}
/* ----------------------------------------------------------------------
add two matrices
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_plus3(const numtyp m[9],
const numtyp m2[9], numtyp ans[9])
{
ans[0] = m[0]+m2[0];
ans[1] = m[1]+m2[1];
ans[2] = m[2]+m2[2];
ans[3] = m[3]+m2[3];
ans[4] = m[4]+m2[4];
ans[5] = m[5]+m2[5];
ans[6] = m[6]+m2[6];
ans[7] = m[7]+m2[7];
ans[8] = m[8]+m2[8];
}
/* ----------------------------------------------------------------------
multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9],
const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
}
/* ----------------------------------------------------------------------
row vector times matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_row_times3(const numtyp *v,
const numtyp m[9], numtyp *ans)
{
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
}
/* ----------------------------------------------------------------------
solve Ax = b or M ans = v
use gaussian elimination & partial pivoting on matrix
error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
const numtyp *v, numtyp *ans,
int *error_flag)
{
// create augmented matrix for pivoting
numtyp aug[12], t;
aug[3] = v[0];
aug[0] = m[0];
aug[1] = m[1];
aug[2] = m[2];
aug[7] = v[1];
aug[4] = m[3];
aug[5] = m[4];
aug[6] = m[5];
aug[11] = v[2];
aug[8] = m[6];
aug[9] = m[7];
aug[10] = m[8];
if (fabs(aug[4]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
}
if (fabs(aug[8]) > fabs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
}
  // Bring a nonzero entry into the first pivot position (the magnitude
  // swaps above normally guarantee this); flag a singular matrix otherwise.
  if (aug[0] == (numtyp)0.0) {
    if (aug[4] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
      swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
      swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
      swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
    } else if (aug[8] != (numtyp)0.0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
      swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
      swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
      swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
    } else
      *error_flag=2;
  }
t = aug[4]/aug[0];
aug[5]-=t*aug[1];
aug[6]-=t*aug[2];
aug[7]-=t*aug[3];
t = aug[8]/aug[0];
aug[9]-=t*aug[1];
aug[10]-=t*aug[2];
aug[11]-=t*aug[3];
if (fabs(aug[9]) > fabs(aug[5])) {
numtyp swapt;
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
}
  // Bring a nonzero entry into the second pivot position if needed.
  if (aug[5] == (numtyp)0.0 && aug[9] != (numtyp)0.0) {
    numtyp swapt;
    swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
    swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
    swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
    swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
  }
t = aug[9]/aug[5];
aug[10]-=t*aug[6];
aug[11]-=t*aug[7];
if (aug[10] == (numtyp)0.0)
*error_flag=2;
ans[2] = aug[11]/aug[10];
t = (numtyp)0.0;
t += aug[6]*ans[2];
ans[1] = (aug[7]-t) / aug[5];
t = (numtyp)0.0;
t += aug[1]*ans[1];
t += aug[2]*ans[2];
ans[0] = (aug[3]-t) / aug[0];
}
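/* ----------------------------------------------------------------------
   Worked example of the row-major convention (illustrative values): for
   m = {4,1,0, 1,3,1, 0,1,2} -- the rows of a 3x3 matrix -- and
   v = {1,2,3}, the solve yields ans = {2/9, 1/9, 13/9}. Substituting
   back, each row r satisfies
   m[3*r]*ans[0] + m[3*r+1]*ans[1] + m[3*r+2]*ans[2] = v[r],
   confirming M ans = v.
------------------------------------------------------------------------- */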
/* ----------------------------------------------------------------------
compute rotation matrix from quaternion conjugate
quat = [w i j k]
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi,
numtyp mat[9])
{
numtyp qi3=_x_<numtyp>(qi,3);
numtyp qi4=_x_<numtyp>(qi,4);
numtyp qi5=_x_<numtyp>(qi,5);
numtyp qi6=_x_<numtyp>(qi,6);
numtyp w2 = qi3*qi3;
numtyp i2 = qi4*qi4;
numtyp j2 = qi5*qi5;
numtyp k2 = qi6*qi6;
numtyp twoij = 2.0*qi4*qi5;
numtyp twoik = 2.0*qi4*qi6;
numtyp twojk = 2.0*qi5*qi6;
numtyp twoiw = 2.0*qi4*qi3;
numtyp twojw = 2.0*qi5*qi3;
numtyp twokw = 2.0*qi6*qi3;
mat[0] = w2+i2-j2-k2;
mat[3] = twoij-twokw;
mat[6] = twojw+twoik;
mat[1] = twoij+twokw;
mat[4] = w2-i2+j2-k2;
mat[7] = twojk-twoiw;
mat[2] = twoik-twojw;
mat[5] = twojk+twoiw;
mat[8] = w2-i2-j2+k2;
}
#endif

lib/gpu/gb_gpu_kernel.h (new file)
@@ -0,0 +1,861 @@
/***************************************************************************
gb_gpu_kernel.h
-------------------
W. Michael Brown
Routines that actually perform the force/torque computation
*** Force Decomposition by Atom Version ***
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL
#include "gb_gpu_extra.h"
template <class numtyp>
static __inline__ __device__ void compute_eta_torque(numtyp m[9],
numtyp m2[9],
const int i,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
numtyp shapex=_shape_<numtyp>(i,0);
numtyp shapey=_shape_<numtyp>(i,1);
numtyp shapez=_shape_<numtyp>(i,2);
ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
template<class numtyp, class acctyp>
__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const int nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int inum, const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the atom (i) whose interactions this thread computes
int ii=threadIdx.x;
if (ii<4)
    sp_lj[ii]=special_lj[ii];
  __syncthreads(); // make the shared sp_lj factors visible to all threads
ii+=INT_MUL(blockIdx.x,blockDim.x);
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp torx=(numtyp)0;
acctyp tory=(numtyp)0;
acctyp torz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
numtyp a1[9], b1[9], g1[9];
{
numtyp t[9];
gpu_quat_to_mat_trans(i,a1);
gpu_shape_times3(itype,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_well_times3(itype,a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=*nbor;
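      // Neighbor indices >= nall encode special-bond pairs: j/nall selects
      // the special_lj scale factor and j%nall recovers the real atom index.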
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,7);
// Compute r12
numtyp r12[3];
r12[0] = _x_<numtyp>(j,0)-ix;
r12[1] = _x_<numtyp>(j,1)-iy;
r12[2] = _x_<numtyp>(j,2)-iz;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_shape_times3(jtype,a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
numtyp sigma = _sigma_<numtyp>(itype,jtype);
numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,itype,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_well_times3(jtype,a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
fx+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
fy+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
fz+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
fx+=temp1*dchi[0]-temp2*dUr[0];
fy+=temp1*dchi[1]-temp2*dUr[1];
fz+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
}
}
*ap1=fx;
ap1+=ans_pitch;
*ap1=fy;
ap1+=ans_pitch;
*ap1=fz;
ap1+=ans_pitch;
*ap1=torx;
ap1+=ans_pitch;
*ap1=tory;
ap1+=ans_pitch;
*ap1=torz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const int nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum,
const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the atom (i) whose interactions this thread computes
int ii=threadIdx.x;
if (ii<4)
    sp_lj[ii]=special_lj[ii];
  __syncthreads(); // make the shared sp_lj factors visible to all threads
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
numtyp oner=_shape_<numtyp>(itype,0);
numtyp one_well=_well_<numtyp>(itype,0);
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,7);
// Compute r12
numtyp r12[3];
r12[0] = _x_<numtyp>(j,0)-ix;
r12[1] = _x_<numtyp>(j,1)-iy;
r12[2] = _x_<numtyp>(j,2)-iz;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_shape_times3(jtype,a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
numtyp sigma = _sigma_<numtyp>(itype,jtype);
numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_well_times3(jtype,a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
fx+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
fy+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
fz+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
fx+=temp1*dchi[0]-temp2*dUr[0];
fy+=temp1*dchi[1]-temp2*dUr[1];
fz+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
}
}
*ap1=fx;
ap1+=ans_pitch;
*ap1=fy;
ap1+=ans_pitch;
*ap1=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch, acctyp *ans,
size_t ans_pitch, int *err_flag, const bool eflag,
const bool vflag, const int start, const int inum,
const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the atom (i) whose interactions this thread computes
int ii=threadIdx.x;
if (ii<4)
    sp_lj[ii]=special_lj[ii];
  __syncthreads(); // make the shared sp_lj factors visible to all threads
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,7);
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
_form_(itype,jtype)==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
_lj1_<numtyp>(itype,jtype).y);
force*=factor_lj;
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
_lj3_<numtyp>(itype,jtype).y);
energy+=factor_lj*(e-_offset_<numtyp>(itype,jtype));
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1+=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=ans_pitch;
}
}
*ap1+=fx;
ap1+=ans_pitch;
*ap1+=fy;
ap1+=ans_pitch;
*ap1+=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch,
acctyp *ans, size_t ans_pitch,int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum, const int nall){
// ii indexes the atom (i) whose interactions this thread computes
int ii=threadIdx.x;
__shared__ numtyp sp_lj[4];
__shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=special_lj[ii];
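  // The first MAX_SHARED_TYPES*MAX_SHARED_TYPES threads each cache one
  // (itype,jtype) entry of the coefficient tables in shared memory; this
  // assumes blockDim.x >= MAX_SHARED_TYPES*MAX_SHARED_TYPES.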
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=ii/MAX_SHARED_TYPES;
int jtype=ii%MAX_SHARED_TYPES;
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
form[ii]=_form_(itype,jtype);
lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
if (eflag) {
lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
offset[ii]=_offset_<numtyp>(itype,jtype);
}
  }
  __syncthreads(); // ensure shared coefficient tables are loaded block-wide
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int mtype=itype+_x_<numtyp>(j,7);
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
energy+=factor_lj*(e-offset[mtype]);
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1+=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=ans_pitch;
}
}
*ap1+=fx;
ap1+=ans_pitch;
*ap1+=fy;
ap1+=ans_pitch;
*ap1+=fz;
} // if ii
}
#endif

lib/gpu/gb_gpu_memory.cu (new file)
@@ -0,0 +1,146 @@
/***************************************************************************
gb_gpu_memory.cu
-------------------
W. Michael Brown
Global variables for GPU Gayberne Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#include "gb_gpu_memory.h"
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() {
this->atom.atom_fields(8);
this->atom.ans_fields(13);
this->nbor.packing(true);
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
const double gamma, const double upsilon,
const double mu, double **host_shape,
double **host_well, double **host_cutsq,
double **host_sigma, double **host_epsilon,
double *host_lshape, int **h_form, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int max_nbors,
const bool force_d, const int me) {
if (this->allocated)
clear();
LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
host_lj1, host_lj2, host_lj3, host_lj4, host_offset,
host_special_lj, max_nbors, me);
host_form=h_form;
// Initialize timers for the selected GPU
time_kernel.init();
time_gayberne.init();
time_kernel2.init();
time_gayberne2.init();
// Use the write buffer from atom for data initialization
NVC_HostT &host_write=this->atom.host_write;
assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
// Allocate, cast and asynchronous memcpy of constant data
gamma_upsilon_mu.safe_alloc(3);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
gamma_upsilon_mu.copy_from_host(host_write.begin());
lshape.safe_alloc(ntypes);
lshape.cast_copy(host_lshape,host_write);
lshape.copy_from_host(host_write.begin());
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
shape.safe_alloc(ntypes,3);
shape.cast_copy(host_shape[0],host_write);
well.safe_alloc(ntypes,3);
well.cast_copy(host_well[0],host_write);
// Copy LJ data onto GPU
int lj_types=ntypes;
if (lj_types<=MAX_SHARED_TYPES)
lj_types=MAX_SHARED_TYPES;
form.safe_alloc(lj_types,lj_types);
form.copy_2Dfrom_host(host_form[0],ntypes,ntypes);
// See if we want fast GB-sphere or sphere-sphere calculations
multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (host_form[i][j]!=ELLIPSE_ELLIPSE)
multiple_forms=true;
// Memory for ilist ordered by particle type
host_olist.safe_alloc_rw(this->max_atoms);
// Bind constant data to textures
lshape_bind_texture<numtyp>(lshape);
shape_bind_texture<numtyp>(shape);
well_bind_texture<numtyp>(well);
form_bind_texture(form);
return this->nbor.host_ij.begin();
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
if (!this->allocated)
return;
int err_flag;
this->dev_error.copy_to_host(&err_flag);
if (err_flag == 1)
std::cerr << "COLLISION BUFFER OVERFLOW OCCURRED. INCREASE COLLISION_N "
<< "and RECOMPILE.\n";
else if (err_flag == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
LJ_GPU_MemoryT::clear();
shape_unbind_texture<numtyp>();
well_unbind_texture<numtyp>();
form_unbind_texture();
shape.clear();
well.clear();
form.clear();
lshape.clear();
gamma_upsilon_mu.clear();
host_olist.clear();
}
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() {
return this->atom.host_memory_usage(this->max_atoms)+
this->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(GB_GPU_Memory<numtyp,acctyp>)+this->max_atoms*sizeof(int);
}
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;

lib/gpu/gb_gpu_memory.h (new file)
@@ -0,0 +1,74 @@
/***************************************************************************
gb_gpu_memory.h
-------------------
W. Michael Brown
Global variables for GPU Gayberne Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H
#define MAX_GPU_THREADS 4
#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp,acctyp>
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
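// The "form" of an (itype,jtype) pair records whether each partner is
// treated as a point sphere or an ellipsoid; the kernels use it to route
// each pair to the matching code path.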
template <class numtyp, class acctyp>
class GB_GPU_Memory : public LJ_GPU_MemoryT {
public:
GB_GPU_Memory();
~GB_GPU_Memory();
int* init(const int ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const bool force_d, const int me);
void clear();
double host_memory_usage();
// ---------------------------- DATA ----------------------------
// ilist with particles sorted by type
NVC_HostI host_olist;
// --------------- Const Data for Atoms
NVC_ConstMatT shape, well;
NVC_ConstMatI form;
NVC_VecT lshape, gamma_upsilon_mu;
// --------------- Timing Stuff
NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2;
// True if any type pair is not ellipse-ellipse (GB-sphere or
// sphere-sphere forms are present)
bool multiple_forms;
int **host_form;
int last_ellipse;
private:
};
#endif
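
For orientation, a sketch of how a caller might classify type pairs into the
form matrix consumed by init(); the all-equal-shape test for spheres and the
helper names are hypothetical, not part of this library:

#include "gb_gpu_memory.h"
// A type is treated as a sphere when its three shape parameters are equal
// (assumed convention; adjust to the host pair style's actual test).
inline bool is_sphere(const double *shape_i) {
  return shape_i[0]==shape_i[1] && shape_i[1]==shape_i[2];
}
void fill_form(int **h_form, double **host_shape, const int ntypes) {
  for (int i=1; i<ntypes; i++)
    for (int j=1; j<ntypes; j++) {
      const bool si=is_sphere(host_shape[i]), sj=is_sphere(host_shape[j]);
      if (si && sj) h_form[i][j]=SPHERE_SPHERE;
      else if (si) h_form[i][j]=SPHERE_ELLIPSE;
      else if (sj) h_form[i][j]=ELLIPSE_SPHERE;
      else h_form[i][j]=ELLIPSE_ELLIPSE;
    }
}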

234
lib/gpu/lj_gpu.cu Normal file
View File

@ -0,0 +1,234 @@
/***************************************************************************
lj_gpu.cu
-------------------
W. Michael Brown
Lennard-Jones potential GPU calculation
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "lj_gpu_memory.h"
#include "lj_gpu_kernel.h"
using namespace std;
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
template <class t>
inline string lj_gpu_toa(const t& in) {
ostringstream o;
o.precision(2);
o << in;
return o.str();
}
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
string lj_gpu_name(const int id, const int max_nbors) {
string name=LJMF.gpu.name(id)+", "+
lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHz, "+
lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id),
max_nbors))+" Atoms";
return name;
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, const int max_nbors, const int gpu_id) {
if (LJMF.gpu.num_devices()==0)
return 0;
ij_size=IJ_SIZE;
return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void lj_gpu_clear() {
LJMF.clear();
}
// ---------------------------------------------------------------------------
// copy atom positions and optionally types to device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void _lj_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
const int *host_type, const bool rebuild,
cudaStream_t &stream) {
atom.time_atom.start();
atom.reset_write_buffer();
// First row of dev_x is x position, second is y, third is z
atom.add_atom_data(host_x[0],3);
atom.add_atom_data(host_x[0]+1,3);
atom.add_atom_data(host_x[0]+2,3);
int csize=3;
// If a rebuild occurred, copy type data
if (rebuild) {
atom.add_atom_data(host_type);
csize++;
}
atom.copy_atom_data(csize,stream);
atom.time_atom.stop();
}
void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) {
_lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream);
}
// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// ---------------------------------------------------------------------------
template <class LJMTyp>
bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum,
int *ilist, const int *numj) {
if (nall>ljm.max_atoms)
return false;
ljm.nbor.time_nbor.start();
ljm.atom.nall(nall);
ljm.atom.inum(inum);
ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream);
ljm.nbor.time_nbor.stop();
return true;
}
bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
const int *numj) {
return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj);
}
// ---------------------------------------------------------------------------
// Copy a set of ij_size ij interactions to device and compute energies,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
template <class LJMTyp>
void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
ljm.nbor.time_nbor.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
ljm.nbor.time_nbor.start();
ljm.nbor.add(num_ij,ljm.pair_stream);
ljm.nbor.time_nbor.stop();
}
void lj_gpu_nbors(const int num_ij) {
_lj_gpu_nbors(LJMF,num_ij);
}
// ---------------------------------------------------------------------------
// Calculate energies and forces for all ij interactions
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));
ljm.time_pair.start();
if (ljm.shared_types)
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
vflag, ljm.atom.inum(), ljm.atom.nall());
else
kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
vflag, ljm.atom.inum(), ljm.atom.nall());
ljm.time_pair.stop();
}
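// Worked example of the launch geometry above (not in the original source):
// with BLOCK_1D=64 and inum=10000 local atoms, GX=ceil(10000/64.0)=157
// blocks of 64 threads are launched, one thread per i atom; threads with
// ii>=inum do no work because of the "if (ii<inum)" guard in the kernels.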
void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
_lj_gpu<PRECISION,ACC_PRECISION>(LJMF,eflag,vflag,rebuild);
}
// ---------------------------------------------------------------------------
// Get energies and forces to host
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial) {
double evdw;
ljm.atom.time_answer.start();
ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream);
ljm.atom.time_atom.add_to_total();
ljm.nbor.time_nbor.add_to_total();
ljm.time_pair.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed
evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
ljm.atom.add_forces(ilist,f);
ljm.atom.time_answer.stop();
ljm.atom.time_answer.add_to_total();
return evdw;
}
double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial) {
return _lj_gpu_forces<PRECISION,ACC_PRECISION>
(LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial);
}
void lj_gpu_time() {
cout.precision(4);
cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n";
cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n";
cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n";
}
int lj_gpu_num_devices() {
return LJMF.gpu.num_devices();
}
double lj_gpu_bytes() {
return LJMF.host_memory_usage();
}
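
Taken together, the functions above form a per-timestep protocol: copy atoms,
reset the neighbor list, stream ij indices to the device in ij_size chunks,
launch the kernels, and read back the answers. A minimal host-side driver
sketch follows; the numj/firstneigh indexing convention, the CPU fallback,
and the chunked reuse of the ij buffer are assumptions about the caller, not
part of this file:

// Hypothetical per-timestep driver for the exported lj_gpu_* API above.
void lj_gpu_step(double **x, const int *type, double **f, const int nall,
                 const int inum, int *ilist, const int *numj,
                 int **firstneigh, int *ij_buf, const int ij_size,
                 const bool rebuild, double *eatom, double **vatom,
                 double *virial) {
  lj_gpu_atom(x,type,rebuild);                 // positions (+types on rebuild)
  if (!lj_gpu_reset_nbors(nall,inum,ilist,numj))
    return;                                    // too many atoms: CPU fallback
  int num_ij=0;                                // stream ij pairs in chunks
  for (int ii=0; ii<inum; ii++) {
    int i=ilist[ii];
    for (int jj=0; jj<numj[ii]; jj++) {        // indexing convention assumed
      ij_buf[num_ij++]=firstneigh[i][jj];
      if (num_ij==ij_size) { lj_gpu_nbors(num_ij); num_ij=0; }
    }
  }
  if (num_ij>0) lj_gpu_nbors(num_ij);
  lj_gpu(true,true,rebuild);                   // compute with eflag=vflag set
  double evdwl=lj_gpu_forces(f,ilist,true,true,false,false,
                             eatom,vatom,virial);
  (void)evdwl;                                 // total vdW energy from GPU
}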

248
lib/gpu/lj_gpu_kernel.h Normal file
View File

@ -0,0 +1,248 @@
/***************************************************************************
lj_gpu_kernel.h
-------------------
W. Michael Brown
Routines that actually perform the force computation
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch, acctyp *ans,
size_t ans_pitch, const bool eflag,
const bool vflag, const int inum, const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the i atom handled by this thread
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x);
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,3);
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,3);
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<_cutsq_<numtyp>(itype,jtype)) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv =r2inv*r2inv*r2inv;
numtyp force =factor_lj*r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
_lj1_<numtyp>(itype,jtype).y);
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
_lj3_<numtyp>(itype,jtype).y);
energy+=factor_lj*(e-_offset_<numtyp>(itype,jtype));
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
}
}
*ap1=fx;
ap1+=ans_pitch;
*ap1=fy;
ap1+=ans_pitch;
*ap1=fz;
} // if ii
}
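// Sketch of the index encoding consumed by the loop above (this encoding is
// an assumption about the host neighbor build, not part of this file): for a
// special-bonded pair the host stores j+which*nall, where which=1..3 selects
// the 1-2/1-3/1-4 special_lj coefficient, so that sp_lj[j/nall] recovers the
// factor and j%=nall recovers the true atom index; ordinary pairs keep
// j<nall and take the factor 1.0 branch.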
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch,
acctyp *ans, size_t ans_pitch,const bool eflag,
const bool vflag, const int inum,
const int nall) {
// ii indexes the i atom handled by this thread
int ii=threadIdx.x;
__shared__ numtyp sp_lj[4];
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=special_lj[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=ii/MAX_SHARED_TYPES;
int jtype=ii%MAX_SHARED_TYPES;
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
if (eflag) {
lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
offset[ii]=_offset_<numtyp>(itype,jtype);
}
}
ii+=INT_MUL(blockIdx.x,blockDim.x);
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int mtype=itype+_x_<numtyp>(j,3);
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq[mtype]) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
energy+=factor_lj*(e-offset[mtype]);
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
}
}
*ap1=fx;
ap1+=ans_pitch;
*ap1=fy;
ap1+=ans_pitch;
*ap1=fz;
} // if ii
}
#endif

167
lib/gpu/lj_gpu_memory.cu Normal file
View File

@ -0,0 +1,167 @@
/***************************************************************************
lj_gpu_memory.cu
-------------------
W. Michael Brown
Global variables for GPU Lennard-Jones Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>
template <class numtyp, class acctyp>
int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int LJ_GPU_MemoryT::get_max_atoms(const int gpu_bytes, const int max_nbors) {
int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
bytes_per_atom(max_nbors));
if (matoms>MAX_ATOMS)
matoms=MAX_ATOMS;
return matoms;
}
template <class numtyp, class acctyp>
int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int max_nbors,
const int me) {
if (allocated)
clear();
if (me>=gpu.num_devices())
return 0;
gpu.set(me);
if (gpu.revision()<1.0)
return 0;
// Initialize timers for the selected GPU
time_pair.init();
// Initialize atom and nbor data
max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
atom.init(max_atoms);
nbor.init(ij_size,max_atoms,max_nbors);
// Get a stream for computing pair potentials
CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
// Use the write buffer from atom for data initialization
NVC_HostT &host_write=atom.host_write;
assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
// Copy data for bonded interactions
special_lj.safe_alloc(4);
special_lj.cast_copy(host_special_lj,host_write);
// Copy sigma, epsilon, and cutsq onto GPU
sigma.safe_alloc(ntypes,ntypes);
sigma.cast_copy(host_sigma[0],host_write);
epsilon.safe_alloc(ntypes,ntypes);
epsilon.cast_copy(host_epsilon[0],host_write);
cutsq.safe_alloc(ntypes,ntypes);
cutsq.cast_copy(host_cutsq[0],host_write);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
offset.safe_alloc(lj_types,lj_types);
offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes);
double *t1=host_lj1[0];
double *t2=host_lj2[0];
for (int i=0; i<lj_types*lj_types; i++) {
host_write[i*2]=t1[i];
host_write[i*2+1]=t2[i];
}
lj1.safe_alloc(lj_types,lj_types);
lj1.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()),
ntypes,ntypes);
t1=host_lj3[0];
t2=host_lj4[0];
for (int i=0; i<lj_types*lj_types; i++) {
host_write[i*2]=t1[i];
host_write[i*2+1]=t2[i];
}
lj3.safe_alloc(lj_types,lj_types);
lj3.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()),
ntypes,ntypes);
// Bind constant data to textures
sigma_bind_texture<numtyp>(sigma);
epsilon_bind_texture<numtyp>(epsilon);
cutsq_bind_texture<numtyp>(cutsq);
offset_bind_texture<numtyp>(offset);
lj1_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj1);
lj3_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj3);
dev_error.safe_alloc(1);
dev_error.zero();
allocated=true;
return nbor.host_ij.begin();
}
template <class numtyp, class acctyp>
void LJ_GPU_MemoryT::clear() {
if (!allocated)
return;
allocated=false;
// Check for any pair style specific errors here
int err_flag;
dev_error.copy_to_host(&err_flag);
atom.clear();
nbor.clear();
sigma_unbind_texture<numtyp>();
epsilon_unbind_texture<numtyp>();
cutsq_unbind_texture<numtyp>();
offset_unbind_texture<numtyp>();
lj1_unbind_texture<typename cu_vec_traits<numtyp>::vec2>();
lj3_unbind_texture<typename cu_vec_traits<numtyp>::vec2>();
CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream));
dev_error.clear();
sigma.clear();
epsilon.clear();
special_lj.clear();
cutsq.clear();
offset.clear();
lj1.clear();
lj3.clear();
}
template <class numtyp, class acctyp>
double LJ_GPU_MemoryT::host_memory_usage() const {
return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+
sizeof(LJ_GPU_Memory<numtyp,acctyp>);
}
template class LJ_GPU_Memory<PRECISION,ACC_PRECISION>;
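
As a worked instance of the sizing logic in get_max_atoms() above (assuming
the default single/double build, numtyp=float and acctyp=double, and a
hypothetical max_nbors=300):

// bytes_per_atom = atom_fields()*sizeof(float)+ans_fields()*sizeof(double)
//                + (max_nbors+3)*sizeof(int)   [no neighbor packing]
//                = 4*4 + 10*8 + 303*4 = 1308 bytes
// On a 1 GB card: 0.7*2^30/1308 is roughly 574000 atoms, which is then
// clamped to MAX_ATOMS=65536.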

88
lib/gpu/lj_gpu_memory.h Normal file
View File

@ -0,0 +1,88 @@
/***************************************************************************
lj_gpu_memory.h
-------------------
W. Michael Brown
Global variables for GPU Lennard-Jones Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef LJ_GPU_MEMORY_H
#define LJ_GPU_MEMORY_H
#include "nvc_device.h"
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor.h"
#define BLOCK_1D 64
#define MAX_SHARED_TYPES 8
#define PERCENT_GPU_MEMORY 0.7
template <class numtyp, class acctyp>
class LJ_GPU_Memory {
public:
LJ_GPU_Memory() : allocated(false) {}
~LJ_GPU_Memory() { clear(); }
/// Allocate memory on host and device
int* init(const int ij_size, const int ntypes, double **host_cutsq,
double **host_sigma, double **host_epsilon,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const int me);
/// Free any memory on host and device
void clear();
/// Returns memory usage on GPU per atom
int bytes_per_atom(const int max_nbors) const;
/// Maximum number of atoms that can be stored on GPU
int get_max_atoms(const int gpu_bytes, const int max_nbors);
/// Total host memory used by library
double host_memory_usage() const;
// ------------------------- DATA -----------------------------
// Device Properties
NVCDevice gpu;
// Device Error Flag
NVC_VecI dev_error;
// Stream for asynchronous work
cudaStream_t pair_stream;
// Atom Data
PairGPUAtom<numtyp,acctyp> atom;
// Neighbor Data
PairGPUNbor nbor;
// --------------- Const Data for Atoms
NVC_ConstMatT sigma, epsilon, cutsq, offset;
NVC_ConstMat< typename cu_vec_traits<numtyp>::vec2 > lj1, lj3;
NVC_VecT special_lj;
size_t max_atoms;
// Timing for pair calculation
NVCTimer time_pair;
// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
protected:
bool allocated;
};
#endif

96
lib/gpu/nvc_device.cu Normal file
View File

@ -0,0 +1,96 @@
/***************************************************************************
nvc_device.cu
-------------------
W. Michael Brown
Utilities for dealing with CUDA devices
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "nvc_macros.h"
#include "nvc_device.h"
// Grabs the properties for all devices
NVCDevice::NVCDevice() {
CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices));
for (int dev=0; dev<_num_devices; ++dev) {
cudaDeviceProp deviceProp;
CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));
if (deviceProp.major == 9999 && deviceProp.minor == 9999)
break;
_properties.push_back(deviceProp);
}
_device=0;
}
// Set the CUDA device to the specified device number
void NVCDevice::set(int num) {
if (_device==num)
return;
cudaThreadExit();
CUDA_SAFE_CALL(cudaSetDevice(num));
_device=num;
}
// List all devices along with all properties
void NVCDevice::print_all(ostream &out) {
if (num_devices() == 0)
printf("There is no device supporting CUDA\n");
for (int i=0; i<num_devices(); ++i) {
printf("\nDevice %d: \"%s\"\n", i, name(i).c_str());
printf(" Revision number: %.1f\n", revision(i));
printf(" Total amount of global memory: %.2f GB\n",
gigabytes(i));
#if CUDART_VERSION >= 2000
printf(" Number of multiprocessors: %d\n",
_properties[i].multiProcessorCount);
printf(" Number of cores: %d\n",cores(i));
#endif
printf(" Total amount of constant memory: %u bytes\n",
_properties[i].totalConstMem);
printf(" Total amount of shared memory per block: %u bytes\n",
_properties[i].sharedMemPerBlock);
printf(" Total number of registers available per block: %d\n",
_properties[i].regsPerBlock);
printf(" Warp size: %d\n",
_properties[i].warpSize);
printf(" Maximum number of threads per block: %d\n",
_properties[i].maxThreadsPerBlock);
printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
_properties[i].maxThreadsDim[0],
_properties[i].maxThreadsDim[1],
_properties[i].maxThreadsDim[2]);
printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
_properties[i].maxGridSize[0],
_properties[i].maxGridSize[1],
_properties[i].maxGridSize[2]);
printf(" Maximum memory pitch: %u bytes\n",
_properties[i].memPitch);
printf(" Texture alignment: %u bytes\n",
_properties[i].textureAlignment);
printf(" Clock rate: %.2f GHz\n",
clock_rate(i));
#if CUDART_VERSION >= 2000
printf(" Concurrent copy and execution: %s\n",
_properties[i].deviceOverlap ? "Yes" : "No");
#endif
}
}
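
A short usage sketch for this class (the helper below is hypothetical, not
part of the library): pick the CUDA device with the most global memory and
make it current.

#include "nvc_device.h"
int pick_biggest_device(NVCDevice &gpu) {
  int best=0;
  for (int i=1; i<gpu.num_devices(); i++)
    if (gpu.bytes(i)>gpu.bytes(best))
      best=i;
  gpu.set(best);   // no-op if this device is already current
  return best;
}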

92
lib/gpu/nvc_device.h Normal file
View File

@ -0,0 +1,92 @@
/***************************************************************************
nvc_device.h
-------------------
W. Michael Brown
Utilities for dealing with CUDA devices
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef NVC_DEVICE
#define NVC_DEVICE
#include <string>
#include <vector>
#include <iostream>
using namespace std;
/// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect
* behavior
* \note There is no error checking for indexing past the number of devices **/
class NVCDevice {
public:
/// Grabs the properties for all devices
NVCDevice();
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
/// Set the CUDA device to the specified device number
void set(int num);
/// Get the current device number
inline int device_num() { return _device; }
/// Get the current CUDA device name
inline string name() { return name(_device); }
/// Get the CUDA device name
inline string name(const int i) { return string(_properties[i].name); }
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ return _properties[i].multiProcessorCount*8; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
/// Return the GPGPU revision number for current device
inline double revision() { return revision(_device); }
/// Return the GPGPU revision number
inline double revision(const int i)
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;}
/// List all devices along with all properties
void print_all(ostream &out);
private:
int _device, _num_devices;
vector<cudaDeviceProp> _properties;
};
#endif

31
lib/gpu/nvc_get_devices.cu Normal file
View File

@ -0,0 +1,31 @@
/***************************************************************************
nvc_get_devices.cu
-------------------
W. Michael Brown
List properties of CUDA devices
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#include "nvc_device.h"
int main(int argc, char** argv) {
NVCDevice gpu;
gpu.print_all(cout);
return 0;
}

121
lib/gpu/nvc_macros.h Normal file
View File

@ -0,0 +1,121 @@
#ifndef NVC_MACROS_H
#define NVC_MACROS_H
#include <stdio.h>
#include "math_constants.h"
#define INT_MUL(x,y) (__mul24(x,y))
//#define INT_MUL(x,y) ((x)*(y))
template <class numbr>
static __inline__ __device__ numbr cuda_inf() { return CUDART_INF_F; }
#ifdef CUDA_DOUBLE
template <>
static __inline__ __device__ double cuda_inf<double>() { return CUDART_INF; }
#endif
template <class numbr>
static __inline__ __device__ numbr cuda_zero() { return 0.0; }
template <>
static __inline__ __device__ float cuda_zero<float>() { return 0.0f; }
#ifdef DEBUG
# define CU_SAFE_CALL_NO_SYNC( call ) do { \
CUresult err = call; \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CU_SAFE_CALL( call ) do { \
CU_SAFE_CALL_NO_SYNC(call); \
CUresult err = cuCtxSynchronize(); \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUDA_SAFE_CALL_NO_SYNC( call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUDA_SAFE_CALL( call) do { \
CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUFFT_SAFE_CALL( call) do { \
cufftResult err = call; \
if( CUFFT_SUCCESS != err) { \
fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUT_SAFE_CALL( call) \
if( CUTTrue != call) { \
fprintf(stderr, "Cut error in file '%s' in line %i.\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
}
//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} } while (0)
//! Check for malloc error
# define CUT_SAFE_MALLOC( mallocCall ) do{ \
if( !(mallocCall)) { \
fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} } while(0)
//! Check if conditon is true (flexible assert)
# define CUT_CONDITION( val) \
if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) { \
exit(EXIT_FAILURE); \
}
#else // not DEBUG
#define CUT_BANK_CHECKER( array, index) array[index]
// void macros for performance reasons
# define CUT_CHECK_ERROR(errorMessage)
# define CUT_CHECK_ERROR_GL()
# define CUT_CONDITION( val)
# define CU_SAFE_CALL_NO_SYNC( call) call
# define CU_SAFE_CALL( call) call
# define CUDA_SAFE_CALL_NO_SYNC( call) call
# define CUDA_SAFE_CALL( call) call
# define CUT_SAFE_CALL( call) call
# define CUFFT_SAFE_CALL( call) call
# define CUT_SAFE_MALLOC( mallocCall ) mallocCall
#endif
#endif
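
Usage sketch for the guarded-call macros above: in -DDEBUG builds every call
is checked and synchronized, otherwise each macro expands to the bare call.
The function and buffer here are hypothetical.

#include "nvc_macros.h"
void copy_example(const float *host_src, const size_t n) {
  float *dev;
  CUDA_SAFE_CALL(cudaMalloc((void **)&dev,n*sizeof(float)));
  CUDA_SAFE_CALL(cudaMemcpy(dev,host_src,n*sizeof(float),
                            cudaMemcpyHostToDevice));
  CUT_CHECK_ERROR("copy_example failed");  // no-op unless DEBUG is defined
  CUDA_SAFE_CALL(cudaFree(dev));
}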

447
lib/gpu/nvc_memory.h Normal file
View File

@ -0,0 +1,447 @@
/***************************************************************************
nvc_memory.h
-------------------
W. Michael Brown
Routines for memory management on CUDA devices
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef NVC_MEMORY_H
#define NVC_MEMORY_H
#include <iostream>
#define NVC_HostT NVC_Host<numtyp>
#define NVC_HostD NVC_Host<double>
#define NVC_HostS NVC_Host<float>
#define NVC_HostI NVC_Host<int>
#define NVC_VecT NVC_Vec<numtyp>
#define NVC_VecD NVC_Vec<double>
#define NVC_VecS NVC_Vec<float>
#define NVC_VecI NVC_Vec<int>
#define NVC_VecI2 NVC_Vec<int2>
#define NVC_VecU2 NVC_Vec<uint2>
#define NVC_MatT NVC_Mat<numtyp>
#define NVC_MatD NVC_Mat<double>
#define NVC_MatS NVC_Mat<float>
#define NVC_MatI NVC_Mat<int>
#define NVC_ConstMatT NVC_ConstMat<numtyp>
#define NVC_ConstMatD NVC_ConstMat<double>
#define NVC_ConstMatS NVC_ConstMat<float>
#define NVC_ConstMatI NVC_ConstMat<int>
#define NVC_ConstMatD2 NVC_ConstMat<double2>
namespace NVC {
// Get a channel for float array
template <class numtyp>
inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) {
channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
}
// Get a channel for float2 array
template <>
inline void cuda_gb_get_channel<float2>(cudaChannelFormatDesc &channel) {
channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
}
// Get a channel for double array
template <>
inline void cuda_gb_get_channel<double>(cudaChannelFormatDesc &channel) {
channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned);
}
// Get a channel for double2 array
template <>
inline void cuda_gb_get_channel<double2>(cudaChannelFormatDesc &channel) {
channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned);
}
// Get a channel for int array
template <>
inline void cuda_gb_get_channel<int>(cudaChannelFormatDesc &channel) {
channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned);
}
}
/// Page-locked Row Vector on Host
template <class numtyp>
class NVC_Host {
public:
NVC_Host() { _cols=0; }
~NVC_Host() { if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); }
// Allocate page-locked memory with fast write/slow read on host
inline void safe_alloc_w(const size_t cols) {
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes,
cudaHostAllocWriteCombined));
_end=_array+cols;
}
// Allocate page-locked memory with fast read/write on host
inline void safe_alloc_rw(const size_t cols) {
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes));
_end=_array+cols;
}
/// Free any memory associated with device
inline void clear()
{ if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }
/// Set each element to zero
inline void zero() { memset(_array,0,row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); }
inline numtyp * begin() { return _array; }
inline const numtyp * begin() const { return _array; }
inline numtyp * end() { return _end; }
inline const numtyp * end() const { return _end; }
inline size_t numel() const { return _cols; }
inline size_t rows() const { return 1; }
inline size_t cols() const { return _cols; }
inline size_t row_size() const { return _cols; }
inline size_t row_bytes() const { return _row_bytes; }
inline numtyp & operator[](const int i) { return _array[i]; }
inline const numtyp & operator[](const int i) const { return _array[i]; }
/// Copy from device (numel is not bytes)
inline void copy_from_device(const numtyp *device_p, size_t numel) {
CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp),
cudaMemcpyDeviceToHost));
}
/// Copy to device (numel is not bytes)
inline void copy_to_device(numtyp *device_p, size_t numel) {
CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp),
cudaMemcpyHostToDevice));
}
/// Copy to 2D matrix on device (numel is not bytes)
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
const size_t rows, const size_t cols) {
CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp),
_array,cols*sizeof(numtyp),
cols*sizeof(numtyp),rows,
cudaMemcpyHostToDevice));
}
/// Asynchronous copy from device (numel is not bytes)
inline void copy_from_device(const numtyp *device_p, size_t numel,
cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
cudaMemcpyDeviceToHost,stream));
}
/// Asynchronous copy to device (numel is not bytes)
inline void copy_to_device(numtyp *device_p, size_t numel,
cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
cudaMemcpyHostToDevice,stream));
}
/// Asynchronous copy to 2D matrix on device (numel is not bytes)
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
const size_t rows, const size_t cols,
cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp),
_array,cols*sizeof(numtyp),
cols*sizeof(numtyp),rows,
cudaMemcpyHostToDevice,stream));
}
private:
numtyp *_array, *_end;
size_t _row_bytes, _row_size, _rows, _cols;
};
/// Row vector on device
template <class numtyp>
class NVC_Vec {
public:
NVC_Vec() { _cols=0; }
~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); }
// Row vector on device
inline void safe_alloc(const size_t cols) {
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes));
_end=_array+cols;
}
/// Free any memory associated with device
inline void clear()
{ if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } }
/// Set each element to zero
inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); }
inline numtyp * begin() { return _array; }
inline const numtyp * begin() const { return _array; }
inline numtyp * end() { return _end; }
inline const numtyp * end() const { return _end; }
inline size_t numel() const { return _cols; }
inline size_t rows() const { return 1; }
inline size_t cols() const { return _cols; }
inline size_t row_size() const { return _cols; }
inline size_t row_bytes() const { return _row_bytes; }
/// Copy from host
inline void copy_from_host(const numtyp *host_p)
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(),
cudaMemcpyHostToDevice)); }
/// Asynchronous copy from host
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
{ CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(),
cudaMemcpyHostToDevice, stream)); }
/// Copy to host
inline void copy_to_host(numtyp *host_p)
{ CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),
cudaMemcpyDeviceToHost)); }
/// Copy n elements to host
inline void copy_to_host(numtyp *host_p, const int n)
{ CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp),
cudaMemcpyDeviceToHost)); }
/// Cast and then copy to device
template <class numtyp2>
inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
for (int i=0; i<numel(); i++)
host_write[i]=static_cast<numtyp>(buffer[i]);
copy_from_host(host_write.begin());
}
/// Bind to texture
template <class texture>
inline void bind_texture(texture &texi, cudaChannelFormatDesc &channel) {
NVC::cuda_gb_get_channel<numtyp>(channel);
texi.addressMode[0] = cudaAddressModeClamp;
texi.addressMode[1] = cudaAddressModeClamp;
texi.filterMode = cudaFilterModePoint;
texi.normalized = false;
CUDA_SAFE_CALL(cudaBindTexture(NULL,&texi,_array,&channel));
}
/// Output the vector (debugging)
inline void print(std::ostream &out) { print (out, numel()); }
// Output first n elements of vector
inline void print(std::ostream &out, const int n) {
numtyp *t=new numtyp[n];
copy_to_host(t,n);
for (int i=0; i<n; i++)
out << t[i] << " ";
delete []t;
}
private:
numtyp *_array, *_end;
size_t _row_bytes, _row_size, _rows, _cols;
};
/// 2D Matrix on device (can have extra column storage to get correct alignment)
template <class numtyp>
class NVC_Mat {
public:
NVC_Mat() { _rows=0; }
~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); }
// Row major matrix on device
// - Coalesced access using adjacent cols on same row
// - NVC_Mat(row,col) given by array[row*row_size()+col]
inline void safe_alloc(const size_t rows, const size_t cols) {
_rows=rows;
_cols=cols;
CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch,
cols*sizeof(numtyp),rows));
_row_size=_pitch/sizeof(numtyp);
_end=_array+_row_size*rows;
}
/// Free any memory associated with device
inline void clear()
{ if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } }
/// Set each element to zero
inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0, _pitch*_rows)); }
inline numtyp * begin() { return _array; }
inline const numtyp * begin() const { return _array; }
inline numtyp * end() { return _end; }
inline const numtyp * end() const { return _end; }
inline size_t numel() const { return _cols*_rows; }
inline size_t rows() const { return _rows; }
inline size_t cols() const { return _cols; }
inline size_t row_size() const { return _row_size; }
inline size_t row_bytes() const { return _pitch; }
/// Copy from host (elements not bytes)
inline void copy_from_host(const numtyp *host_p, const size_t numel)
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp),
cudaMemcpyHostToDevice)); }
/// Asynchronous copy from host (elements not bytes)
inline void copy_from_host(const numtyp *host_p, const size_t numel,
cudaStream_t &stream)
{ CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
cudaMemcpyHostToDevice, stream)); }
/// Asynchronous Copy from Host
/** \note Used when the number of columns/rows allocated on the host is
* smaller than on the device **/
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
const size_t cols, cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp),
cols*sizeof(numtyp), rows,
cudaMemcpyHostToDevice,stream));
}
private:
numtyp *_array, *_end;
size_t _pitch, _row_size, _rows, _cols;
};
/// Const 2D Matrix on device (requires texture binding)
template <class numtyp>
class NVC_ConstMat {
public:
NVC_ConstMat() { _rows=0; }
~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); }
/// Row major matrix on device
inline void safe_alloc(const size_t rows, const size_t cols) {
_rows=rows;
_cols=cols;
NVC::cuda_gb_get_channel<numtyp>(_channel);
CUDA_SAFE_CALL(cudaMallocArray(&_array, &_channel, cols, rows));
}
/// Bind to texture
template <class texture>
inline void bind_texture(texture &texi) {
texi.addressMode[0] = cudaAddressModeClamp;
texi.addressMode[1] = cudaAddressModeClamp;
texi.filterMode = cudaFilterModePoint;
texi.normalized = false;
CUDA_SAFE_CALL(cudaBindTextureToArray(&texi,_array,&_channel));
}
/// Free any memory associated with device
inline void clear()
{ if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFreeArray(_array)); } }
inline size_t numel() const { return _cols*_rows; }
inline size_t rows() const { return _rows; }
inline size_t cols() const { return _cols; }
inline size_t row_size() const { return _cols; }
inline size_t row_bytes() const { return _cols*sizeof(numtyp); }
/// Copy from Host
inline void copy_from_host(const numtyp *host_p) {
CUDA_SAFE_CALL(cudaMemcpyToArray(_array, 0, 0, host_p,
numel()*sizeof(numtyp),
cudaMemcpyHostToDevice));
}
/// Copy from Host
/** \note Used when the number of columns/rows allocated on the host is
* smaller than on the device **/
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
const size_t cols) {
CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array, 0, 0, host_p,
cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
cudaMemcpyHostToDevice));
}
/// Asynchronous Copy from Host
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
numel()*sizeof(numtyp),
cudaMemcpyHostToDevice,stream));
}
/// Asynchronous Copy from Host
/** \note Used when the number of columns/rows allocated on the host is
* smaller than on the device **/
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
const size_t cols, cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
cudaMemcpyHostToDevice,stream));
}
/// Cast buffer to numtyp in host_write and copy to array
template <class numtyp2>
inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
int n=numel();
for (int i=0; i<n; i++) {
host_write[i]=static_cast<numtyp>(*buffer); buffer++;
}
copy_from_host(host_write.begin());
}
/// Cast buffer to numtyp in host_write and copy to array
/** \note Used when the number of columns/rows allocated on the host is
* smaller than on the device **/
template <class numtyp2>
inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write,
const size_t rows, const size_t cols) {
int n=rows*cols;
for (int i=0; i<n; i++) {
host_write[i]=static_cast<numtyp>(*buffer); buffer++;
}
copy_2Dfrom_host(host_write.begin(),rows,cols);
}
/// Cast buffer to numtyp in host_write and copy to array asynchronously
template <class numtyp2>
inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write,
cudaStream_t &stream) {
int n=numel();
for (int i=0; i<n; i++) {
host_write[i]=static_cast<numtyp>(*buffer); buffer++;
}
copy_from_host(host_write.begin(),stream);
}
private:
size_t _rows, _cols;
cudaArray *_array;
cudaChannelFormatDesc _channel;
};
#endif
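
A sketch of the intended flow between the containers above (the function and
sizes are hypothetical): cast double coefficients through a page-locked
NVC_Host staging buffer into a device-resident NVC_Vec.

#include "nvc_macros.h"   // CUDA_SAFE_CALL, used by the containers
#include "nvc_memory.h"
void upload_coeffs(const double *coeffs, const int n) {
  NVC_HostS host_write;          // page-locked float staging buffer
  host_write.safe_alloc_w(n);    // write-combined: host writes, device reads
  NVC_VecS dev_coeffs;           // device row vector
  dev_coeffs.safe_alloc(n);
  dev_coeffs.cast_copy(coeffs,host_write);  // double -> float, then H2D copy
  // ... launch kernels using dev_coeffs.begin() ...
  dev_coeffs.clear();
  host_write.clear();
}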

83
lib/gpu/nvc_timer.h Normal file
View File

@ -0,0 +1,83 @@
/***************************************************************************
nvc_timer.h
-------------------
W. Michael Brown
Class for timing CUDA routines
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Tue Feb 3 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef NVC_TIMER_H
#define NVC_TIMER_H
#include "nvc_macros.h"
#define cudaEventDestroy(a)
/// Class for timing CUDA events
class NVCTimer {
public:
NVCTimer() : _total_time(0.0f), initialized(false) { }
~NVCTimer() {
if (initialized)
{ cudaEventDestroy(start_event); cudaEventDestroy(stop_event); }
}
inline void init() {
if (!initialized) {
initialized=true;
CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );
}
}
/// Start timing
inline void start() { cudaEventRecord(start_event,0); }
/// Stop timing and store event time
inline void stop() { cudaEventRecord(stop_event,0); }
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ cudaEventRecord(start_event,0); cudaEventRecord(stop_event,0); }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline void add_to_total() { _total_time+=time(); }
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
float timer;
cudaEventSynchronize(stop_event);
CUDA_SAFE_CALL( cudaEventElapsedTime(&timer,start_event,stop_event) );
return timer;
}
/// Return the total time in ms
inline double total_time() { return _total_time; }
/// Return the total time in seconds
inline double total_seconds() { return _total_time/1000.0; }
private:
cudaEvent_t start_event, stop_event;
double _total_time;
bool initialized;
};
#endif
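
Typical use of NVCTimer around asynchronous work (the kernel launch in the
comment is hypothetical):

#include "nvc_timer.h"
void timed_section() {
  NVCTimer t;
  t.init();           // create the CUDA events once
  t.start();          // record the start event on stream 0
  // hypothetical_kernel<<<1,64>>>();   // the work being timed
  t.stop();           // record the stop event
  t.add_to_total();   // synchronizes on the stop event, accumulates ms
  printf("section took %f s total\n",t.total_seconds());
}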

173
lib/gpu/pair_gpu_atom.cu Normal file
View File

@ -0,0 +1,173 @@
/***************************************************************************
pair_gpu_atom.cu
-------------------
W. Michael Brown
Memory routines for moving atom and force data between host and gpu
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#include "pair_gpu_atom.h"
#define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
template <class numtyp, class acctyp>
int PairGPUAtomT::bytes_per_atom() const {
return atom_fields()*sizeof(numtyp)+ans_fields()*sizeof(acctyp);
}
template <class numtyp, class acctyp>
void PairGPUAtomT::init(const int max_atoms) {
if (allocated)
clear();
_max_atoms=max_atoms;
// Initialize timers for the selected GPU
time_atom.init();
time_answer.init();
// Device matrices for atom and force data
dev_x.safe_alloc(atom_fields(),max_atoms);
x_bind_texture<numtyp>(dev_x);
ans.safe_alloc(ans_fields(),max_atoms);
// Get a host write only buffer
host_write.safe_alloc_w(max_atoms*4);
// Get a host read/write buffer
host_read.safe_alloc_rw(ans.row_size()*ans_fields());
allocated=true;
}
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
if (!allocated)
return;
allocated=false;
x_unbind_texture<numtyp>();
ans.clear();
host_write.clear();
host_read.clear();
dev_x.clear();
}
template <class numtyp, class acctyp>
double PairGPUAtomT::host_memory_usage(const int max_atoms) const {
return max_atoms*atom_fields()*sizeof(numtyp)+
ans_fields()*(max_atoms)*sizeof(acctyp)+
sizeof(PairGPUAtom<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
cudaStream_t &s) {
_eflag=eflag;
_vflag=vflag;
int csize=ans_fields();
if (!eflag)
csize--;
if (!vflag)
csize-=6;
host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s);
}
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom,
const bool vflag_atom, double *eatom,
double **vatom, double *virial) {
double evdwl=0.0;
int gap=ans.row_size()-_inum;
acctyp *ap=host_read.begin();
if (_eflag) {
if (eflag_atom) {
for (int i=0; i<_inum; i++) {
evdwl+=*ap;
eatom[ilist[i]]+=*ap*0.5;
ap++;
}
} else
for (int i=0; i<_inum; i++) {
evdwl+=*ap;
ap++;
}
ap+=gap;
evdwl*=0.5;
}
_read_loc=ap;
gap=ans.row_size();
if (_vflag) {
if (vflag_atom) {
for (int ii=0; ii<_inum; ii++) {
int i=ilist[ii];
ap=_read_loc+ii;
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=gap;
}
}
} else {
for (int ii=0; ii<_inum; ii++) {
ap=_read_loc+ii;
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=gap;
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
_read_loc+=gap*6;
}
return evdwl;
}
template <class numtyp, class acctyp>
void PairGPUAtomT::add_forces(const int *ilist, double **f) {
int gap=ans.row_size();
for (int ii=0; ii<_inum; ii++) {
acctyp *ap=_read_loc+ii;
int i=ilist[ii];
f[i][0]+=*ap;
ap+=gap;
f[i][1]+=*ap;
ap+=gap;
f[i][2]+=*ap;
}
}
template <class numtyp, class acctyp>
void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) {
int gap=ans.row_size();
_read_loc+=gap*3;
for (int ii=0; ii<n; ii++) {
acctyp *ap=_read_loc+ii;
int i=ilist[ii];
tor[i][0]+=*ap;
ap+=gap;
tor[i][1]+=*ap;
ap+=gap;
tor[i][2]+=*ap;
}
}
template class PairGPUAtom<PRECISION,ACC_PRECISION>;
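
The readback in copy_answers()/energy_virial()/add_forces() above assumes a
field-major layout of the ans matrix, one padded row of length row_size() per
answer field. A sketch of the resulting host_read indexing for the default
ans_fields()=10 with eflag and vflag set:

// host_read[f*row_size()+ii] holds field f for local atom ii:
//   f=0    : pairwise energy (halved on the host for double counting)
//   f=1..6 : the six virial components
//   f=7..9 : fx, fy, fz
// With eflag=false the energy row is absent (csize--); with vflag=false the
// six virial rows are absent (csize-=6), matching copy_answers().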

151
lib/gpu/pair_gpu_atom.h Normal file
View File

@ -0,0 +1,151 @@
/***************************************************************************
pair_gpu_atom.h
-------------------
W. Michael Brown
Memory routines for moving atom and force data between host and gpu
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H
// PRECISION - Precision for rsq, energy, force, and torque calculation
// ACC_PRECISION - Precision for accumulation of energies, forces, and torques
#ifdef _SINGLE_DOUBLE
#define PRECISION float
#define ACC_PRECISION double
#endif
#ifdef _DOUBLE_DOUBLE
#define PRECISION double
#define ACC_PRECISION double
#endif
#ifndef PRECISION
#define PRECISION float
#define ACC_PRECISION double
#endif
#define MAX_ATOMS 65536
#include "nvc_timer.h"
#include "pair_gpu_texture.h"
template <class numtyp, class acctyp>
class PairGPUAtom {
public:
PairGPUAtom() : _atom_fields(4), _ans_fields(10), allocated(false) {}
~PairGPUAtom() { clear(); }
// Accessors
inline int atom_fields() const { return _atom_fields; }
inline int ans_fields() const { return _ans_fields; }
inline int max_atoms() const { return _max_atoms; }
inline int nall() const { return _nall; }
inline int inum() const { return _inum; }
/// Set number of atoms for future copy operations
inline void nall(const int n) { _nall=n; }
/// Set number of inum for future copy operations
inline void inum(const int n) { _inum=n; }
/// Set the number of atom fields (x, y, z, type, etc)
inline void atom_fields(const int n) { _atom_fields=n; }
/// Set the number of answer fields (energy, virial, force, etc.)
inline void ans_fields(const int n) { _ans_fields=n; }
/// Memory usage per atom in this class
/** \note atom_fields and ans_fields should be set first for a correct result **/
int bytes_per_atom() const;
/// Must be called once to allocate host and device memory
/** \note atom_fields and ans_fields should be set first if not default **/
void init(const int max_atoms);
/// Free all memory on host and device
void clear();
/// Return the total amount of host memory used by class
double host_memory_usage(const int max_atoms) const;
// -------------------------COPY TO GPU ----------------------------------
/// Reset the write buffer pointer (Start copying new atom data)
inline void reset_write_buffer() { _write_loc=host_write.begin(); }
/// Add a row to write buffer with unit stride
/** Copies nall() elements **/
template<class cpytyp>
inline void add_atom_data(const cpytyp *host_ptr)
{ for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i]; _write_loc++; } }
/// Add a row to write buffer with non-unit stride
/** Copies nall() elements **/
template<class cpytyp>
inline void add_atom_data(const cpytyp *hostptr, const int stride) {
int t=_nall*stride;
for (int i=0; i<t; i+=stride) { *_write_loc=hostptr[i]; _write_loc++; }
}
/// Copy a num_rows x nall() block of the write buffer to dev_x on the GPU
/** num_rows<=atom_fields() **/
inline void copy_atom_data(const int num_rows, cudaStream_t &stream)
{ dev_x.copy_2Dfrom_host(host_write.begin(),num_rows,nall(),stream); }
// -------------------------COPY FROM GPU -------------------------------
/// Copy answers from GPU into read buffer
void copy_answers(const bool eflag, const bool vflag, cudaStream_t &s);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(const int *ilist, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial);
/// Add forces from the GPU into a LAMMPS pointer
void add_forces(const int *ilist, double **f);
/// Add torques from the GPU into a LAMMPS pointer
void add_torques(const int *ilist, double **tor, const int n);
// ------------------------------ DATA ----------------------------------
// atom_fields() x n (example: rows 1-3 position, 4 is type)
NVC_ConstMatT dev_x;
// ans_fields() x n Storage for Forces, etc.
// example: if (eflag and vflag) 1 is energy, 2-7 is virial, 8-10 is force
// example: if (!eflag) 1-3 is force
NVC_Mat<acctyp> ans;
// Buffer for moving floating point data to GPU
NVC_HostT host_write;
// Buffer for moving floating point data to CPU
NVC_Host<acctyp> host_read;
// Timing Stuff
NVCTimer time_atom, time_answer;
private:
bool allocated, _eflag, _vflag;
int _atom_fields, _ans_fields;
int _max_atoms, _nall, _inum;
numtyp * _write_loc;
acctyp * _read_loc;
};
#endif

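The copy-to-GPU path above is buffer-based: rows of atom data are staged into host_write and then moved to dev_x as one 2D block. A minimal host-side sketch, assuming the usual LAMMPS n x 3 position array and an already-created stream (all names outside pair_gpu_atom.h are illustrative):

#include "pair_gpu_atom.h"

// Sketch: stage positions and types for nall atoms, then copy to the GPU.
// Assumes atom.init(max_atoms) was called once beforehand.
void copy_atoms_sketch(PairGPUAtom<PRECISION,ACC_PRECISION> &atom,
double **host_x, int *host_type, int nall,
cudaStream_t stream) {
atom.nall(nall); // local atoms plus ghosts for this copy
atom.reset_write_buffer();
atom.add_atom_data(host_x[0],3); // x coordinates (stride 3)
atom.add_atom_data(host_x[0]+1,3); // y coordinates
atom.add_atom_data(host_x[0]+2,3); // z coordinates
atom.add_atom_data(host_type); // types, unit stride
atom.copy_atom_data(4,stream); // 4 rows of nall values -> dev_x
}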
lib/gpu/pair_gpu_nbor.cu Normal file

@ -0,0 +1,80 @@
/***************************************************************************
pair_gpu_nbor.cu
-------------------
W. Michael Brown
Neighbor memory operations for LAMMPS GPU Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#include "pair_gpu_nbor.h"
int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
if (_use_packing)
return (max_nbors*2+4)*sizeof(int);
else
return (max_nbors+3)*sizeof(int);
}
void PairGPUNbor::init(const int ij_size, const int max_atoms,
const int max_nbors) {
if (allocated)
clear();
// Initialize timers for the selected GPU
time_nbor.init();
if (_use_packing)
dev_nbor.safe_alloc(max_nbors+4,max_atoms);
else
dev_nbor.safe_alloc(3,max_atoms);
ij.safe_alloc(max_nbors*max_atoms);
host_ij.safe_alloc_w(ij_size);
allocated=true;
}
void PairGPUNbor::clear() {
if (!allocated)
return;
allocated=false;
ij.clear();
host_ij.clear();
dev_nbor.clear();
}
double PairGPUNbor::host_memory_usage() const {
return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor);
}
void PairGPUNbor::reset(const int inum, int *ilist, const int *numj,
cudaStream_t &s) {
ij_total=0;
// row 1 of dev_nbor: the local atom indices i
dev_nbor.copy_from_host(ilist,inum);
// rows 2 and 3: neighbor counts and starting offsets into ij
int acc=0;
for (int i=0; i<inum; i++) {
host_ij[i]=numj[ilist[i]];
host_ij[i+inum]=acc;
acc+=numj[ilist[i]];
}
host_ij.copy_to_2Ddevice(dev_nbor.begin()+dev_nbor.row_size(),
dev_nbor.row_size(),2,inum, s);
}

lib/gpu/pair_gpu_nbor.h Normal file

@ -0,0 +1,91 @@
/***************************************************************************
pair_gpu_nbor.h
-------------------
W. Michael Brown
Neighbor memory operations for LAMMPS GPU Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#ifndef PAIR_GPU_NBOR_H
#define PAIR_GPU_NBOR_H
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_memory.h"
#define BLOCK_1D 64
#define IJ_SIZE 131072
class PairGPUNbor {
public:
PairGPUNbor() : _use_packing(false), allocated(false) {}
~PairGPUNbor() { clear(); }
/// Determine whether neighbor packing should be used
/** If true, twice as much memory is reserved to allow packing neighbors by
* atom for coalesced access after cutoff evaluation. This is useful for
* expensive potentials, where evaluating the cutoff separately from the
* potential reduces thread divergence in the costly inner routines. **/
void packing(const bool use_packing) { _use_packing=use_packing; }
/// Called once to allocate memory
void init(const int ij_size, const int max_atoms, const int max_nbors);
/// Free all memory on host and device
void clear();
/// Bytes per atom used on device
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by class
double host_memory_usage() const;
/// Reset neighbor data (first time or from a rebuild)
void reset(const int inum, int *ilist, const int *numj, cudaStream_t &s);
/// Add neighbor data from host
inline void add(const int num_ij, cudaStream_t &s)
{ host_ij.copy_to_device(ij.begin()+ij_total,num_ij,s); ij_total+=num_ij; }
/// Pack neighbors satisfying cutoff by atom for coalesced access
void pack_nbors(const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high);
// ------------------------------- Data -------------------------------
// Store IJ interactions on device
NVC_VecI ij;
// Buffer for moving ij data to GPU
NVC_HostI host_ij;
// --------------- Atom neighbors
// 3 x n
// - 1st row is i
// - 2nd row is numj (number of neighbors)
// - 3rd row is starting address in host_ij of neighbors
NVC_MatI dev_nbor;
// --------------- Timing Stuff
NVCTimer time_nbor;
int ij_total;
private:
bool allocated, _use_packing;
};
#endif

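A minimal per-rebuild sequence for PairGPUNbor, assuming ilist and numj come from a full LAMMPS neighbor list and the caller stages j indices into host_ij in chunks, as the pair styles below do (everything outside pair_gpu_nbor.h is illustrative):

#include "pair_gpu_nbor.h"

// One-time setup (assumed, before any rebuild):
// nbor.packing(false); nbor.init(IJ_SIZE,max_atoms,max_nbors);
void nbor_rebuild_sketch(PairGPUNbor &nbor, int inum, int *ilist,
const int *numj, const int *j_chunk,
int chunk_len, cudaStream_t s) {
nbor.reset(inum,ilist,numj,s); // i, numj, start offsets -> dev_nbor
for (int i=0; i<chunk_len; i++) // stage one chunk of j indices ...
nbor.host_ij[i]=j_chunk[i];
nbor.add(chunk_len,s); // ... and append it to ij on the device
}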
lib/gpu/pair_gpu_texture.h Normal file

@ -0,0 +1,292 @@
/***************************************************************************
pair_gpu_texture.h
-------------------
W. Michael Brown
Tricks for templating textures
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
#include "nvc_memory.h"
#ifndef PAIR_GPU_TEXTURE_H
#define PAIR_GPU_TEXTURE_H
template <class numtyp> class cu_vec_traits;
template <> class cu_vec_traits<float> { public: typedef float2 vec2; };
template <> class cu_vec_traits<double> { public: typedef double2 vec2; };
// ------------------------------- x ------------------------------------
static texture<float, 2, cudaReadModeElementType> x_float_tex;
static texture<int2, 2, cudaReadModeElementType> x_double_tex;
template <class numtyp> inline void x_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(x_float_tex); }
template <> inline void x_bind_texture<double>(NVC_ConstMatD &mat)
{ mat.bind_texture(x_double_tex); }
template <class numtyp> inline void x_unbind_texture()
{ cudaUnbindTexture(x_float_tex); }
template <> inline void x_unbind_texture<double>()
{ cudaUnbindTexture(x_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _x_(const int i, const int j) {
return tex2D(x_float_tex,i,j);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _x_<double>(const int i,const int j) {
int2 t=tex2D(x_double_tex,i,j);
return __hiloint2double(t.y, t.x);
}
#endif
// ------------------------------- form ------------------------------------
static texture<int, 2, cudaReadModeElementType> form_tex;
inline void form_bind_texture(NVC_ConstMatI &mat)
{ mat.bind_texture(form_tex); }
inline void form_unbind_texture()
{ cudaUnbindTexture(form_tex); }
static __inline__ __device__ int _form_(const int i, const int j) {
return tex2D(form_tex,i,j);
}
// ------------------------------- lshape ------------------------------------
static texture<float, 1, cudaReadModeElementType> lshape_float_tex;
static texture<int2, 1, cudaReadModeElementType> lshape_double_tex;
static cudaChannelFormatDesc channel_lshape;
template <class numtyp> inline void lshape_bind_texture(NVC_VecT &vec)
{ vec.bind_texture(lshape_float_tex,channel_lshape); }
template <> inline void lshape_bind_texture<double>(NVC_VecD &vec)
{ vec.bind_texture(lshape_double_tex,channel_lshape); }
template <class numtyp> inline void lshape_unbind_texture()
{ cudaUnbindTexture(lshape_float_tex); }
template <> inline void lshape_unbind_texture<double>()
{ cudaUnbindTexture(lshape_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _lshape_(const int i)
{ return tex1Dfetch(lshape_float_tex,i); }
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _lshape_<double>(const int i) {
int2 t=tex1Dfetch(lshape_double_tex,i);
return __hiloint2double(t.y, t.x);
}
#endif
// ------------------------------- shape ------------------------------------
static texture<float, 2, cudaReadModeElementType> shape_float_tex;
static texture<int2, 2, cudaReadModeElementType> shape_double_tex;
template <class numtyp> inline void shape_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(shape_float_tex); }
template <> inline void shape_bind_texture<double>(NVC_ConstMatD &mat)
{ mat.bind_texture(shape_double_tex); }
template <class numtyp> inline void shape_unbind_texture()
{ cudaUnbindTexture(shape_float_tex); }
template <> inline void shape_unbind_texture<double>()
{ cudaUnbindTexture(shape_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _shape_(const int i, const int j) {
return tex2D(shape_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _shape_<double>(const int i, const int j) {
int2 t=tex2D(shape_double_tex,j,i);
return __hiloint2double(t.y, t.x);
}
#endif
// ------------------------------- well ------------------------------------
static texture<float, 2, cudaReadModeElementType> well_float_tex;
static texture<int2, 2, cudaReadModeElementType> well_double_tex;
template <class numtyp> inline void well_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(well_float_tex); }
template <> inline void well_bind_texture<double>(NVC_ConstMatD &mat)
{ mat.bind_texture(well_double_tex); }
template <class numtyp> inline void well_unbind_texture()
{ cudaUnbindTexture(well_float_tex); }
template <> inline void well_unbind_texture<double>()
{ cudaUnbindTexture(well_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _well_(const int i, const int j)
{ return tex2D(well_float_tex,j,i); }
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _well_<double>(const int i,const int j) {
int2 t=tex2D(well_double_tex,j,i);
return __hiloint2double(t.y, t.x);
}
#endif
// ------------------------------- sigma ------------------------------------
static texture<float, 2, cudaReadModeElementType> sigma_float_tex;
static texture<int2, 2, cudaReadModeElementType> sigma_double_tex;
template <class numtyp> inline void sigma_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(sigma_float_tex); }
template <> inline void sigma_bind_texture<double>(NVC_ConstMatD &mat)
{ mat.bind_texture(sigma_double_tex); }
template <class numtyp> inline void sigma_unbind_texture()
{ cudaUnbindTexture(sigma_float_tex); }
template <> inline void sigma_unbind_texture<double>()
{ cudaUnbindTexture(sigma_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _sigma_(const int i, const int j) {
return tex2D(sigma_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _sigma_<double>(const int i,const int j) {
int2 t=tex2D(sigma_double_tex,j,i);
return __hiloint2double(t.y, t.x);
}
#endif
// ------------------------------- epsilon ------------------------------------
static texture<float, 2, cudaReadModeElementType> epsilon_float_tex;
static texture<int2, 2, cudaReadModeElementType> epsilon_double_tex;
template <class numtyp> inline void epsilon_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(epsilon_float_tex); }
template <> inline void epsilon_bind_texture<double>(NVC_ConstMatD &mat)
{ mat.bind_texture(epsilon_double_tex); }
template <class numtyp> inline void epsilon_unbind_texture()
{ cudaUnbindTexture(epsilon_float_tex); }
template <> inline void epsilon_unbind_texture<double>()
{ cudaUnbindTexture(epsilon_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _epsilon_(const int i, const int j) {
return tex2D(epsilon_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _epsilon_<double>(const int i,const int j) {
int2 t=tex2D(epsilon_double_tex,j,i);
return __hiloint2double(t.y, t.x);
}
#endif
// ------------------------------- cutsq ------------------------------------
static texture<float, 2, cudaReadModeElementType> cutsq_float_tex;
static texture<int2, 2, cudaReadModeElementType> cutsq_double_tex;
template <class numtyp> inline void cutsq_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(cutsq_float_tex); }
template <> inline void cutsq_bind_texture<double>(NVC_ConstMatD &mat)
{ mat.bind_texture(cutsq_double_tex); }
template <class numtyp> inline void cutsq_unbind_texture()
{ cudaUnbindTexture(cutsq_float_tex); }
template <> inline void cutsq_unbind_texture<double>()
{ cudaUnbindTexture(cutsq_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _cutsq_(const int i, const int j) {
return tex2D(cutsq_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _cutsq_<double>(const int i,const int j) {
int2 t=tex2D(cutsq_double_tex,j,i);
return __hiloint2double(t.y, t.x);
}
#endif
// ------------------------------- lj1 ------------------------------------
static texture<float2, 2, cudaReadModeElementType> lj1_float_tex;
static texture<int4, 2, cudaReadModeElementType> lj1_double_tex;
template <class numtyp> inline void lj1_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(lj1_float_tex); }
template <> inline void lj1_bind_texture<double2>(NVC_ConstMatD2 &mat)
{ mat.bind_texture(lj1_double_tex); }
template <class numtyp> inline void lj1_unbind_texture()
{ cudaUnbindTexture(lj1_float_tex); }
template <> inline void lj1_unbind_texture<double2>()
{ cudaUnbindTexture(lj1_double_tex); }
template <class numtyp>
static __inline__ __device__
typename cu_vec_traits<numtyp>::vec2 _lj1_(const int i, const int j) {
return tex2D(lj1_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double2 _lj1_<double>(const int i,const int j) {
int4 t=tex2D(lj1_double_tex,j,i);
double2 ans;
ans.x=__hiloint2double(t.y, t.x);
ans.y=__hiloint2double(t.w, t.z);
return ans;
}
#endif
// ------------------------------- lj3 ------------------------------------
static texture<float2, 2, cudaReadModeElementType> lj3_float_tex;
static texture<int4, 2, cudaReadModeElementType> lj3_double_tex;
template <class numtyp> inline void lj3_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(lj3_float_tex); }
template <> inline void lj3_bind_texture<double2>(NVC_ConstMatD2 &mat)
{ mat.bind_texture(lj3_double_tex); }
template <class numtyp> inline void lj3_unbind_texture()
{ cudaUnbindTexture(lj3_float_tex); }
template <> inline void lj3_unbind_texture<double2>()
{ cudaUnbindTexture(lj3_double_tex); }
template <class numtyp>
static __inline__ __device__
typename cu_vec_traits<numtyp>::vec2 _lj3_(const int i, const int j) {
return tex2D(lj3_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double2 _lj3_<double>(const int i,const int j) {
int4 t=tex2D(lj3_double_tex,j,i);
double2 ans;
ans.x=__hiloint2double(t.y, t.x);
ans.y=__hiloint2double(t.w, t.z);
return ans;
}
#endif
// ------------------------------- offset ------------------------------------
static texture<float, 2, cudaReadModeElementType> offset_float_tex;
static texture<int2, 2, cudaReadModeElementType> offset_double_tex;
template <class numtyp> inline void offset_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(offset_float_tex); }
template <> inline void offset_bind_texture<double>(NVC_ConstMatD &mat)
{ mat.bind_texture(offset_double_tex); }
template <class numtyp> inline void offset_unbind_texture()
{ cudaUnbindTexture(offset_float_tex); }
template <> inline void offset_unbind_texture<double>()
{ cudaUnbindTexture(offset_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _offset_(const int i, const int j) {
return tex2D(offset_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _offset_<double>(const int i,const int j) {
int2 t=tex2D(offset_double_tex,j,i);
return __hiloint2double(t.y, t.x);
}
#endif
#endif

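Every table in this header follows the same pattern: a native float texture plus an int2 (or int4) shadow texture for double, with templated bind/fetch wrappers that reassemble doubles via __hiloint2double, since textures cannot return doubles directly. A sketch of a kernel consuming the wrappers (the kernel itself is illustrative, not from this commit):

// Squared distance of every atom to atom j, via the x texture wrappers.
template<class numtyp>
__global__ void dist2_sketch(numtyp *rsq, const int j, const int n) {
int i=blockIdx.x*blockDim.x+threadIdx.x;
if (i<n) {
numtyp dx=_x_<numtyp>(i,0)-_x_<numtyp>(j,0); // position rows of dev_x
numtyp dy=_x_<numtyp>(i,1)-_x_<numtyp>(j,1);
numtyp dz=_x_<numtyp>(i,2)-_x_<numtyp>(j,2);
rsq[i]=dx*dx+dy*dy+dz*dz;
}
}
// Host side: x_bind_texture<numtyp>(atom.dev_x) before the launch and
// x_unbind_texture<numtyp>() after it.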
src/GPU/Install.csh Normal file

@ -0,0 +1,34 @@
# Install/unInstall package classes in LAMMPS
# edit Makefile.package to include/exclude GPU library
if ($1 == 1) then
# sed -i 's/\S*gpu //' ../Makefile.package
# sed -i 's|^PKGPATH =\s*|&-L../../lib/gpu |' ../Makefile.package
# sed -i 's|^PKGLIB =\s*|&-lpair_gpu_lib |' ../Makefile.package
cp -f pair_lj_cut_gpu.h ../
cp -f pair_lj_cut_gpu.cpp ../
if (-e ../pair_gayberne.cpp) then
cp -f style_gpu.h ../
cp -f pair_gayberne_gpu.h ../
cp -f pair_gayberne_gpu.cpp ../
else
grep -v erne style_gpu.h > ../style_gpu.h
endif
else if ($1 == 0) then
# sed -i 's/\S*gpu //' ../Makefile.package
rm -f ../style_gpu.h
touch ../style_gpu.h
rm -f ../pair_lj_cut_gpu.h
rm -f ../pair_lj_cut_gpu.cpp
rm -f ../pair_gayberne_gpu.h
rm -f ../pair_gayberne_gpu.cpp
endif

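For reference, the commented-out sed edits above would presumably leave lines like the following in ../Makefile.package when the package is installed (a sketch, not part of this commit):

# PKGPATH = -L../../lib/gpu
# PKGLIB = -lpair_gpu_lib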
src/GPU/pair_gayberne_gpu.cpp Normal file

@ -0,0 +1,369 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "pair_gayberne_gpu.h"
#include "math_extra.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#ifdef GB_GPU_OMP
#include "omp.h"
#endif
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for atom decomposition
int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int max_nbors, const int thread, const int gpu_id);
void gb_gpu_clear(const int thread);
int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
int *ilist, const int *numj, const int *type,
const int thread, bool &success);
void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread);
void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type,
const bool rebuild, const int thread);
void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
const int thread);
double gb_gpu_forces(double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial, const int thread);
std::string gb_gpu_name(const int i, const int max_nbors);
void gb_gpu_time(const int thread);
int gb_gpu_num_devices();
double gb_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),
omp_chunk(0), nthreads(1),
multi_gpu_mode(ONE_NODE),
multi_gpu_param(0)
{
for (int i=0; i<MAX_GPU_THREADS; i++)
ij_new[i]=NULL;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairGayBerneGPU::~PairGayBerneGPU()
{
if (comm->me == 0 && screen) {
printf("\n\n-------------------------------------");
printf("--------------------------------\n");
printf(" GPU Time Stamps: ");
printf("\n-------------------------------------");
printf("--------------------------------\n");
gb_gpu_time(my_thread);
std::cout << "Procs: " << universe->nprocs << std::endl;
printf("-------------------------------------");
printf("--------------------------------\n\n");
}
#pragma omp parallel
{
#ifdef GB_GPU_OMP
int my_thread=omp_get_thread_num();
#endif
gb_gpu_clear(my_thread);
if (ij_new[my_thread]!=NULL) {
delete [] ij_new[my_thread];
ij_new[my_thread]=NULL;
}
}
}
/* ---------------------------------------------------------------------- */
void PairGayBerneGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
if (vflag_atom)
error->all("Per-atom virial not available with GPU Gay-Berne.");
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int inum = list->inum;
bool rebuild=false;
if (neighbor->ncalls > last_neighbor) {
last_neighbor=neighbor->ncalls;
rebuild=true;
}
#pragma omp parallel
{
bool success=true;
#ifdef GB_GPU_OMP
int my_thread=omp_get_thread_num();
if (rebuild) {
// split ilist into one contiguous chunk of atoms per GPU thread
omp_chunk=static_cast<int>(ceil(static_cast<double>(inum)/nthreads));
if (my_thread==nthreads-1)
thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk;
else
thread_inum[my_thread]=omp_chunk;
olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal,
thread_inum[my_thread],
list->ilist+omp_chunk*my_thread,
list->numneigh, atom->type, my_thread,
success);
}
#else
if (rebuild)
olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist,
list->numneigh, atom->type, my_thread,
success);
#endif
if (!success)
error->one("Total # of atoms exceeds maximum allowed per GPGPU.\n");
// copy atom data to GPU
gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread);
int i,j,ii,jj,jnum;
double factor_lj;
int *jlist;
if (rebuild==true) {
int num_ij = 0;
// loop over neighbors of my atoms
int *ijp=ij_new[my_thread];
#ifdef GB_GPU_OMP
int mgo=my_thread*omp_chunk;
int mgot=mgo+thread_inum[my_thread];
#else
int mgo=0, mgot=inum;
#endif
for (ii = mgo; ii<mgot; ii++) {
i = olist[my_thread][ii];
jlist = list->firstneigh[i];
jnum = list->numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
*ijp=j;
ijp++;
num_ij++;
if (num_ij==ij_size) {
memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int));
gb_gpu_nbors(num_ij,eflag,my_thread);
ijp=ij_new[my_thread];
num_ij=0;
}
}
}
if (num_ij>0) {
memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int));
gb_gpu_nbors(num_ij,eflag,my_thread);
}
}
gb_gpu_gayberne(eflag,vflag,rebuild,my_thread);
double lvirial[6];
for (int i=0; i<6; i++) lvirial[i]=0.0;
double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag,
eflag_atom, vflag_atom, eatom, vatom, lvirial,
my_thread);
#pragma omp critical
{
eng_vdwl+=my_eng;
virial[0]+=lvirial[0];
virial[1]+=lvirial[1];
virial[2]+=lvirial[2];
virial[3]+=lvirial[3];
virial[4]+=lvirial[4];
virial[5]+=lvirial[5];
}
} //End parallel
if (vflag_fdotr) virial_compute();
}
/* ----------------------------------------------------------------------
global settings
------------------------------------------------------------------------- */
void PairGayBerneGPU::settings(int narg, char **arg)
{
if (narg != 4 && narg != 6) error->all("Illegal pair_style command");
// Set multi_gpu_mode to one_node for multiple gpus on 1 node
// -- param is starting gpu id
// Set multi_gpu_mode to one_gpu to select the same gpu id on every node
// -- param is id of gpu
// Set multi_gpu_mode to multi_gpu to use multiple gpus per node
// -- param is number of gpus per node
multi_gpu_mode=ONE_NODE;
multi_gpu_param=0;
if (narg==6) {
multi_gpu_param=atoi(arg[5]);
if (strcmp("one_node",arg[4])==0)
multi_gpu_mode=ONE_NODE;
else if (strcmp("one_gpu",arg[4])==0)
multi_gpu_mode=ONE_GPU;
else if (strcmp("multi_gpu",arg[4])==0) {
multi_gpu_mode=MULTI_GPU;
if (multi_gpu_param<1)
error->all("Illegal pair_style command");
} else
error->all("Illegal pair_style command");
narg-=2;
}
PairGayBerne::settings(narg,arg);
}
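// Example pair_style lines for the three modes (assumed input syntax; the
// leading arguments are the usual gayberne parameters):
// pair_style gayberne/gpu 1.0 3.0 1.0 7.0 one_node 0 (GPUs 0..nprocs-1)
// pair_style gayberne/gpu 1.0 3.0 1.0 7.0 one_gpu 1 (GPU 1 on every node)
// pair_style gayberne/gpu 1.0 3.0 1.0 7.0 multi_gpu 2 (2 GPUs per node)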
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairGayBerneGPU::init_style()
{
// Set the GPU ID
int my_gpu=comm->me+multi_gpu_param;
int ngpus=universe->nprocs;
if (multi_gpu_mode==ONE_GPU) {
my_gpu=multi_gpu_param;
ngpus=1;
} else if (multi_gpu_mode==MULTI_GPU) {
ngpus=multi_gpu_param;
my_gpu=comm->me%ngpus;
if (ngpus>universe->nprocs)
ngpus=universe->nprocs;
}
if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type)
error->all("Pair gayberne requires atom attributes quat, torque, shape");
if (atom->radius_flag)
error->all("Pair gayberne cannot be used with atom attribute diameter");
int irequest = neighbor->request(this);
// per-type shape precalculations
for (int i = 1; i <= atom->ntypes; i++) {
if (setwell[i]) {
double *one = atom->shape[i];
shape[i][0] = one[0]*one[0];
shape[i][1] = one[1]*one[1];
shape[i][2] = one[2]*one[2];
lshape[i] = (one[0]*one[1]+one[2]*one[2])*sqrt(one[0]*one[1]);
}
}
// Repeat the cutsq calculation here because it is normally done after the call to init_style
double cut;
for (int i = 1; i <= atom->ntypes; i++)
for (int j = i; j <= atom->ntypes; j++) {
cut = init_one(i,j);
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
// If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads
#ifdef GB_GPU_OMP
if (multi_gpu_mode!=ONE_GPU)
nthreads=ngpus=gb_gpu_num_devices();
else
nthreads=ngpus=1;
if (nthreads>MAX_GPU_THREADS)
nthreads=MAX_GPU_THREADS;
omp_set_num_threads(nthreads);
#endif
#pragma omp parallel firstprivate(my_gpu)
{
#ifdef GB_GPU_OMP
int my_thread = omp_get_thread_num();
if (multi_gpu_mode!=ONE_GPU)
my_gpu=my_thread;
if (multi_gpu_mode==ONE_NODE)
my_gpu+=multi_gpu_param;
#endif
ij[my_thread]=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu,
shape, well, cutsq, sigma, epsilon, lshape, form,
lj1, lj2, lj3, lj4, offset, force->special_lj,
neighbor->oneatom, my_thread, my_gpu);
if (ij[my_thread]==0)
error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU.");
if (ij_new[my_thread]!=NULL)
delete [] ij_new[my_thread];
ij_new[my_thread]=new int[ij_size];
}
last_neighbor = -1;
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
if (force->newton_pair)
error->all("Cannot use newton with GPU GayBerne pair style.");
if (comm->me == 0 && screen) {
printf("\n-------------------------------------");
printf("-------------------------------------\n");
printf("- Using GPGPU acceleration for Gay-Berne:\n");
printf("-------------------------------------");
printf("-------------------------------------\n");
for (int i=0; i<ngpus; i++) {
int gpui=my_gpu;
if (multi_gpu_mode==ONE_NODE)
gpui=i+multi_gpu_param;
else if (multi_gpu_mode==MULTI_GPU)
gpui=i;
std::string gpu_string=gb_gpu_name(gpui,neighbor->oneatom);
printf("GPU %d: %s\n",gpui,gpu_string.c_str());
}
printf("-------------------------------------");
printf("-------------------------------------\n\n");
}
}
/* ---------------------------------------------------------------------- */
double PairGayBerneGPU::memory_usage()
{
double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int);
return bytes+gb_gpu_bytes();
}

src/GPU/pair_gayberne_gpu.h Normal file

@ -0,0 +1,43 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_H
#define PAIR_GPU_H
#include "pair_gayberne.h"
#define MAX_GPU_THREADS 4
namespace LAMMPS_NS {
class PairGayBerneGPU : public PairGayBerne {
public:
PairGayBerneGPU(LAMMPS *lmp);
~PairGayBerneGPU();
void compute(int, int);
void settings(int, char **);
void init_style();
double memory_usage();
enum { ONE_NODE, ONE_GPU, MULTI_GPU };
private:
int ij_size;
int *ij[MAX_GPU_THREADS], *ij_new[MAX_GPU_THREADS], *olist[MAX_GPU_THREADS];
int my_thread, nthreads, thread_inum[MAX_GPU_THREADS], omp_chunk;
int last_neighbor, multi_gpu_mode, multi_gpu_param;
};
}
#endif

src/GPU/pair_lj_cut_gpu.cpp Normal file

@ -0,0 +1,270 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Mike Brown (SNL)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "pair_lj_cut_gpu.h"
#include "math_extra.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "integrate.h"
#include "memory.h"
#include "error.h"
#include "neigh_request.h"
#include "universe.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// External functions from cuda library for force decomposition
int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,
double **sigma, double **epsilon, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int max_nbors,
const int gpu_id);
void lj_gpu_clear();
bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
const int *numj);
void lj_gpu_nbors(const int num_ij);
void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild);
void lj_gpu(const bool eflag, const bool vflag, const bool rebuild);
double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial);
std::string lj_gpu_name(const int gpu_id, const int max_nbors);
void lj_gpu_time();
int lj_gpu_num_devices();
double lj_gpu_bytes();
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0)
{
ij_new=NULL;
respa_enable = 0;
}
/* ----------------------------------------------------------------------
free all arrays
------------------------------------------------------------------------- */
PairLJCutGPU::~PairLJCutGPU()
{
if (comm->me == 0 && screen) {
printf("\n\n-------------------------------------");
printf("--------------------------------\n");
printf(" GPU Time Stamps: ");
printf("\n-------------------------------------");
printf("--------------------------------\n");
lj_gpu_time();
std::cout << "Procs: " << universe->nprocs << std::endl;
printf("-------------------------------------");
printf("--------------------------------\n\n");
}
lj_gpu_clear();
if (ij_new!=NULL) {
delete [] ij_new;
ij_new=NULL;
}
}
/* ---------------------------------------------------------------------- */
void PairLJCutGPU::compute(int eflag, int vflag)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = 0;
if (vflag_atom)
error->all("Per-atom virial not available with GPU Gay-Berne.");
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
int inum = list->inum;
int *ilist = list->ilist;
bool rebuild=false;
if (neighbor->ncalls > last_neighbor) {
last_neighbor=neighbor->ncalls;
rebuild=true;
}
// copy nbors to GPU
if (rebuild)
if (!lj_gpu_reset_nbors(nall, inum, ilist, list->numneigh))
error->one("Total # of atoms exceed maximum allowed per GPGPU.\n");
// copy atom data to GPU
lj_gpu_atom(atom->x,atom->type,rebuild);
int i,j,ii,jj,jnum;
double factor_lj;
int *jlist;
if (rebuild==true) {
int num_ij = 0;
// loop over neighbors of my atoms
int *ijp=ij_new;
for (ii = 0; ii<inum; ii++) {
i = ilist[ii];
jlist = list->firstneigh[i];
jnum = list->numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
*ijp=j;
ijp++;
num_ij++;
if (num_ij==ij_size) {
memcpy(ij,ij_new,num_ij*sizeof(int));
lj_gpu_nbors(num_ij);
ijp=ij_new;
num_ij=0;
}
}
}
if (num_ij>0) {
memcpy(ij,ij_new,num_ij*sizeof(int));
lj_gpu_nbors(num_ij);
}
}
lj_gpu(eflag,vflag,rebuild);
eng_vdwl=lj_gpu_forces(atom->f,ilist,eflag,vflag, eflag_atom, vflag_atom,
eatom, vatom, virial);
if (vflag_fdotr) virial_compute();
}
/* ----------------------------------------------------------------------
global settings
------------------------------------------------------------------------- */
void PairLJCutGPU::settings(int narg, char **arg)
{
if (narg != 1 && narg != 3) error->all("Illegal pair_style command");
// Set multi_gpu_mode to one_node for multiple gpus on 1 node
// -- param is starting gpu id
// Set multi_gpu_mode to one_gpu to select the same gpu id on every node
// -- param is id of gpu
// Set multi_gpu_mode to multi_gpu to use multiple gpus per node
// -- param is number of gpus per node
multi_gpu_mode=ONE_NODE;
multi_gpu_param=0;
if (narg==3) {
multi_gpu_param=atoi(arg[2]);
if (strcmp("one_node",arg[1])==0)
multi_gpu_mode=ONE_NODE;
else if (strcmp("one_gpu",arg[1])==0)
multi_gpu_mode=ONE_GPU;
else if (strcmp("multi_gpu",arg[1])==0) {
multi_gpu_mode=MULTI_GPU;
if (multi_gpu_param<1)
error->all("Illegal pair_style command");
} else
error->all("Illegal pair_style command");
narg-=2;
}
PairLJCut::settings(narg,arg);
}
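// Example pair_style lines (assumed input syntax; the first argument is the
// usual lj/cut global cutoff):
// pair_style lj/cut/gpu 2.5 (defaults: one_node starting at GPU 0)
// pair_style lj/cut/gpu 2.5 one_gpu 0 (GPU 0 on every node)
// pair_style lj/cut/gpu 2.5 multi_gpu 2 (2 GPUs per node)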
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJCutGPU::init_style()
{
// Set the GPU ID
int my_gpu=comm->me+multi_gpu_param;
int ngpus=universe->nprocs;
if (multi_gpu_mode==ONE_GPU) {
my_gpu=multi_gpu_param;
ngpus=1;
} else if (multi_gpu_mode==MULTI_GPU) {
ngpus=multi_gpu_param;
my_gpu=comm->me%ngpus;
if (ngpus>universe->nprocs)
ngpus=universe->nprocs;
}
int irequest = neighbor->request(this);
cut_respa=NULL;
// Repeat the cutsq calculation here because it is normally done after the call to init_style
double cut;
for (int i = 1; i <= atom->ntypes; i++)
for (int j = i; j <= atom->ntypes; j++) {
cut = init_one(i,j);
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
ij=lj_gpu_init(ij_size, atom->ntypes+1, cutsq, sigma, epsilon, lj1, lj2, lj3,
lj4, offset, force->special_lj, neighbor->oneatom, my_gpu);
if (ij==0)
error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU.");
if (ij_new!=NULL)
delete [] ij_new;
ij_new=new int[ij_size];
last_neighbor = -1;
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
if (force->newton_pair)
error->all("Cannot use newton with GPU LJCut pair style.");
if (comm->me == 0 && screen) {
printf("\n-------------------------------------");
printf("-------------------------------------\n");
printf("- Using GPGPU acceleration for LJ-Cut:\n");
printf("-------------------------------------");
printf("-------------------------------------\n");
for (int i=0; i<ngpus; i++) {
int gpui=my_gpu;
if (multi_gpu_mode==ONE_NODE)
gpui=i+multi_gpu_param;
else if (multi_gpu_mode==MULTI_GPU)
gpui=i;
std::string gpu_string=lj_gpu_name(gpui,neighbor->oneatom);
printf("GPU %d: %s\n",gpui,gpu_string.c_str());
}
printf("-------------------------------------");
printf("-------------------------------------\n\n");
}
}
/* ---------------------------------------------------------------------- */
double PairLJCutGPU::memory_usage()
{
double bytes=Pair::memory_usage()+ij_size*sizeof(int);
return bytes+lj_gpu_bytes();
}

src/GPU/pair_lj_cut_gpu.h Normal file

@ -0,0 +1,40 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef PAIR_LJ_CUT_GPU_H
#define PAIR_LJ_CUT_GPU_H
#include "pair_lj_cut.h"
namespace LAMMPS_NS {
class PairLJCutGPU : public PairLJCut {
public:
PairLJCutGPU(LAMMPS *lmp);
~PairLJCutGPU();
void compute(int, int);
void settings(int, char **);
void init_style();
double memory_usage();
enum { ONE_NODE, ONE_GPU, MULTI_GPU };
private:
int ij_size;
int *ij, *ij_new;
int last_neighbor, multi_gpu_mode, multi_gpu_param;
};
}
#endif

src/GPU/style_gpu.h Normal file

@ -0,0 +1,41 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef AtomInclude
#endif
#ifdef AtomClass
#endif
#ifdef ComputeInclude
#endif
#ifdef ComputeClass
#endif
#ifdef FixInclude
#endif
#ifdef FixClass
#endif
#ifdef PairInclude
#include "pair_lj_cut_gpu.h"
#include "pair_gayberne_gpu.h"
#endif
#ifdef PairClass
PairStyle(lj/cut/gpu,PairLJCutGPU)
PairStyle(gayberne/gpu,PairGayBerneGPU)
#endif
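The PairStyle lines above are consumed by the LAMMPS style machinery; a sketch of the assumed mechanism (the macro definition lives in the LAMMPS core, not in this commit):

// in force.cpp, roughly:
// #define PairStyle(key,Class) \
// else if (strcmp(style,#key) == 0) pair = new Class(lmp);
// #include "style.h" // which pulls in style_gpu.h
// so each PairStyle() line registers one pair_style keyword.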