forked from lijiext/lammps
Added GPU library and package directories.
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3036 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
15b6eeb011
commit
2c6dc60a30
|
@ -0,0 +1,75 @@
|
|||
#***************************************************************************
# Makefile
# -------------------
# W. Michael Brown
#
# _________________________________________________________________________
# Build for the LAMMPS GPU Force Library
#
# _________________________________________________________________________
#
# begin : Tue June 23 2009
# copyright : (C) 2009 by W. Michael Brown
# email : wmbrown@sandia.gov
# ***************************************************************************/

# Output locations for the helper executable and object files
BIN_DIR = .
OBJ_DIR = .
AR = ar

# Compiler invocation, register/architecture flags, and precision mode.
# CUDA_PREC selects one of _SINGLE_SINGLE / _DOUBLE_DOUBLE / _SINGLE_DOUBLE
# (see the README for details).
CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math
CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB)

CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)

CUDA_LIB = $(OBJ_DIR)/pair_gpu_lib.a

# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h

ALL_H = $(NVC_H) $(PAIR_H)

EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/gb_gpu.cu_o \
       $(OBJ_DIR)/gb_gpu_memory.cu_o $(OBJ_DIR)/lj_gpu.cu_o \
       $(OBJ_DIR)/lj_gpu_memory.cu_o $(OBJ_DIR)/pair_gpu_atom.cu_o \
       $(OBJ_DIR)/pair_gpu_nbor.cu_o

# all/clean/veryclean never produce files of those names; mark them phony so
# a stray file cannot mask the target.
.PHONY: all clean veryclean

all: $(CUDA_LIB) $(EXECS)

$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H)
	$(CUDA) -o $@ -c nvc_device.cu

$(OBJ_DIR)/pair_gpu_atom.cu_o: pair_gpu_atom.cu pair_gpu_texture.h pair_gpu_atom.h $(NVC_H)
	$(CUDA) -o $@ -c pair_gpu_atom.cu

$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
	$(CUDA) -o $@ -c pair_gpu_nbor.cu

$(OBJ_DIR)/lj_gpu_memory.cu_o: lj_gpu_memory.cu lj_gpu_memory.h $(ALL_H)
	$(CUDA) -o $@ -c lj_gpu_memory.cu

$(OBJ_DIR)/lj_gpu.cu_o: lj_gpu.cu $(ALL_H) lj_gpu_memory.h lj_gpu_kernel.h
	$(CUDA) -o $@ -c lj_gpu.cu

$(OBJ_DIR)/gb_gpu_memory.cu_o: gb_gpu_memory.cu gb_gpu_memory.h $(ALL_H)
	$(CUDA) -o $@ -c gb_gpu_memory.cu

$(OBJ_DIR)/gb_gpu.cu_o: gb_gpu.cu $(ALL_H) gb_gpu_memory.h gb_gpu_kernel.h gb_gpu_extra.h
	$(CUDA) -o $@ -c gb_gpu.cu

# FIX: the link line previously used $(CUDALNK), which is never defined;
# the intended variable is $(CUDA_LINK) defined above.
$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o
	$(CUDA) -o $@ nvc_get_devices.cu $(CUDA_LINK) $(OBJ_DIR)/nvc_device.o

$(CUDA_LIB): $(OBJS)
	$(AR) -crusv $(CUDA_LIB) $(OBJS)

clean:
	rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo

veryclean: clean
	rm -rf *~ *.linkinfo
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
/***************************************************************************
|
||||
README
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
README for building LAMMPS GPU Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Thu Jun 25 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
||||
GENERAL NOTES
|
||||
|
||||
This library, pair_gpu_lib.a, provides routines for GPGPU acceleration of
|
||||
LAMMPS pair styles. Currently, only CUDA enabled GPUs are supported.
|
||||
Compilation of this library requires installing the CUDA GPU driver and
|
||||
CUDA toolkit for your operating system. In addition to the LAMMPS library,
|
||||
the binary nvc_get_devices will also be built. This can be used to query
|
||||
the names and properties of GPU devices on your system.
|
||||
|
||||
NOTE: Installation of the CUDA SDK is not required.
|
||||
|
||||
Current pair styles supporting GPU Acceleration:
|
||||
|
||||
1. lj/cut/gpu
|
||||
2. gayberne/gpu
|
||||
|
||||
|
||||
|
||||
MULTIPLE LAMMPS PROCESSES
|
||||
|
||||
When using GPGPU acceleration, you are restricted to one physical GPU per
|
||||
LAMMPS process. This can be multiple GPUs on a single node or across multiple
|
||||
nodes. Instructions on GPU assignment can be found in the LAMMPS documentation.
|
||||
|
||||
|
||||
|
||||
SPEEDUPS
|
||||
|
||||
The speedups that can be obtained using this library are highly
|
||||
dependent on the GPU architecture and the computational expense of
|
||||
the pair potential. When comparing a single precision Tesla C1060 run to a
|
||||
serial Intel Xeon 5140 2.33 GHz serial run, the speedup is ~4.42x for lj/cut
|
||||
with a cutoff of 2.5. For gayberne with a cutoff of 7, the speedup is >103x
|
||||
for 8000 particles. The speedup will improve with an increase in the
|
||||
number of particles or an increase in the cutoff.
|
||||
|
||||
|
||||
|
||||
BUILDING AND PRECISION MODES
|
||||
|
||||
To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK files for
|
||||
your machine. Type make. Additionally, the GPU package must be installed and
|
||||
compiled for LAMMPS. The library supports 3 precision modes as determined by
|
||||
the CUDA_PREC variable:
|
||||
|
||||
CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations
|
||||
CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations
|
||||
CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
|
||||
|
||||
NOTE: Double precision is only supported on certain GPUS (with
|
||||
compute capability>=1.3).
|
||||
|
||||
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
|
||||
package has been installed before installing the GPU package in LAMMPS.
|
||||
|
||||
|
||||
|
||||
GPU MEMORY
|
||||
|
||||
Upon initialization of the pair style, the library will reserve memory for
|
||||
64K atoms per GPU or 70% of each card's GPU memory, whichever value is limiting.
|
||||
The value of 70% can be changed by editing the PERCENT_GPU_MEMORY definition
|
||||
in the source file. The value of 64K cannot be increased and is the maximum
|
||||
number of atoms allowed per GPU. Using the 'neigh_modify one' modifier in
|
||||
your LAMMPS input script can help to increase maximum number of atoms per
|
||||
GPU for cards with limited memory.
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,518 @@
|
|||
/***************************************************************************
|
||||
gb_gpu.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Gay-Berne anisotropic potential GPU calcultation
|
||||
|
||||
*** Force decomposition by Atom Version ***
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Jun 23 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include "nvc_macros.h"
|
||||
#include "nvc_timer.h"
|
||||
#include "nvc_device.h"
|
||||
#include "gb_gpu_memory.h"
|
||||
#include "gb_gpu_kernel.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Per-host-thread GPU state (device handle, buffers, timers); indexed by the
// 'thread' argument of the gb_gpu_* entry points below.
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
// Shorthand used inside the templated helpers below.
#define GBMT GB_GPU_Memory<numtyp,acctyp>
|
||||
|
||||
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
//
// dev_nbor layout (one column per atom, rows separated by nbor_pitch):
//   row 0: atom index i, row 1: original neighbor count, row 2: offset into
//   dev_ij, row 3: packed neighbor count (written here), rows 4+: packed
//   neighbor indices (written here).
// _x_/_form_/_cutsq_ are presumably texture/global fetches declared in
// pair_gpu_texture.h -- confirm against that header.
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
                                 const int start, const int inum,
                                 const int *dev_ij, const int form_low,
                                 const int form_high, const int nall) {

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {
    int *nbor=dev_nbor+ii;                 // walk down column ii
    int i=*nbor;                           // row 0: atom index
    nbor+=nbor_pitch;
    int numj=*nbor;                        // row 1: unpacked neighbor count
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;          // row 2: offset into dev_ij
    const int *list_end=list+numj;
    nbor+=nbor_pitch;
    int *nbor_newj=nbor;                   // row 3: packed count, filled last
    nbor+=nbor_pitch;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,7);            // row 7 of dev_x holds the type

    int newj=0;
    for ( ; list<list_end; list++) {
      int j=*list;
      if (j>=nall)                         // fold special-bond encoded index
        j%=nall;
      int jtype=_x_<numtyp>(j,7);

      if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
        // Compute r12;
        numtyp rsq=_x_<numtyp>(j,0)-ix;
        rsq*=rsq;
        numtyp t=_x_<numtyp>(j,1)-iy;
        rsq+=t*t;
        t=_x_<numtyp>(j,2)-iz;
        rsq+=t*t;

        if (rsq< _cutsq_<numtyp>(itype,jtype)) {
          *nbor=j;                         // append to packed list
          nbor+=nbor_pitch;
          newj++;
        }
      }
    }
    *nbor_newj=newj;                       // record packed neighbor count
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
//
// Same dev_nbor layout as kernel_pack_nbor; the per-type form/cutsq tables
// are staged in shared memory (valid when all types fit in MAX_SHARED_TYPES).
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
                                      const int start, const int inum,
                                      const int *dev_ij, const int form_low,
                                      const int form_high, const int nall) {

  int ii=threadIdx.x;
  __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  // First MAX_SHARED_TYPES^2 threads of the block stage the type tables.
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    form[ii]=_form_(itype,jtype);
  }
  // FIX: barrier added -- every thread below reads form[]/cutsq[] entries
  // written by other threads; without a block-wide sync this is a data race.
  // (Safe: reached unconditionally by all threads of the block.)
  __syncthreads();
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {
    int *nbor=dev_nbor+ii;                 // walk down column ii
    int i=*nbor;                           // row 0: atom index
    nbor+=nbor_pitch;
    int numj=*nbor;                        // row 1: unpacked neighbor count
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;          // row 2: offset into dev_ij
    const int *list_end=list+numj;
    nbor+=nbor_pitch;
    int *nbor_newj=nbor;                   // row 3: packed count, filled last
    nbor+=nbor_pitch;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    // Pre-scale itype so itype+jtype indexes the flattened shared tables.
    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));

    int newj=0;
    for ( ; list<list_end; list++) {
      int j=*list;
      if (j>=nall)                         // fold special-bond encoded index
        j%=nall;
      int mtype=itype+_x_<numtyp>(j,7);

      if (form[mtype]>=form_low && form[mtype]<=form_high) {
        // Compute r12;
        numtyp rsq=_x_<numtyp>(j,0)-ix;
        rsq*=rsq;
        numtyp t=_x_<numtyp>(j,1)-iy;
        rsq+=t*t;
        t=_x_<numtyp>(j,2)-iz;
        rsq+=t*t;

        if (rsq<cutsq[mtype]) {
          *nbor=j;                         // append to packed list
          nbor+=nbor_pitch;
          newj++;
        }
      }
    }
    *nbor_newj=newj;                       // record packed neighbor count
  }
}
|
||||
|
||||
// Launch the neighbor-packing kernel on gbm's pair stream, choosing the
// shared-memory variant when all types fit in the shared tables.
// GX/BX are the grid/block sizes; [start,inum) is the atom range;
// [form_low,form_high] is the inclusive interaction-form filter.
template<class numtyp, class acctyp>
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
                const int inum, const int form_low, const int form_high) {
  if (gbm.shared_types)
    kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
       gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
  else
    kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
       gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
// Stream-format any value with 2 significant digits and return it as a string.
template <class t>
inline std::string gb_gpu_toa(const t& in) {
  std::ostringstream out;
  out.precision(2);
  out << in;
  return out.str();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
// Build a human-readable description of device 'id' (name, core count,
// memory, clock, and the maximum atom count given max_nbors neighbors).
string gb_gpu_name(const int id, const int max_nbors) {
  string name=GBMF[0].gpu.name(id)+", "+
              gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
              gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
              gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+
              gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
                                               max_nbors))+" Atoms";
  return name;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
// Initialize the per-thread GPU state with the Gay-Berne/LJ coefficient
// tables. ij_size is an out-parameter set to the host neighbor buffer size.
// Returns 0 when no CUDA device is available; otherwise forwards the result
// of GB_GPU_Memory::init (see gb_gpu_memory.h for its return contract).
int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
                  const double upsilon, const double mu, double **shape,
                  double **well, double **cutsq, double **sigma,
                  double **epsilon, double *host_lshape, int **form,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int max_nbors, const int thread, const int gpu_id) {
  assert(thread<MAX_GPU_THREADS);

  // No usable GPU: signal failure to the caller.
  if (GBMF[thread].gpu.num_devices()==0)
    return 0;

  ij_size=IJ_SIZE;
  return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
                           well, cutsq, sigma, epsilon, host_lshape, form,
                           host_lj1, host_lj2, host_lj3, host_lj4, offset,
                           special_lj, max_nbors, false, gpu_id);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
// Release all host and device allocations owned by this thread's GPU state.
void gb_gpu_clear(const int thread) {
  GBMF[thread].clear();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// copy atom positions, quaternions, and optionally types to device
// ---------------------------------------------------------------------------
// Stage per-atom data in the write buffer (interleaved per coordinate) and
// start the async copy on 'stream'. host_x/host_quat are assumed to be
// LAMMPS-style contiguous 2D arrays (row 0 pointer spans all atoms) --
// the +1/+2 offsets index y/z within the first atom's row.
template <class numtyp, class acctyp>
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
                         double **host_quat, const int *host_type,
                         const bool rebuild, cudaStream_t &stream) {
  atom.time_atom.start();
  atom.reset_write_buffer();

  // Rows 1-3 of dev_x are position; rows 4-7 are quaternion
  atom.add_atom_data(host_x[0],3);
  atom.add_atom_data(host_x[0]+1,3);
  atom.add_atom_data(host_x[0]+2,3);
  atom.add_atom_data(host_quat[0],4);
  atom.add_atom_data(host_quat[0]+1,4);
  atom.add_atom_data(host_quat[0]+2,4);
  atom.add_atom_data(host_quat[0]+3,4);

  int csize=7;                  // rows staged so far

  // If a rebuild occurred, copy type data (types only change on rebuild)
  if (rebuild) {
    atom.add_atom_data(host_type);
    csize++;
  }

  atom.copy_atom_data(csize,stream);   // async copy; not synchronized here
  atom.time_atom.stop();
}
|
||||
|
||||
// Public entry point: stage atom data for the given host thread's GPU state
// on that thread's pair stream.
void gb_gpu_atom(double **host_x, double **host_quat,
                 const int *host_type, const bool rebuild, const int thread) {
  _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
               GBMF[thread].pair_stream);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// ---------------------------------------------------------------------------
// Reset the device neighbor list for a rebuilt host list. When multiple
// interaction forms are in use, atoms are reordered so all ellipse-ellipse
// particles come first (gbm.last_ellipse marks the boundary); the reordered
// list is returned so the caller iterates in device order. Otherwise the
// caller's ilist is returned unchanged.
// success is set false (and 0 returned) when nall exceeds device capacity.
// NOTE(review): nlocal is currently unused here.
template <class gbmtyp>
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
                          const int inum, int *ilist, const int *numj,
                          const int *type, bool &success) {
  if (nall>gbm.max_atoms) {
    success=false;
    return 0;
  }
  success=true;

  gbm.nbor.time_nbor.start();

  gbm.atom.nall(nall);
  gbm.atom.inum(inum);

  if (gbm.multiple_forms) {
    int p=0, acc=0;   // p = output slot, acc = running offset into dev_ij
    // First pass: ellipse-ellipse particles.
    for (int i=0; i<inum; i++) {
      int itype=type[ilist[i]];
      if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
        gbm.host_olist[p]=ilist[i];
        gbm.nbor.host_ij[p]=numj[ilist[i]];         // neighbor count
        gbm.nbor.host_ij[p+inum]=acc;               // offset into dev_ij
        acc+=numj[ilist[i]];
        p++;
      }
    }
    gbm.last_ellipse=p;
    // Second pass: everything else, appended after the ellipses.
    for (int i=0; i<inum; i++) {
      int itype=type[ilist[i]];
      if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
        gbm.host_olist[p]=ilist[i];
        gbm.nbor.host_ij[p]=numj[ilist[i]];
        gbm.nbor.host_ij[p+inum]=acc;
        acc+=numj[ilist[i]];
        p++;
      }
    }
    gbm.nbor.ij_total=0;
    // Row 0 of dev_nbor: reordered atom indices; rows 1-2: counts/offsets.
    gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
    gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
                                      gbm.nbor.dev_nbor.row_size(),
                                      gbm.nbor.dev_nbor.row_size(),2,inum,
                                      gbm.pair_stream);
  } else {
    // Single form: no reordering needed.
    gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
    gbm.last_ellipse=inum;
  }

  gbm.nbor.time_nbor.stop();

  if (gbm.multiple_forms)
    return gbm.host_olist.begin();
  return ilist;
}
|
||||
|
||||
// Public entry point: forward to _gb_gpu_reset_nbors for this thread's state.
// Returns the (possibly reordered) atom list; see _gb_gpu_reset_nbors.
int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
                         int *ilist, const int *numj, const int *type,
                         const int thread, bool &success) {
  return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
                             success);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Copy a set of ij_size ij interactions to device and compute energies,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
// Queue an async copy of num_ij neighbor entries to the device.
// NOTE(review): eflag is currently unused in this routine.
template <class gbmtyp>
void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
  // Fold the previous interval into the running total before restarting.
  gbm.nbor.time_nbor.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed

  gbm.nbor.time_nbor.start();
  gbm.nbor.add(num_ij,gbm.pair_stream);
  gbm.nbor.time_nbor.stop();
}
|
||||
|
||||
// Public entry point: queue num_ij neighbor entries for this thread's state.
void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
  _gb_gpu_nbors(GBMF[thread],num_ij,eflag);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques for all ij interactions
// ---------------------------------------------------------------------------
// Launch the force kernels on gbm.pair_stream. With multiple interaction
// forms, atoms are already ordered ellipse-first (see _gb_gpu_reset_nbors):
// the Gay-Berne kernel handles [0,last_ellipse), the sphere-GB kernel
// handles the sphere range against ellipses, and an LJ kernel handles the
// remaining sphere-sphere work. Every timer is started/stopped on every
// path (including empty start/stop pairs) so later add_to_total() calls in
// _gb_gpu_forces always see a completed interval.
// NOTE(review): 'rebuild' is currently unused in this routine.
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
                      const bool rebuild) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=BLOCK_1D;

  int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));

  if (gbm.multiple_forms) {
    gbm.time_kernel.start();
    if (gbm.last_ellipse>0) {
      // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
      GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
                               static_cast<double>(BX)));
      pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE);
      gbm.time_kernel.stop();

      gbm.time_gayberne.start();
      kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
         gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
         eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
      gbm.time_gayberne.stop();

      // All atoms are ellipses: run the remaining timers empty and return.
      if (gbm.last_ellipse==gbm.atom.inum()) {
        gbm.time_kernel2.start();
        gbm.time_kernel2.stop();
        gbm.time_gayberne2.start();
        gbm.time_gayberne2.stop();
        gbm.time_pair.start();
        gbm.time_pair.stop();
        return;
      }

      // ------------ SPHERE_ELLIPSE ---------------

      gbm.time_kernel2.start();
      GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
                                                   gbm.last_ellipse)/BX));
      pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
                 ELLIPSE_SPHERE);
      gbm.time_kernel2.stop();

      gbm.time_gayberne2.start();
      kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
         gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
         eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
      gbm.time_gayberne2.stop();
    } else {
      // No ellipses at all: zero the answers and tick the timers.
      gbm.atom.ans.zero();
      gbm.time_kernel.stop();
      gbm.time_gayberne.start();
      gbm.time_gayberne.stop();
      gbm.time_kernel2.start();
      gbm.time_kernel2.stop();
      gbm.time_gayberne2.start();
      gbm.time_gayberne2.stop();
    }

    // ------------ LJ ---------------
    gbm.time_pair.start();
    if (gbm.last_ellipse<gbm.atom.inum()) {
      if (gbm.shared_types)
        kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
           gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
           gbm.atom.inum(), gbm.atom.nall());
      else
        kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
           eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
    }
    gbm.time_pair.stop();
  } else {
    // Single interaction form: one pack + one Gay-Berne launch over all atoms.
    gbm.time_kernel.start();
    pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
    gbm.time_kernel.stop();

    gbm.time_gayberne.start();
    kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
      (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
       gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
       gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
       eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
    gbm.time_gayberne.stop();
  }
}
|
||||
|
||||
// Public entry point: run the force calculation for this thread's state at
// the library's compiled precision.
void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
                     const int thread) {
  _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Get energies, forces, and torques to host
// ---------------------------------------------------------------------------
// Start the async answer copy, fold all per-step timers into their totals,
// then accumulate energy/virial/forces/torques into the caller's arrays.
// Returns the van der Waals energy. Torques are only accumulated for the
// ellipse range [0,last_ellipse).
template<class numtyp, class acctyp>
double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
                      const bool eflag, const bool vflag, const bool eflag_atom,
                      const bool vflag_atom, double *eatom, double **vatom,
                      double *virial) {
  double evdw;

  gbm.atom.time_answer.start();
  gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);

  // Fold this step's timing intervals into the running totals.
  gbm.atom.time_atom.add_to_total();
  gbm.nbor.time_nbor.add_to_total();
  gbm.time_kernel.add_to_total();
  gbm.time_gayberne.add_to_total();
  if (gbm.multiple_forms) {
    gbm.time_kernel2.add_to_total();
    gbm.time_gayberne2.add_to_total();
    gbm.time_pair.add_to_total();
  }
  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed

  evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
  gbm.atom.add_forces(ilist,f);
  gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
  gbm.atom.time_answer.stop();
  gbm.atom.time_answer.add_to_total();
  return evdw;
}
|
||||
|
||||
// Public entry point: retrieve results for this thread's state at the
// library's compiled precision. Returns the van der Waals energy.
double gb_gpu_forces(double **f, double **tor, const int *ilist,
                     const bool eflag, const bool vflag, const bool eflag_atom,
                     const bool vflag_atom, double *eatom, double **vatom,
                     double *virial, const int thread) {
  return _gb_gpu_forces<PRECISION,ACC_PRECISION>
    (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
     vflag_atom,eatom,vatom,virial);
}
|
||||
|
||||
// Print the accumulated timing breakdown for thread i's GPU state to stdout.
void gb_gpu_time(const int i) {
  cout.precision(4);
  cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds()
       << " s.\n"
       << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds()
       << " s.\n"
       << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
          GBMF[i].time_kernel2.total_seconds() << " s.\n"
       << "Force calc: " << GBMF[i].time_gayberne.total_seconds()+
          GBMF[i].time_gayberne2.total_seconds()<< " s.\n";
  // The LJ kernel only runs when multiple interaction forms are in use.
  if (GBMF[i].multiple_forms)
    cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n";
  cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds()
       << " s.\n";
}
|
||||
|
||||
// Number of CUDA devices visible to this library.
int gb_gpu_num_devices() {
  return GBMF[0].gpu.num_devices();
}
|
||||
|
||||
// Host memory (bytes) currently used by the library's GPU state.
double gb_gpu_bytes() {
  return GBMF[0].host_memory_usage();
}
|
|
@ -0,0 +1,337 @@
|
|||
/***************************************************************************
|
||||
gb_gpu_extra.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Inline GPU kernel routines ala math_extra for the CPU.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Jun 23 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef GB_GPU_EXTRA_H
|
||||
#define GB_GPU_EXTRA_H
|
||||
|
||||
#include "math.h"
|
||||
#include "stdio.h"
|
||||
#include "string.h"
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Atomic update of global memory
|
||||
------------------------------------------------------------------------- */
|
||||
/*
|
||||
template <class numtyp> __device__
|
||||
inline void atomicAdd(numtyp *address, numtyp val);
|
||||
|
||||
template <>
|
||||
__device__ inline void atomicAdd<float>(float *address, float val)
|
||||
{
|
||||
int i_val = __float_as_int(val);
|
||||
int tmp0 = 0;
|
||||
int tmp1;
|
||||
|
||||
while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) {
|
||||
tmp0 = tmp1;
|
||||
i_val = __float_as_int(val + __int_as_float(tmp1));
|
||||
}
|
||||
}*/
|
||||
|
||||
/* ----------------------------------------------------------------------
   dot product of 2 vectors
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
  // Accumulate term by term; identical floating-point association order
  // to (v1[0]*v2[0]+v1[1]*v2[1])+v1[2]*v2[2].
  numtyp sum=v1[0]*v2[0];
  sum+=v1[1]*v2[1];
  sum+=v1[2]*v2[2];
  return sum;
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   cross product of 2 vectors
   ans = v1 x v2; ans must not alias v1 or v2 for a conventional cross
   product (components are written in order 0,1,2).
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_cross3(const numtyp *v1,
                                             const numtyp *v2, numtyp *ans)
{
  ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
  ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
  ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   determinant of a matrix
   m is a 3x3 matrix in row-major order; expansion kept in this exact
   form so the floating-point evaluation order is unchanged.
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
{
  numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
               m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
               m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
  return ans;
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   diagonal matrix times a full matrix
   Row r of ans is row r of m scaled by the r-th well parameter of
   particle i; elements written in order 0..8 as before.
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9],
                                                  numtyp ans[9])
{
  for (int r=0; r<3; r++) {
    const numtyp w=_well_<numtyp>(i,r);
    ans[3*r]  =w*m[3*r];
    ans[3*r+1]=w*m[3*r+1];
    ans[3*r+2]=w*m[3*r+2];
  }
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   diagonal matrix times a full matrix
   Row r of ans is row r of m scaled by the r-th shape parameter of
   particle i; elements written in order 0..8 as before.
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9],
                                                   numtyp ans[9])
{
  for (int r=0; r<3; r++) {
    const numtyp s=_shape_<numtyp>(i,r);
    ans[3*r]  =s*m[3*r];
    ans[3*r+1]=s*m[3*r+1];
    ans[3*r+2]=s*m[3*r+2];
  }
}
|
||||
|
||||
/* ----------------------------------------------------------------------
   add two matrices
   Element-wise sum of two row-major 3x3 matrices, written in order 0..8.
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_plus3(const numtyp m[9],
                                            const numtyp m2[9], numtyp ans[9])
{
  for (int k=0; k<9; k++)
    ans[k] = m[k]+m2[k];
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
multiply the transpose of mat1 times mat2
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9],
|
||||
const numtyp m2[9],
|
||||
numtyp ans[9])
|
||||
{
|
||||
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
|
||||
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
|
||||
ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
|
||||
ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
|
||||
ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
|
||||
ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
|
||||
ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
|
||||
ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
|
||||
ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
row vector times matrix
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ void gpu_row_times3(const numtyp *v,
|
||||
const numtyp m[9], numtyp *ans)
|
||||
{
|
||||
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
|
||||
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
|
||||
ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
solve Ax = b or M ans = v
|
||||
use gaussian elimination & partial pivoting on matrix
|
||||
error_flag set to 2 if bad matrix inversion attempted
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
|
||||
const numtyp *v, numtyp *ans,
|
||||
int *error_flag)
|
||||
{
|
||||
// create augmented matrix for pivoting
|
||||
|
||||
numtyp aug[12], t;
|
||||
|
||||
aug[3] = v[0];
|
||||
aug[0] = m[0];
|
||||
aug[1] = m[1];
|
||||
aug[2] = m[2];
|
||||
aug[7] = v[1];
|
||||
aug[4] = m[3];
|
||||
aug[5] = m[4];
|
||||
aug[6] = m[5];
|
||||
aug[11] = v[2];
|
||||
aug[8] = m[6];
|
||||
aug[9] = m[7];
|
||||
aug[10] = m[8];
|
||||
|
||||
if (fabs(aug[4]) > fabs(aug[0])) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
|
||||
}
|
||||
if (fabs(aug[8]) > fabs(aug[0])) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
|
||||
}
|
||||
|
||||
if (aug[0] != (numtyp)0.0) {
|
||||
if (0!=0) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt;
|
||||
}
|
||||
} else if (aug[4] != (numtyp)0.0) {
|
||||
if (1!=0) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
|
||||
}
|
||||
} else if (aug[8] != (numtyp)0.0) {
|
||||
if (2!=0) {
|
||||
numtyp swapt;
|
||||
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
|
||||
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
|
||||
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
|
||||
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
|
||||
}
|
||||
} else
|
||||
*error_flag=2;
|
||||
|
||||
t = aug[4]/aug[0];
|
||||
aug[5]-=t*aug[1];
|
||||
aug[6]-=t*aug[2];
|
||||
aug[7]-=t*aug[3];
|
||||
t = aug[8]/aug[0];
|
||||
aug[9]-=t*aug[1];
|
||||
aug[10]-=t*aug[2];
|
||||
aug[11]-=t*aug[3];
|
||||
|
||||
if (fabs(aug[9]) > fabs(aug[5])) {
|
||||
numtyp swapt;
|
||||
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
|
||||
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
|
||||
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
|
||||
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
|
||||
}
|
||||
|
||||
if (aug[5] != (numtyp)0.0) {
|
||||
if (1!=1) {
|
||||
numtyp swapt;
|
||||
swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt;
|
||||
swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt;
|
||||
swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt;
|
||||
swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt;
|
||||
}
|
||||
} else if (aug[9] != (numtyp)0.0) {
|
||||
if (2!=1) {
|
||||
numtyp swapt;
|
||||
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
|
||||
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
|
||||
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
|
||||
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
|
||||
}
|
||||
}
|
||||
|
||||
t = aug[9]/aug[5];
|
||||
aug[10]-=t*aug[6];
|
||||
aug[11]-=t*aug[7];
|
||||
|
||||
if (aug[10] == (numtyp)0.0)
|
||||
*error_flag=2;
|
||||
|
||||
ans[2] = aug[11]/aug[10];
|
||||
t = (numtyp)0.0;
|
||||
t += aug[6]*ans[2];
|
||||
ans[1] = (aug[7]-t) / aug[5];
|
||||
t = (numtyp)0.0;
|
||||
t += aug[1]*ans[1];
|
||||
t += aug[2]*ans[2];
|
||||
ans[0] = (aug[3]-t) / aug[0];
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute rotation matrix from quaternion conjugate
|
||||
quat = [w i j k]
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <class numtyp>
|
||||
static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi,
|
||||
numtyp mat[9])
|
||||
{
|
||||
numtyp qi3=_x_<numtyp>(qi,3);
|
||||
numtyp qi4=_x_<numtyp>(qi,4);
|
||||
numtyp qi5=_x_<numtyp>(qi,5);
|
||||
numtyp qi6=_x_<numtyp>(qi,6);
|
||||
|
||||
numtyp w2 = qi3*qi3;
|
||||
numtyp i2 = qi4*qi4;
|
||||
numtyp j2 = qi5*qi5;
|
||||
numtyp k2 = qi6*qi6;
|
||||
numtyp twoij = 2.0*qi4*qi5;
|
||||
numtyp twoik = 2.0*qi4*qi6;
|
||||
numtyp twojk = 2.0*qi5*qi6;
|
||||
numtyp twoiw = 2.0*qi4*qi3;
|
||||
numtyp twojw = 2.0*qi5*qi3;
|
||||
numtyp twokw = 2.0*qi6*qi3;
|
||||
|
||||
mat[0] = w2+i2-j2-k2;
|
||||
mat[3] = twoij-twokw;
|
||||
mat[6] = twojw+twoik;
|
||||
|
||||
mat[1] = twoij+twokw;
|
||||
mat[4] = w2-i2+j2-k2;
|
||||
mat[7] = twojk-twoiw;
|
||||
|
||||
mat[2] = twoik-twojw;
|
||||
mat[5] = twojk+twoiw;
|
||||
mat[8] = w2-i2-j2+k2;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,861 @@
|
|||
/***************************************************************************
|
||||
gb_gpu_kernel.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Routines that actually perform the force/torque computation
|
||||
|
||||
*** Force Decomposition by Atom Version ***
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Jun 23 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef GB_GPU_KERNEL
|
||||
#define GB_GPU_KERNEL
|
||||
|
||||
#include "gb_gpu_extra.h"
|
||||
|
||||
// Torque terms arising from the derivative of the Gay-Berne eta factor.
// Each row k of ans is shape_k(i) times a cofactor-style polynomial in the
// entries of m and m2, divided by det(m).  m is modified only through reads;
// ans receives the 3x3 result row-major.
// NOTE(review): presumably mirrors the CPU pair_gayberne compute_eta_torque;
// the exact analytic form is taken on faith from the expressions below --
// verify against the CPU implementation before editing any term.
template <class numtyp>
static __inline__ __device__ void compute_eta_torque(numtyp m[9],
                                                     numtyp m2[9],
                                                     const int i,
                                                     numtyp ans[9])
{
  // den = 1/det(m): the six terms are the same as gpu_det3, in a
  // different (but algebraically equal) order.
  numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
    m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
    m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
  den = (numtyp)1.0/den;

  // Per-axis shape parameters of particle type i scale each result row.
  numtyp shapex=_shape_<numtyp>(i,0);
  numtyp shapey=_shape_<numtyp>(i,1);
  numtyp shapez=_shape_<numtyp>(i,2);

  // Row 0: scaled by shapex, uses m2 columns 0..2.
  ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
                   m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
                   m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
                   m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
                   m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;

  ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
                   (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
                   (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
                   m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
                   m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;

  ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
                   m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
                   m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
                   (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
                   m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;

  // Row 1: scaled by shapey, uses m2 columns 3..5.
  ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
                   m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
                   m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
                   m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
                   m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;

  ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
                   (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
                   (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
                   m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
                   m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;

  ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
                   m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
                   (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
                   m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
                   (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;

  // Row 2: scaled by shapez, uses m2 columns 6..8.
  ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
                   (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
                   m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
                   m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
                   m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;

  // Note the leading unary minus applied to the whole parenthesized sum.
  ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
                   (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
                   (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
                   m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
                   m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;

  ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
                   m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
                   m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
                   (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
                   m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
|
||||
|
||||
// Gay-Berne ellipsoid-ellipsoid kernel: one thread computes the full
// energy, force, torque, and virial contribution for one particle i by
// looping over its packed neighbor list (force decomposition by atom).
//   gum        - global Gay-Berne constants; gum[0..2] used here
//                (presumably gamma/upsilon/mu -- TODO confirm ordering)
//   special_lj - 4 special-bond LJ scale factors (staged in shared memory)
//   dev_nbor   - neighbor data: columns per particle, rows nbor_pitch apart
//   ans        - per-particle output written as
//                [energy?][virial x6?][fx fy fz][torx tory torz],
//                each field ans_pitch apart (energy/virial only when
//                eflag/vflag are set)
//   err_flag   - set to 2 by gpu_mldivide3 on a singular matrix
template<class numtyp, class acctyp>
__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
                                const int *dev_nbor, const int nbor_pitch,
                                acctyp *ans, size_t ans_pitch, int *err_flag,
                                const bool eflag, const bool vflag,
                                const int inum, const int nall) {

  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  // NOTE(review): sp_lj is read by all threads below but no __syncthreads()
  // follows the store -- presumably safe for the targeted launch
  // configuration; verify.
  ii+=INT_MUL(blockIdx.x,blockDim.x);

  if (ii<inum) {

    // Per-thread accumulators for this particle's totals.
    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp torx=(numtyp)0;
    acctyp tory=(numtyp)0;
    acctyp torz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Walk the neighbor-list header: atom index, (two skipped rows),
    // neighbor count, then the neighbor entries themselves.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,7);   // type stored in a numtyp field; truncated to int

    // a1 = rotation matrix (transpose form) for i; g1 = a1^T*S*a1;
    // b1 = a1^T*W*a1 (S, W = per-type shape/well diagonal matrices).
    numtyp a1[9], b1[9], g1[9];
    {
      numtyp t[9];
      gpu_quat_to_mat_trans(i,a1);
      gpu_shape_times3(itype,a1,t);
      gpu_transpose_times3(a1,t,g1);
      gpu_well_times3(itype,a1,t);
      gpu_transpose_times3(a1,t,b1);
    }

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {

      int j=*nbor;
      // Special-bond neighbors are encoded as j + n*nall; decode the
      // scale factor and the true index.
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int jtype=_x_<numtyp>(j,7);

      // Compute r12
      numtyp r12[3];
      r12[0] = _x_<numtyp>(j,0)-ix;
      r12[1] = _x_<numtyp>(j,1)-iy;
      r12[2] = _x_<numtyp>(j,2)-iz;
      numtyp ir = gpu_dot3(r12,r12);   // |r12|^2 here ...

      ir = rsqrt(ir);                  // ... now 1/|r12|
      numtyp r = (numtyp)1.0/ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(j,a2);

      numtyp u_r, dUr[3], tUr[3], eta, teta[3];
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          numtyp g2[9];
          {
            gpu_shape_times3(jtype,a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            gpu_plus3(g1,g2,g12);      // g12 = g1 + a2^T*S_j*a2
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- replace r12 with r12 hat
            r12[0]*=ir;
            r12[1]*=ir;
            r12[2]*=ir;

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              // Shifted LJ part: U_r = 4*eps*(rho^12 - rho^6) with
              // rho = sigma/(h12 + gamma*sigma).
              numtyp sigma = _sigma_<numtyp>(itype,jtype);
              numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
              dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
              dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
            }

            // torque for particle 1
            {
              numtyp tempv[3], tempv2[3];
              tempv[0] = -uslj_rsq*kappa[0];
              tempv[1] = -uslj_rsq*kappa[1];
              tempv[2] = -uslj_rsq*kappa[2];
              gpu_row_times3(kappa,g1,tempv2);
              gpu_cross3(tempv,tempv2,tUr);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
        }

        // Compute teta: accumulate cross products of the rows of a1 with
        // the corresponding rows of -eta*gum[1]*compute_eta_torque(...).
        numtyp temp[9], tempv[3], tempv2[3];
        compute_eta_torque(g12,a1,itype,temp);
        numtyp temp1 = -eta*gum[1];

        tempv[0] = temp1*temp[0];
        tempv[1] = temp1*temp[1];
        tempv[2] = temp1*temp[2];
        gpu_cross3(a1,tempv,tempv2);
        teta[0] = tempv2[0];
        teta[1] = tempv2[1];
        teta[2] = tempv2[2];

        tempv[0] = temp1*temp[3];
        tempv[1] = temp1*temp[4];
        tempv[2] = temp1*temp[5];
        gpu_cross3(a1+3,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];

        tempv[0] = temp1*temp[6];
        tempv[1] = temp1*temp[7];
        tempv[2] = temp1*temp[8];
        gpu_cross3(a1+6,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];
      }

      numtyp chi, dchi[3], tchi[3];
      { // Compute chi and dchi

        // Compute b12
        numtyp b2[9], b12[9];
        {
          gpu_well_times3(jtype,a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          gpu_plus3(b1,b2,b12);        // b12 = b1 + a2^T*W_j*a2
        }

        // compute chi_12  (r12 is currently r12hat; rescale to the full
        // displacement for the solve, then back to the unit vector)
        r12[0]*=r;
        r12[1]*=r;
        r12[2]*=r;
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        r12[0]*=ir;
        r12[1]*=ir;
        r12[2]*=ir;
        chi = gpu_dot3(r12,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);

        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12[2]);

        // compute t_chi
        numtyp tempv[3];
        gpu_row_times3(iota,b1,tempv);
        gpu_cross3(tempv,iota,tchi);
        temp1 = (numtyp)-4.0*ir*ir;
        tchi[0] *= temp1;
        tchi[1] *= temp1;
        tchi[2] *= temp1;
      }

      // Combine the three factors: U = U_r * eta * chi (scaled by the
      // special-bond factor); forces via the product rule.
      numtyp temp2 = factor_lj*eta*chi;
      if (eflag)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag) {
        // Restore the full (negated) displacement for the virial tally.
        r12[0]*=-r;
        r12[1]*=-r;
        r12[2]*=-r;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        fx+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        fy+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        fz+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        fx+=temp1*dchi[0]-temp2*dUr[0];
        fy+=temp1*dchi[1]-temp2*dUr[1];
        fz+=temp1*dchi[2]-temp2*dUr[2];
      }

      // Torque on 1
      temp1 = -u_r*eta*factor_lj;
      temp2 = -u_r*chi*factor_lj;
      numtyp temp3 = -chi*eta*factor_lj;
      torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
      tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
      torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];

    } // for nbor

    // Store answers (column ii, fields ans_pitch apart).
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1=fx;
    ap1+=ans_pitch;
    *ap1=fy;
    ap1+=ans_pitch;
    *ap1=fz;
    ap1+=ans_pitch;
    *ap1=torx;
    ap1+=ans_pitch;
    *ap1=tory;
    ap1+=ans_pitch;
    *ap1=torz;

  } // if ii
}
|
||||
|
||||
// Gay-Berne kernel for sphere-ellipsoid pairs: particle i is a sphere
// (isotropic shape/well, so no rotation matrix or torque is needed for i);
// neighbor j is an ellipsoid.  Processes particles [start, inum);
// overwrites (does not accumulate) the per-particle output slots.
// Output layout per particle: [energy?][virial x6?][fx fy fz].
template<class numtyp, class acctyp>
__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
                                 const int *dev_nbor, const int nbor_pitch,
                                 acctyp *ans, size_t ans_pitch, int *err_flag,
                                 const bool eflag, const bool vflag,
                                 const int start, const int inum,
                                 const int nall) {
  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  // NOTE(review): no __syncthreads() before other threads read sp_lj --
  // presumably safe for the targeted launch configuration; verify.
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Neighbor-list header: atom index, two skipped rows, count, entries.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,7);

    // A sphere's shape/well matrices are scalar multiples of the identity,
    // so a single diagonal entry stands in for the whole matrix.
    numtyp oner=_shape_<numtyp>(itype,0);
    numtyp one_well=_well_<numtyp>(itype,0);

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {

      int j=*nbor;
      // Decode special-bond encoding (j + n*nall).
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int jtype=_x_<numtyp>(j,7);

      // Compute r12
      numtyp r12[3];
      r12[0] = _x_<numtyp>(j,0)-ix;
      r12[1] = _x_<numtyp>(j,1)-iy;
      r12[2] = _x_<numtyp>(j,2)-iz;
      numtyp ir = gpu_dot3(r12,r12);   // |r12|^2 here ...

      ir = rsqrt(ir);                  // ... now 1/|r12|
      numtyp r = (numtyp)1.0/ir;

      numtyp r12hat[3];
      r12hat[0]=r12[0]*ir;
      r12hat[1]=r12[1]*ir;
      r12hat[2]=r12[2]*ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(j,a2);

      numtyp u_r, dUr[3], eta;
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          {
            numtyp g2[9];
            gpu_shape_times3(jtype,a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            // g12 = a2^T*S_j*a2 + oner*I (sphere contribution on the diagonal)
            g12[0]=g2[0]+oner;
            g12[4]=g2[4]+oner;
            g12[8]=g2[8]+oner;
            g12[1]=g2[1];
            g12[2]=g2[2];
            g12[3]=g2[3];
            g12[5]=g2[5];
            g12[6]=g2[6];
            g12[7]=g2[7];
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12hat,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              // Shifted LJ: rho = sigma/(h12 + gamma*sigma).
              numtyp sigma = _sigma_<numtyp>(itype,jtype);
              numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12hat);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
              dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
              dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
        }
      }

      numtyp chi, dchi[3];
      { // Compute chi and dchi

        // Compute b12
        numtyp b12[9];
        {
          numtyp b2[9];
          gpu_well_times3(jtype,a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          // b12 = a2^T*W_j*a2 + one_well*I
          b12[0]=b2[0]+one_well;
          b12[4]=b2[4]+one_well;
          b12[8]=b2[8]+one_well;
          b12[1]=b2[1];
          b12[2]=b2[2];
          b12[3]=b2[3];
          b12[5]=b2[5];
          b12[6]=b2[6];
          b12[7]=b2[7];
        }

        // compute chi_12
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        chi = gpu_dot3(r12hat,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);

        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12hat);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
      }

      // Combine factors: U = U_r * eta * chi, scaled by factor_lj.
      numtyp temp2 = factor_lj*eta*chi;
      if (eflag)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag) {
        // r12 still holds the raw displacement; negate it for the tally.
        r12[0]*=-1;
        r12[1]*=-1;
        r12[2]*=-1;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        fx+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        fy+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        fz+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        fx+=temp1*dchi[0]-temp2*dUr[0];
        fy+=temp1*dchi[1]-temp2*dUr[1];
        fz+=temp1*dchi[2]-temp2*dUr[2];
      }
    } // for nbor

    // Store answers (assignment, not accumulation; kernel_lj adds to
    // these slots afterwards).
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1=fx;
    ap1+=ans_pitch;
    *ap1=fy;
    ap1+=ans_pitch;
    *ap1=fz;
  } // if ii
}
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
|
||||
const int *dev_ij, const int nbor_pitch, acctyp *ans,
|
||||
size_t ans_pitch, int *err_flag, const bool eflag,
|
||||
const bool vflag, const int start, const int inum,
|
||||
const int nall) {
|
||||
__shared__ numtyp sp_lj[4];
|
||||
|
||||
// ii indexes the two interacting particles in gi
|
||||
int ii=threadIdx.x;
|
||||
if (ii<4)
|
||||
sp_lj[ii]=special_lj[ii];
|
||||
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
|
||||
|
||||
if (ii<inum) {
|
||||
|
||||
acctyp energy=(numtyp)0;
|
||||
acctyp fx=(numtyp)0;
|
||||
acctyp fy=(numtyp)0;
|
||||
acctyp fz=(numtyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(numtyp)0;
|
||||
|
||||
const int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
const int *list=dev_ij+*nbor;
|
||||
const int *list_end=list+numj;
|
||||
|
||||
numtyp ix=_x_<numtyp>(i,0);
|
||||
numtyp iy=_x_<numtyp>(i,1);
|
||||
numtyp iz=_x_<numtyp>(i,2);
|
||||
int itype=_x_<numtyp>(i,7);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; list<list_end; list++) {
|
||||
|
||||
int j=*list;
|
||||
if (j < nall)
|
||||
factor_lj = 1.0;
|
||||
else {
|
||||
factor_lj = sp_lj[j/nall];
|
||||
j %= nall;
|
||||
}
|
||||
int jtype=_x_<numtyp>(j,7);
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix-_x_<numtyp>(j,0);
|
||||
numtyp dely = iy-_x_<numtyp>(j,1);
|
||||
numtyp delz = iz-_x_<numtyp>(j,2);
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
|
||||
_form_(itype,jtype)==SPHERE_SPHERE) {
|
||||
r2inv=(numtyp)1.0/r2inv;
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
|
||||
_lj1_<numtyp>(itype,jtype).y);
|
||||
force*=factor_lj;
|
||||
|
||||
fx+=delx*force;
|
||||
fy+=dely*force;
|
||||
fz+=delz*force;
|
||||
|
||||
if (eflag) {
|
||||
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
|
||||
_lj3_<numtyp>(itype,jtype).y);
|
||||
energy+=factor_lj*(e-_offset_<numtyp>(1,1));
|
||||
}
|
||||
if (vflag) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
|
||||
// Store answers
|
||||
acctyp *ap1=ans+ii;
|
||||
if (eflag) {
|
||||
*ap1+=energy;
|
||||
ap1+=ans_pitch;
|
||||
}
|
||||
if (vflag) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=ans_pitch;
|
||||
}
|
||||
}
|
||||
*ap1+=fx;
|
||||
ap1+=ans_pitch;
|
||||
*ap1+=fy;
|
||||
ap1+=ans_pitch;
|
||||
*ap1+=fz;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
// Fast Lennard-Jones kernel: identical physics to kernel_lj, but all
// per-type-pair coefficient tables are staged into shared memory (valid
// when the number of types fits MAX_SHARED_TYPES).  Accumulates (+=) into
// the output slots.
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
                               const int *dev_ij, const int nbor_pitch,
                               acctyp *ans, size_t ans_pitch,int *err_flag,
                               const bool eflag, const bool vflag,
                               const int start, const int inum, const int nall){
  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  __shared__ numtyp sp_lj[4];
  __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  // Each of the first MAX_SHARED_TYPES^2 threads loads one (itype,jtype)
  // table entry; lj3/lj4/offset only matter when energies are requested.
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    form[ii]=_form_(itype,jtype);
    lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
    lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
    if (eflag) {
      lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
      lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
      offset[ii]=_offset_<numtyp>(itype,jtype);
    }
  }
  // NOTE(review): the shared tables are read below by threads other than
  // the ones that wrote them, with no __syncthreads() in between --
  // presumably safe for the targeted launch configuration; verify.
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Header: atom index, neighbor count, offset into the flat ij list.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    // Pre-multiply the row index so mtype = itype*MAX_SHARED_TYPES + jtype.
    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      // Decode special-bond encoding (j + n*nall).
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int mtype=itype+_x_<numtyp>(j,7);

      // Compute r12 (r2inv holds the squared distance until inverted below)
      numtyp delx = ix-_x_<numtyp>(j,0);
      numtyp dely = iy-_x_<numtyp>(j,1);
      numtyp delz = iz-_x_<numtyp>(j,2);
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
          energy+=factor_lj*(e-offset[mtype]);
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers: accumulate on top of the Gay-Berne kernel's results.
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1+=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1+=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1+=fx;
    ap1+=ans_pitch;
    *ap1+=fy;
    ap1+=ans_pitch;
    *ap1+=fz;

  } // if ii
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,146 @@
|
|||
/***************************************************************************
|
||||
gb_gpu_memory.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Global variables for GPU Gayberne Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Thu Jun 25 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "gb_gpu_memory.h"
|
||||
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
|
||||
|
||||
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() {
  // Gay-Berne needs more per-atom storage than plain LJ: 8 atom fields
  // (the GB kernels read positions from fields 0-2 and the type from
  // field 7) and 13 answer fields. Neighbor packing is enabled because
  // the GB kernels consume a packed neighbor layout.
  this->atom.atom_fields(8);
  this->atom.ans_fields(13);
  this->nbor.packing(true);
}
|
||||
|
||||
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
  // clear() checks this->allocated first, so destruction after an explicit
  // clear() (or a failed init) is safe.
  clear();
}
|
||||
|
||||
// Allocate device storage, upload the Gay-Berne + LJ coefficient tables to
// the selected GPU and bind the constant tables to textures.  Returns the
// host-side ij buffer the caller fills with neighbor data.
// NOTE(review): the return value of LJ_GPU_MemoryT::init() (0 on failure)
// is ignored here -- confirm failure is handled by the caller.
template <class numtyp, class acctyp>
int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
                          const double gamma, const double upsilon,
                          const double mu, double **host_shape,
                          double **host_well, double **host_cutsq,
                          double **host_sigma, double **host_epsilon,
                          double *host_lshape, int **h_form, double **host_lj1,
                          double **host_lj2, double **host_lj3,
                          double **host_lj4, double **host_offset,
                          double *host_special_lj, const int max_nbors,
                          const bool force_d, const int me) {
  if (this->allocated)
    clear();

  // Base-class init selects the device, sizes atom/nbor storage and uploads
  // the plain LJ tables.
  LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
                       host_lj1, host_lj2, host_lj3, host_lj4, host_offset,
                       host_special_lj, max_nbors, me);

  // Keep a host-side pointer to the form table for multiple_forms logic.
  host_form=h_form;

  // Initialize timers for the selected GPU
  time_kernel.init();
  time_gayberne.init();
  time_kernel2.init();
  time_gayberne2.init();

  // Use the write buffer from atom for data initialization
  NVC_HostT &host_write=this->atom.host_write;
  assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);

  // Allocate, cast and asynchronous memcpy of constant data
  gamma_upsilon_mu.safe_alloc(3);
  host_write[0]=static_cast<numtyp>(gamma);
  host_write[1]=static_cast<numtyp>(upsilon);
  host_write[2]=static_cast<numtyp>(mu);
  gamma_upsilon_mu.copy_from_host(host_write.begin());

  lshape.safe_alloc(ntypes);
  lshape.cast_copy(host_lshape,host_write);
  // NOTE(review): cast_copy appears to stage-and-copy already; the extra
  // copy_from_host below looks redundant -- confirm against NVC_VecT.
  lshape.copy_from_host(host_write.begin());

  // Copy shape, well, sigma, epsilon, and cutsq onto GPU
  shape.safe_alloc(ntypes,3);
  shape.cast_copy(host_shape[0],host_write);
  well.safe_alloc(ntypes,3);
  well.cast_copy(host_well[0],host_write);

  // Copy LJ data onto GPU
  // The form table is padded to MAX_SHARED_TYPES so the shared-memory LJ
  // kernels can index it with itype*MAX_SHARED_TYPES+jtype.
  int lj_types=ntypes;
  if (lj_types<=MAX_SHARED_TYPES)
    lj_types=MAX_SHARED_TYPES;
  form.safe_alloc(lj_types,lj_types);
  form.copy_2Dfrom_host(host_form[0],ntypes,ntypes);

  // See if we want fast GB-sphere or sphere-sphere calculations
  multiple_forms=false;
  for (int i=1; i<ntypes; i++)
    for (int j=i; j<ntypes; j++)
      if (host_form[i][j]!=ELLIPSE_ELLIPSE)
        multiple_forms=true;

  // Memory for ilist ordered by particle type
  host_olist.safe_alloc_rw(this->max_atoms);

  // Bind constant data to textures
  lshape_bind_texture<numtyp>(lshape);
  shape_bind_texture<numtyp>(shape);
  well_bind_texture<numtyp>(well);
  form_bind_texture(form);

  return this->nbor.host_ij.begin();
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void GB_GPU_MemoryT::clear() {
|
||||
if (!this->allocated)
|
||||
return;
|
||||
|
||||
int err_flag;
|
||||
this->dev_error.copy_to_host(&err_flag);
|
||||
if (err_flag == 1)
|
||||
std::cerr << "COLLISION BUFFER OVERFLOW OCCURED. INCREASE COLLISION_N "
|
||||
<< "and RECOMPILE.\n";
|
||||
else if (err_flag == 2)
|
||||
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
|
||||
|
||||
LJ_GPU_MemoryT::clear();
|
||||
|
||||
shape_unbind_texture<numtyp>();
|
||||
well_unbind_texture<numtyp>();
|
||||
form_unbind_texture();
|
||||
|
||||
shape.clear();
|
||||
well.clear();
|
||||
form.clear();
|
||||
lshape.clear();
|
||||
gamma_upsilon_mu.clear();
|
||||
host_olist.clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double GB_GPU_MemoryT::host_memory_usage() {
|
||||
return this->atom.host_memory_usage(this->max_atoms)+
|
||||
this->nbor.host_memory_usage()+4*sizeof(numtyp)+
|
||||
sizeof(GB_GPU_Memory<numtyp,acctyp>)+this->max_atoms*sizeof(int);
|
||||
}
|
||||
|
||||
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,74 @@
|
|||
/***************************************************************************
|
||||
gb_gpu_memory.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Global variables for GPU Gayberne Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Thu Jun 25 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H

// Maximum number of GPUs the Gay-Berne library will manage.
#define MAX_GPU_THREADS 4
#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp,acctyp>

// Interaction form for a type pair (shape of particle i vs shape of j).
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

// Device/host storage for the Gay-Berne pair style.  Extends the plain LJ
// storage with per-type shape/well matrices, the lshape vector, the form
// table and extra kernel timers.
template <class numtyp, class acctyp>
class GB_GPU_Memory : public LJ_GPU_MemoryT {
 public:
  GB_GPU_Memory();
  ~GB_GPU_Memory();

  // Allocate device storage and upload all coefficient tables; returns the
  // host-side ij buffer the caller fills with neighbor data.
  int* init(const int ij_size, const int ntypes, const double gamma,
            const double upsilon, const double mu, double **host_shape,
            double **host_well, double **host_cutsq, double **host_sigma,
            double **host_epsilon, double *host_lshape, int **h_form,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int max_nbors, const bool force_d, const int me);

  // Free all device/host storage (also reports kernel error flags).
  void clear();

  // Total host memory used by the library, in bytes.
  double host_memory_usage();

  // ---------------------------- DATA ----------------------------

  // ilist with particles sorted by type
  NVC_HostI host_olist;

  // --------------- Const Data for Atoms
  NVC_ConstMatT shape, well;
  NVC_ConstMatI form;
  NVC_VecT lshape, gamma_upsilon_mu;


  // --------------- Timing Stuff
  NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2;

  // True if we want to use fast GB-sphere or sphere-sphere calculations
  bool multiple_forms;
  // Host-side copy of the per-type-pair form table (set in init()).
  int **host_form;
  // NOTE(review): not assigned anywhere in this translation unit --
  // presumably maintained by the solver code; confirm before relying on it.
  int last_ellipse;

 private:
};

#endif
|
||||
|
|
@ -0,0 +1,234 @@
|
|||
/***************************************************************************
|
||||
lj_gpu.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Lennard-Jones potential GPU calcultation
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include "nvc_macros.h"
|
||||
#include "nvc_timer.h"
|
||||
#include "nvc_device.h"
|
||||
#include "lj_gpu_memory.h"
|
||||
#include "lj_gpu_kernel.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
|
||||
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Convert something to a string
|
||||
// ---------------------------------------------------------------------------
|
||||
#include <sstream>
|
||||
// ---------------------------------------------------------------------------
// Render any streamable value as text using 2 significant digits.
// ---------------------------------------------------------------------------
template <class t>
inline string lj_gpu_toa(const t& in) {
  ostringstream stream;
  stream.precision(2);
  stream << in;
  return stream.str();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Return string with GPU info
|
||||
// ---------------------------------------------------------------------------
|
||||
// Human-readable description of device `id`: name, core count, memory,
// clock and how many atoms fit given `max_nbors` neighbors per atom.
string lj_gpu_name(const int id, const int max_nbors) {
  string name=LJMF.gpu.name(id)+", "+
              lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
              lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
              lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+
              lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id),
                                            max_nbors))+" Atoms";
  return name;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device.
// Returns the host-side ij buffer (0 when no GPU is available); ij_size is
// set to the chunk size the caller must use when staging neighbor data.
int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
                  double **epsilon, double **host_lj1, double **host_lj2,
                  double **host_lj3, double **host_lj4, double **offset,
                  double *special_lj, const int max_nbors, const int gpu_id) {
  if (LJMF.gpu.num_devices()==0)
    return 0;

  ij_size=IJ_SIZE;
  return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
                   host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Clear memory on host and device
|
||||
// ---------------------------------------------------------------------------
|
||||
// Free all host/device memory held by the file-static LJMF object.
void lj_gpu_clear() {
  LJMF.clear();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// copy atom positions and optionally types to device
|
||||
// ---------------------------------------------------------------------------
|
||||
// Stage x/y/z (and on neighbor rebuild, the type array) into the pinned
// write buffer and start an asynchronous copy to the device on `stream`.
template <class numtyp, class acctyp>
inline void _lj_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
                         const int *host_type, const bool rebuild,
                         cudaStream_t &stream) {
  atom.time_atom.start();
  atom.reset_write_buffer();

  // First row of dev_x is x position, second is y, third is z
  // (host_x is an n x 3 array, so a stride of 3 walks one coordinate).
  atom.add_atom_data(host_x[0],3);
  atom.add_atom_data(host_x[0]+1,3);
  atom.add_atom_data(host_x[0]+2,3);

  // Number of columns staged so far.
  int csize=3;

  // If a rebuild occured, copy type data
  if (rebuild) {
    atom.add_atom_data(host_type);
    csize++;
  }

  atom.copy_atom_data(csize,stream);
  atom.time_atom.stop();
}
|
||||
|
||||
// C-callable wrapper: stage positions/types for the file-static LJMF object
// on its pair stream.
void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) {
  _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Signal that we need to transfer a new neighbor list
|
||||
// ---------------------------------------------------------------------------
|
||||
// Upload a fresh neighbor-list header (ilist + per-atom counts) for inum
// particles.  Returns false when the system no longer fits in the
// preallocated device storage, signalling the caller to fall back.
template <class LJMTyp>
bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum,
                         int *ilist, const int *numj) {
  if (nall>ljm.max_atoms)
    return false;

  ljm.nbor.time_nbor.start();

  ljm.atom.nall(nall);
  ljm.atom.inum(inum);
  ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream);

  ljm.nbor.time_nbor.stop();
  return true;
}
|
||||
|
||||
// C-callable wrapper for _lj_gpu_reset_nbors on the file-static LJMF object.
bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
                        const int *numj) {
  return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy a set of ij_size ij interactions to device and compute energies,
|
||||
// forces, and torques for those interactions
|
||||
// ---------------------------------------------------------------------------
|
||||
// Queue an asynchronous copy of num_ij packed ij entries to the device.
// The previous chunk's time is folded into the running total first.
template <class LJMTyp>
void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
  ljm.nbor.time_nbor.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed

  ljm.nbor.time_nbor.start();
  ljm.nbor.add(num_ij,ljm.pair_stream);
  ljm.nbor.time_nbor.stop();
}
|
||||
|
||||
// C-callable wrapper for _lj_gpu_nbors on the file-static LJMF object.
void lj_gpu_nbors(const int num_ij) {
  _lj_gpu_nbors(LJMF,num_ij);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies and forces for all ij interactions
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the LJ pair kernel for all local particles.  The fast variant
// caches the per-type coefficients in shared memory and is used when every
// type fits (ljm.shared_types, decided in init()).
// NOTE(review): the `rebuild` argument is currently unused here.
template <class numtyp, class acctyp>
void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
  // Compute the block size and grid size to keep all cores busy
  const int BX=BLOCK_1D;

  int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));

  ljm.time_pair.start();
  if (ljm.shared_types)
    kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
      (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
       ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
       ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
       vflag, ljm.atom.inum(), ljm.atom.nall());
  else
    kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
      (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
       ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
       ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
       vflag, ljm.atom.inum(), ljm.atom.nall());
  ljm.time_pair.stop();
}
|
||||
|
||||
// C-callable wrapper: launch the pair kernel on the file-static LJMF object.
void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
  _lj_gpu<PRECISION,ACC_PRECISION>(LJMF,eflag,vflag,rebuild);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Get energies and forces to host
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy per-particle answers back from the device, accumulate them into the
// caller's force/energy/virial arrays and return the total vdW energy.
template<class numtyp, class acctyp>
double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
                      const bool eflag, const bool vflag, const bool eflag_atom,
                      const bool vflag_atom, double *eatom, double **vatom,
                      double *virial) {
  double evdw;

  ljm.atom.time_answer.start();
  ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream);

  // Fold this step's timers into the running totals while the copy runs.
  ljm.atom.time_atom.add_to_total();
  ljm.nbor.time_nbor.add_to_total();
  ljm.time_pair.add_to_total();
  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed

  evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
  ljm.atom.add_forces(ilist,f);
  ljm.atom.time_answer.stop();
  ljm.atom.time_answer.add_to_total();
  return evdw;
}
|
||||
|
||||
// C-callable wrapper for _lj_gpu_forces on the file-static LJMF object.
double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
                     const bool vflag, const bool eflag_atom,
                     const bool vflag_atom, double *eatom, double **vatom,
                     double *virial) {
  return _lj_gpu_forces<PRECISION,ACC_PRECISION>
    (LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial);
}
|
||||
|
||||
// Print the accumulated per-phase timings (copies and kernel) to stdout.
void lj_gpu_time() {
  cout.precision(4);
  cout << "Atom copy:     " << LJMF.atom.time_atom.total_seconds() << " s.\n";
  cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
  cout << "LJ calc:       " << LJMF.time_pair.total_seconds() << " s.\n";
  cout << "Answer copy:   " << LJMF.atom.time_answer.total_seconds() << " s.\n";
}
|
||||
|
||||
// Number of CUDA devices visible to the library.
int lj_gpu_num_devices() {
  return LJMF.gpu.num_devices();
}
|
||||
|
||||
// Host memory (bytes) currently used by the library.
double lj_gpu_bytes() {
  return LJMF.host_memory_usage();
}
|
|
@ -0,0 +1,248 @@
|
|||
/***************************************************************************
|
||||
lj_gpu_kernel.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Routines that actually perform the force computation
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LJ_GPU_KERNEL
|
||||
#define LJ_GPU_KERNEL
|
||||
|
||||
// General LJ pair kernel: one thread per local particle i.  Energy, force
// and virial are accumulated in registers and written to the column-major
// answer array (columns ans_pitch apart) at the end.
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
                          const int *dev_ij, const int nbor_pitch, acctyp *ans,
                          size_t ans_pitch, const bool eflag,
                          const bool vflag, const int inum, const int nall) {
  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  // Bug fix: every thread reads sp_lj below, but only threads 0-3 wrote
  // it; synchronize so the shared data is visible block-wide.
  __syncthreads();
  ii+=INT_MUL(blockIdx.x,blockDim.x);

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Neighbor record for this thread: atom index, count, ij-list offset,
    // stored nbor_pitch apart.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=_x_<numtyp>(i,3);

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      // Special (bonded) neighbors are encoded as j+n*nall; decode the
      // special-LJ factor and the real index.
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int jtype=_x_<numtyp>(j,3);

      // Compute r12
      numtyp delx = ix-_x_<numtyp>(j,0);
      numtyp dely = iy-_x_<numtyp>(j,1);
      numtyp delz = iz-_x_<numtyp>(j,2);
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<_cutsq_<numtyp>(itype,jtype)) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv =r2inv*r2inv*r2inv;
        numtyp force =factor_lj*r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
                                             _lj1_<numtyp>(itype,jtype).y);

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
                          _lj3_<numtyp>(itype,jtype).y);
          // Bug fix: the energy shift must be looked up for this type
          // pair; it was hard-coded to the constant pair (1,1).
          energy+=factor_lj*(e-_offset_<numtyp>(itype,jtype));
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1=fx;
    ap1+=ans_pitch;
    *ap1=fy;
    ap1+=ans_pitch;
    *ap1=fz;

  } // if ii
}
|
||||
|
||||
// Shared-memory LJ pair kernel, used when all type pairs fit in
// MAX_SHARED_TYPES^2: per-type coefficients are cached in shared memory and
// indexed with mtype = itype*MAX_SHARED_TYPES + jtype.
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
                               const int *dev_ij, const int nbor_pitch,
                               acctyp *ans, size_t ans_pitch,const bool eflag,
                               const bool vflag, const int inum,
                               const int nall) {

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  __shared__ numtyp sp_lj[4];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
    lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
    if (eflag) {
      lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
      lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
      offset[ii]=_offset_<numtyp>(itype,jtype);
    }
  }
  // Bug fix: each thread loads at most one table entry above but reads
  // arbitrary entries below; synchronize so the shared tables are complete
  // before any thread uses them.
  __syncthreads();
  ii+=INT_MUL(blockIdx.x,blockDim.x);

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    // Neighbor record for this thread: atom index, count, ij-list offset.
    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    numtyp ix=_x_<numtyp>(i,0);
    numtyp iy=_x_<numtyp>(i,1);
    numtyp iz=_x_<numtyp>(i,2);
    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      // Special (bonded) neighbors are encoded as j+n*nall.
      if (j < nall)
        factor_lj = 1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      int mtype=itype+_x_<numtyp>(j,3);

      // Compute r12
      numtyp delx = ix-_x_<numtyp>(j,0);
      numtyp dely = iy-_x_<numtyp>(j,1);
      numtyp delz = iz-_x_<numtyp>(j,2);
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<cutsq[mtype]) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
          energy+=factor_lj*(e-offset[mtype]);
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii;
    if (eflag) {
      *ap1=energy;
      ap1+=ans_pitch;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=ans_pitch;
      }
    }
    *ap1=fx;
    ap1+=ans_pitch;
    *ap1=fy;
    ap1+=ans_pitch;
    *ap1=fz;

  } // if ii
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,167 @@
|
|||
/***************************************************************************
|
||||
lj_gpu_memory.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Global variables for GPU Lennard-Jones Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "lj_gpu_memory.h"
|
||||
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
|
||||
return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int LJ_GPU_MemoryT::get_max_atoms(const int gpu_bytes, const int max_nbors) {
|
||||
int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
|
||||
bytes_per_atom(max_nbors));
|
||||
if (matoms>MAX_ATOMS)
|
||||
matoms=MAX_ATOMS;
|
||||
return matoms;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
|
||||
double **host_cutsq, double **host_sigma,
|
||||
double **host_epsilon, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, const int max_nbors,
|
||||
const int me) {
|
||||
if (allocated)
|
||||
clear();
|
||||
|
||||
if (me>=gpu.num_devices())
|
||||
return 0;
|
||||
gpu.set(me);
|
||||
if (gpu.revision()<1.0)
|
||||
return 0;
|
||||
|
||||
// Initialize timers for the selected GPU
|
||||
time_pair.init();
|
||||
|
||||
// Initialize atom and nbor data
|
||||
max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
|
||||
atom.init(max_atoms);
|
||||
nbor.init(ij_size,max_atoms,max_nbors);
|
||||
|
||||
// Get a stream for computing pair potentials
|
||||
CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
|
||||
|
||||
// Use the write buffer from atom for data initialization
|
||||
NVC_HostT &host_write=atom.host_write;
|
||||
assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
|
||||
|
||||
// Copy data for bonded interactions
|
||||
special_lj.safe_alloc(4);
|
||||
special_lj.cast_copy(host_special_lj,host_write);
|
||||
|
||||
// Copy sigma, epsilon, and cutsq onto GPU
|
||||
sigma.safe_alloc(ntypes,ntypes);
|
||||
sigma.cast_copy(host_sigma[0],host_write);
|
||||
epsilon.safe_alloc(ntypes,ntypes);
|
||||
epsilon.cast_copy(host_epsilon[0],host_write);
|
||||
cutsq.safe_alloc(ntypes,ntypes);
|
||||
cutsq.cast_copy(host_cutsq[0],host_write);
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
int lj_types=ntypes;
|
||||
shared_types=false;
|
||||
if (lj_types<=MAX_SHARED_TYPES) {
|
||||
lj_types=MAX_SHARED_TYPES;
|
||||
shared_types=true;
|
||||
}
|
||||
offset.safe_alloc(lj_types,lj_types);
|
||||
offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes);
|
||||
double *t1=host_lj1[0];
|
||||
double *t2=host_lj2[0];
|
||||
for (int i=0; i<lj_types*lj_types; i++) {
|
||||
host_write[i*2]=t1[i];
|
||||
host_write[i*2+1]=t2[i];
|
||||
}
|
||||
lj1.safe_alloc(lj_types,lj_types);
|
||||
lj1.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()),
|
||||
ntypes,ntypes);
|
||||
t1=host_lj3[0];
|
||||
t2=host_lj4[0];
|
||||
for (int i=0; i<lj_types*lj_types; i++) {
|
||||
host_write[i*2]=t1[i];
|
||||
host_write[i*2+1]=t2[i];
|
||||
}
|
||||
lj3.safe_alloc(lj_types,lj_types);
|
||||
lj3.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()),
|
||||
ntypes,ntypes);
|
||||
|
||||
// Bind constant data to textures
|
||||
sigma_bind_texture<numtyp>(sigma);
|
||||
epsilon_bind_texture<numtyp>(epsilon);
|
||||
cutsq_bind_texture<numtyp>(cutsq);
|
||||
offset_bind_texture<numtyp>(offset);
|
||||
lj1_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj1);
|
||||
lj3_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj3);
|
||||
|
||||
dev_error.safe_alloc(1);
|
||||
dev_error.zero();
|
||||
|
||||
allocated=true;
|
||||
|
||||
return nbor.host_ij.begin();
|
||||
}
|
||||
|
||||
// Free all device/host storage and destroy the pair stream.  Safe to call
// more than once (guarded by `allocated`).
template <class numtyp, class acctyp>
void LJ_GPU_MemoryT::clear() {
  if (!allocated)
    return;
  allocated=false;

  // Check for any pair style specific errors here
  // NOTE(review): err_flag is fetched but never examined in this class;
  // derived classes (e.g. GB) read it before calling this clear().
  int err_flag;
  dev_error.copy_to_host(&err_flag);

  atom.clear();
  nbor.clear();

  sigma_unbind_texture<numtyp>();
  epsilon_unbind_texture<numtyp>();
  cutsq_unbind_texture<numtyp>();
  offset_unbind_texture<numtyp>();
  lj1_unbind_texture<typename cu_vec_traits<numtyp>::vec2>();
  lj3_unbind_texture<typename cu_vec_traits<numtyp>::vec2>();

  CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream));

  dev_error.clear();
  sigma.clear();
  epsilon.clear();
  special_lj.clear();
  cutsq.clear();
  offset.clear();
  lj1.clear();
  lj3.clear();
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double LJ_GPU_MemoryT::host_memory_usage() const {
|
||||
return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+
|
||||
sizeof(LJ_GPU_Memory<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
template class LJ_GPU_Memory<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,88 @@
|
|||
/***************************************************************************
|
||||
lj_gpu_memory.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Global variables for GPU Lennard-Jones Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LJ_GPU_MEMORY_H
|
||||
#define LJ_GPU_MEMORY_H
|
||||
|
||||
#include "nvc_device.h"
|
||||
#include "pair_gpu_atom.h"
|
||||
#include "pair_gpu_nbor.h"
|
||||
|
||||
#define BLOCK_1D 64
|
||||
#define MAX_SHARED_TYPES 8
|
||||
#define PERCENT_GPU_MEMORY 0.7
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class LJ_GPU_Memory {
|
||||
public:
|
||||
LJ_GPU_Memory() : allocated(false) {}
|
||||
~LJ_GPU_Memory() { clear(); }
|
||||
|
||||
/// Allocate memory on host and device
|
||||
int* init(const int ij_size, const int ntypes, double **host_cutsq,
|
||||
double **host_sigma, double **host_epsilon,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset, double *host_special_lj,
|
||||
const int max_nbors, const int me);
|
||||
/// Free any memory on host and device
|
||||
void clear();
|
||||
|
||||
/// Returns memory usage on GPU per atom
|
||||
int bytes_per_atom(const int max_nbors) const;
|
||||
/// Maximum number of atoms that can be stored on GPU
|
||||
int get_max_atoms(const int gpu_bytes, const int max_nbors);
|
||||
/// Total host memory used by library
|
||||
double host_memory_usage() const;
|
||||
|
||||
// ------------------------- DATA -----------------------------
|
||||
|
||||
// Device Properties
|
||||
NVCDevice gpu;
|
||||
// Device Error Flag
|
||||
NVC_VecI dev_error;
|
||||
// Stream for asynchronous work
|
||||
cudaStream_t pair_stream;
|
||||
|
||||
// Atom Data
|
||||
PairGPUAtom<numtyp,acctyp> atom;
|
||||
// Neighbor Data
|
||||
PairGPUNbor nbor;
|
||||
|
||||
// --------------- Const Data for Atoms
|
||||
NVC_ConstMatT sigma, epsilon, cutsq, offset;
|
||||
NVC_ConstMat< typename cu_vec_traits<numtyp>::vec2 > lj1, lj3;
|
||||
NVC_VecT special_lj;
|
||||
|
||||
size_t max_atoms;
|
||||
|
||||
// Timing for pair calculation
|
||||
NVCTimer time_pair;
|
||||
|
||||
// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
protected:
|
||||
bool allocated;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,96 @@
|
|||
/***************************************************************************
|
||||
nvc_device.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Utilities for dealing with cuda devices
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Wed Jan 28 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "nvc_macros.h"
|
||||
#include "nvc_device.h"
|
||||
|
||||
// Grabs the properties for all devices
|
||||
// Query every CUDA device and cache its properties; device 0 becomes current.
NVCDevice::NVCDevice() {
  CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices));
  for (int dev=0; dev<_num_devices; ++dev) {
    cudaDeviceProp prop;
    CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, dev));
    // A 9999.9999 revision marks the emulation-only pseudo-device; stop there.
    if (prop.major == 9999 && prop.minor == 9999)
      break;
    _properties.push_back(prop);
  }
  _device = 0;
}
|
||||
|
||||
// Set the CUDA device to the specified device number
|
||||
// Make device `num` current.  A no-op when it already is; otherwise the
// existing context is torn down before switching.
void NVCDevice::set(int num) {
  if (_device == num)
    return;                          // already active
  cudaThreadExit();                  // release resources on the old device
  CUDA_SAFE_CALL(cudaSetDevice(num));
  _device = num;
}
|
||||
|
||||
// List all devices along with all properties
|
||||
// List all devices along with all properties.
// NOTE(review): `out` is accepted but all output goes to stdout via printf,
// as in the original; parameter kept for interface compatibility -- confirm
// whether callers expect output on `out`.
// Fixes: size_t device-property fields were printed with %u (undefined
// behavior on LP64 hosts) -- now cast and printed with %lu; cores() returns
// unsigned and is printed with %u.
void NVCDevice::print_all(ostream &out) {
  if (num_devices() == 0)
    printf("There is no device supporting CUDA\n");
  for (int i=0; i<num_devices(); ++i) {
    printf("\nDevice %d: \"%s\"\n", i, name(i).c_str());
    printf("  Revision number: %.1f\n", revision(i));
    printf("  Total amount of global memory: %.2f GB\n",
           gigabytes(i));
#if CUDART_VERSION >= 2000
    printf("  Number of multiprocessors: %d\n",
           _properties[i].multiProcessorCount);
    printf("  Number of cores: %u\n",cores(i));
#endif
    printf("  Total amount of constant memory: %lu bytes\n",
           (unsigned long)_properties[i].totalConstMem);
    printf("  Total amount of shared memory per block: %lu bytes\n",
           (unsigned long)_properties[i].sharedMemPerBlock);
    printf("  Total number of registers available per block: %d\n",
           _properties[i].regsPerBlock);
    printf("  Warp size: %d\n",
           _properties[i].warpSize);
    printf("  Maximum number of threads per block: %d\n",
           _properties[i].maxThreadsPerBlock);
    printf("  Maximum sizes of each dimension of a block: %d x %d x %d\n",
           _properties[i].maxThreadsDim[0],
           _properties[i].maxThreadsDim[1],
           _properties[i].maxThreadsDim[2]);
    printf("  Maximum sizes of each dimension of a grid: %d x %d x %d\n",
           _properties[i].maxGridSize[0],
           _properties[i].maxGridSize[1],
           _properties[i].maxGridSize[2]);
    printf("  Maximum memory pitch: %lu bytes\n",
           (unsigned long)_properties[i].memPitch);
    printf("  Texture alignment: %lu bytes\n",
           (unsigned long)_properties[i].textureAlignment);
    printf("  Clock rate: %.2f GHz\n",
           clock_rate(i));
#if CUDART_VERSION >= 2000
    printf("  Concurrent copy and execution: %s\n",
           _properties[i].deviceOverlap ? "Yes" : "No");
#endif
  }
}
|
||||
|
|
@ -0,0 +1,92 @@
|
|||
/***************************************************************************
|
||||
nvc_device.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Utilities for dealing with cuda devices
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Wed Jan 28 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef NVC_DEVICE
|
||||
#define NVC_DEVICE
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
/// Class for looking at device properties
|
||||
/** \note Calls to change the device outside of the class results in incorrect
|
||||
* behavior
|
||||
* \note There is no error checking for indexing past the number of devices **/
|
||||
class NVCDevice {
|
||||
public:
|
||||
/// Grabs the properties for all devices
|
||||
NVCDevice();
|
||||
|
||||
/// Return the number of devices that support CUDA
|
||||
inline int num_devices() { return _properties.size(); }
|
||||
|
||||
/// Set the CUDA device to the specified device number
|
||||
void set(int num);
|
||||
|
||||
/// Get the current device number
|
||||
inline int device_num() { return _device; }
|
||||
|
||||
/// Get the current CUDA device name
|
||||
inline string name() { return name(_device); }
|
||||
/// Get the CUDA device name
|
||||
inline string name(const int i) { return string(_properties[i].name); }
|
||||
|
||||
/// Get the number of cores in the current device
|
||||
inline unsigned cores() { return cores(_device); }
|
||||
/// Get the number of cores
|
||||
inline unsigned cores(const int i)
|
||||
{ return _properties[i].multiProcessorCount*8; }
|
||||
|
||||
/// Get the gigabytes of global memory in the current device
|
||||
inline double gigabytes() { return gigabytes(_device); }
|
||||
/// Get the gigabytes of global memory
|
||||
inline double gigabytes(const int i)
|
||||
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
|
||||
|
||||
/// Get the bytes of global memory in the current device
|
||||
inline size_t bytes() { return bytes(_device); }
|
||||
/// Get the bytes of global memory
|
||||
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
|
||||
|
||||
/// Return the GPGPU revision number for current device
|
||||
inline double revision() { return revision(_device); }
|
||||
/// Return the GPGPU revision number
|
||||
inline double revision(const int i)
|
||||
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
|
||||
|
||||
/// Clock rate in GHz for current device
|
||||
inline double clock_rate() { return clock_rate(_device); }
|
||||
/// Clock rate in GHz
|
||||
inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;}
|
||||
|
||||
/// List all devices along with all properties
|
||||
void print_all(ostream &out);
|
||||
|
||||
private:
|
||||
int _device, _num_devices;
|
||||
vector<cudaDeviceProp> _properties;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,31 @@
|
|||
/***************************************************************************
|
||||
nvc_get_devices.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
List properties of cuda devices
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Wed Jan 28 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "nvc_device.h"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
NVCDevice gpu;
|
||||
gpu.print_all(cout);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
#ifndef NVC_MACROS_H
#define NVC_MACROS_H

#include <stdio.h>
#include <stdlib.h>  /* exit(), EXIT_FAILURE -- used by the checked-call macros */
#include "math_constants.h"

/* 24-bit multiply; faster than a full 32-bit multiply on early CUDA parts. */
#define INT_MUL(x,y) (__mul24(x,y))
//#define INT_MUL(x,y) ((x)*(y))

/* Positive infinity constant for the device numeric type in use. */
template <class numbr>
static __inline__ __device__ numbr cuda_inf() { return CUDART_INF_F; }

#ifdef CUDA_DOUBLE
template <>
static __inline__ __device__ double cuda_inf<double>() { return CUDART_INF; }
#endif

/* Zero constant for the device numeric type in use. */
template <class numbr>
static __inline__ __device__ numbr cuda_zero() { return 0.0; }

template <>
static __inline__ __device__ float cuda_zero<float>() { return 0.0f; }

#ifdef DEBUG

/* All checked-call macros are wrapped in do { } while (0) so they behave
 * as a single statement inside if/else bodies.  Fixes: CUT_SAFE_CALL and
 * CUT_CONDITION were bare blocks, and CUT_SAFE_MALLOC carried a trailing
 * semicolon after while(0) -- both break `if (x) MACRO(y); else ...`. */

# define CU_SAFE_CALL_NO_SYNC( call ) do {                                   \
    CUresult err = call;                                                     \
    if( CUDA_SUCCESS != err) {                                               \
        fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n",   \
                err, __FILE__, __LINE__ );                                   \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CU_SAFE_CALL( call ) do {                                           \
    CU_SAFE_CALL_NO_SYNC(call);                                              \
    CUresult err = cuCtxSynchronize();                                       \
    if( CUDA_SUCCESS != err) {                                               \
        fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n",   \
                err, __FILE__, __LINE__ );                                   \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CUDA_SAFE_CALL_NO_SYNC( call) do {                                  \
    cudaError err = call;                                                    \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString( err) );              \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CUDA_SAFE_CALL( call) do {                                          \
    CUDA_SAFE_CALL_NO_SYNC(call);                                            \
    cudaError err = cudaThreadSynchronize();                                 \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString( err) );              \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CUFFT_SAFE_CALL( call) do {                                         \
    cufftResult err = call;                                                  \
    if( CUFFT_SUCCESS != err) {                                              \
        fprintf(stderr, "CUFFT error in file '%s' in line %i.\n",            \
                __FILE__, __LINE__);                                         \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

# define CUT_SAFE_CALL( call) do {                                           \
    if( CUTTrue != call) {                                                   \
        fprintf(stderr, "Cut error in file '%s' in line %i.\n",              \
                __FILE__, __LINE__);                                         \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do {                                  \
    cudaError_t err = cudaGetLastError();                                    \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
    err = cudaThreadSynchronize();                                           \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

//! Check for malloc error
# define CUT_SAFE_MALLOC( mallocCall ) do{                                   \
    if( !(mallocCall)) {                                                     \
        fprintf(stderr, "Host malloc failure in file '%s' in line %i\n",     \
                __FILE__, __LINE__);                                         \
        exit(EXIT_FAILURE);                                                  \
    } } while(0)

//! Check if conditon is true (flexible assert)
# define CUT_CONDITION( val) do {                                            \
    if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) {           \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

#else  // not DEBUG

#define CUT_BANK_CHECKER( array, index) array[index]

// void macros for performance reasons
# define CUT_CHECK_ERROR(errorMessage)
# define CUT_CHECK_ERROR_GL()
# define CUT_CONDITION( val)
# define CU_SAFE_CALL_NO_SYNC( call) call
# define CU_SAFE_CALL( call) call
# define CUDA_SAFE_CALL_NO_SYNC( call) call
# define CUDA_SAFE_CALL( call) call
# define CUT_SAFE_CALL( call) call
# define CUFFT_SAFE_CALL( call) call
# define CUT_SAFE_MALLOC( mallocCall ) mallocCall

#endif

#endif
|
|
@ -0,0 +1,447 @@
|
|||
/***************************************************************************
|
||||
nvc_memory.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Routines for memory management on CUDA devices
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Thu Jun 25 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef NVC_MEMORY_H
|
||||
#define NVC_MEMORY_H
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#define NVC_HostT NVC_Host<numtyp>
|
||||
#define NVC_HostD NVC_Host<double>
|
||||
#define NVC_HostS NVC_Host<float>
|
||||
#define NVC_HostI NVC_Host<int>
|
||||
|
||||
#define NVC_VecT NVC_Vec<numtyp>
|
||||
#define NVC_VecD NVC_Vec<double>
|
||||
#define NVC_VecS NVC_Vec<float>
|
||||
#define NVC_VecI NVC_Vec<int>
|
||||
#define NVC_VecI2 NVC_Vec<int2>
|
||||
#define NVC_VecU2 NVC_Vec<uint2>
|
||||
|
||||
#define NVC_MatT NVC_Mat<numtyp>
|
||||
#define NVC_MatD NVC_Mat<double>
|
||||
#define NVC_MatS NVC_Mat<float>
|
||||
#define NVC_MatI NVC_Mat<int>
|
||||
|
||||
#define NVC_ConstMatT NVC_ConstMat<numtyp>
|
||||
#define NVC_ConstMatD NVC_ConstMat<double>
|
||||
#define NVC_ConstMatS NVC_ConstMat<float>
|
||||
#define NVC_ConstMatI NVC_ConstMat<int>
|
||||
#define NVC_ConstMatD2 NVC_ConstMat<double2>
|
||||
|
||||
namespace NVC {
|
||||
|
||||
// Get a channel for float array
|
||||
template <class numtyp>
|
||||
inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
// Get a channel for float2 array
|
||||
template <>
|
||||
inline void cuda_gb_get_channel<float2>(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
// Get a channel for double array
|
||||
template <>
|
||||
inline void cuda_gb_get_channel<double>(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
// Get a channel for double array
|
||||
template <>
|
||||
inline void cuda_gb_get_channel<double2>(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
// Get a channel for int array
|
||||
template <>
|
||||
inline void cuda_gb_get_channel<int>(cudaChannelFormatDesc &channel) {
|
||||
channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Page-locked Row Vector on Host
|
||||
template <class numtyp>
|
||||
class NVC_Host {
|
||||
public:
|
||||
NVC_Host() { _cols=0; }
|
||||
~NVC_Host() { if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); }
|
||||
|
||||
// Allocate page-locked memory with fast write/slow read on host
|
||||
inline void safe_alloc_w(const size_t cols) {
|
||||
_cols=cols;
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes,
|
||||
cudaHostAllocWriteCombined));
|
||||
_end=_array+cols;
|
||||
}
|
||||
|
||||
// Allocate page-locked memory with fast read/write on host
|
||||
inline void safe_alloc_rw(const size_t cols) {
|
||||
_cols=cols;
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes));
|
||||
_end=_array+cols;
|
||||
}
|
||||
|
||||
/// Free any memory associated with device
|
||||
inline void clear()
|
||||
{ if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { memset(_array,0,row_bytes()); }
|
||||
|
||||
/// Set first n elements to zero
|
||||
inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); }
|
||||
|
||||
inline numtyp * begin() { return _array; }
|
||||
inline const numtyp * begin() const { return _array; }
|
||||
inline numtyp * end() { return _end; }
|
||||
inline const numtyp * end() const { return _end; }
|
||||
|
||||
inline size_t numel() const { return _cols; }
|
||||
inline size_t rows() const { return 1; }
|
||||
inline size_t cols() const { return _cols; }
|
||||
inline size_t row_size() const { return _cols; }
|
||||
inline size_t row_bytes() const { return _row_bytes; }
|
||||
|
||||
inline numtyp & operator[](const int i) { return _array[i]; }
|
||||
inline const numtyp & operator[](const int i) const { return _array[i]; }
|
||||
|
||||
/// Copy from device (numel is not bytes)
|
||||
inline void copy_from_device(const numtyp *device_p, size_t numel) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp),
|
||||
cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
/// Copy to device (numel is not bytes)
|
||||
inline void copy_to_device(numtyp *device_p, size_t numel) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
/// Copy to 2D matrix on device (numel is not bytes)
|
||||
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
|
||||
const size_t rows, const size_t cols) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp),
|
||||
_array,cols*sizeof(numtyp),
|
||||
cols*sizeof(numtyp),rows,
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
/// Asynchronous copy from device (numel is not bytes)
|
||||
inline void copy_from_device(const numtyp *device_p, size_t numel,
|
||||
cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
|
||||
cudaMemcpyDeviceToHost,stream));
|
||||
}
|
||||
|
||||
/// Asynchronous copy to device (numel is not bytes)
|
||||
inline void copy_to_device(numtyp *device_p, size_t numel,
|
||||
cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
/// Asynchronous copy to 2D matrix on device (numel is not bytes)
|
||||
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
|
||||
const size_t rows, const size_t cols,
|
||||
cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp),
|
||||
_array,cols*sizeof(numtyp),
|
||||
cols*sizeof(numtyp),rows,
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
private:
|
||||
numtyp *_array, *_end;
|
||||
size_t _row_bytes, _row_size, _rows, _cols;
|
||||
};
|
||||
|
||||
/// Row vector on device
|
||||
template <class numtyp>
|
||||
class NVC_Vec {
|
||||
public:
|
||||
NVC_Vec() { _cols=0; }
|
||||
~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); }
|
||||
|
||||
// Row vector on device
|
||||
inline void safe_alloc(const size_t cols) {
|
||||
_cols=cols;
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes));
|
||||
_end=_array+cols;
|
||||
}
|
||||
|
||||
/// Free any memory associated with device
|
||||
inline void clear()
|
||||
{ if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); }
|
||||
|
||||
inline numtyp * begin() { return _array; }
|
||||
inline const numtyp * begin() const { return _array; }
|
||||
inline numtyp * end() { return _end; }
|
||||
inline const numtyp * end() const { return _end; }
|
||||
|
||||
inline size_t numel() const { return _cols; }
|
||||
inline size_t rows() const { return 1; }
|
||||
inline size_t cols() const { return _cols; }
|
||||
inline size_t row_size() const { return _cols; }
|
||||
inline size_t row_bytes() const { return _row_bytes; }
|
||||
|
||||
/// Copy from host
|
||||
inline void copy_from_host(const numtyp *host_p)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(),
|
||||
cudaMemcpyHostToDevice)); }
|
||||
|
||||
/// Asynchronous copy from host
|
||||
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(),
|
||||
cudaMemcpyHostToDevice, stream)); }
|
||||
|
||||
/// Copy to host
|
||||
inline void copy_to_host(numtyp *host_p)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),
|
||||
cudaMemcpyDeviceToHost)); }
|
||||
|
||||
/// Copy n elements to host
|
||||
inline void copy_to_host(numtyp *host_p, const int n)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp),
|
||||
cudaMemcpyDeviceToHost)); }
|
||||
|
||||
/// Cast and then copy to device
|
||||
template <class numtyp2>
|
||||
inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
|
||||
for (int i=0; i<numel(); i++)
|
||||
host_write[i]=static_cast<numtyp>(buffer[i]);
|
||||
copy_from_host(host_write.begin());
|
||||
}
|
||||
|
||||
/// Bind to texture
|
||||
template <class texture>
|
||||
inline void bind_texture(texture &texi, cudaChannelFormatDesc &channel) {
|
||||
NVC::cuda_gb_get_channel<numtyp>(channel);
|
||||
texi.addressMode[0] = cudaAddressModeClamp;
|
||||
texi.addressMode[1] = cudaAddressModeClamp;
|
||||
texi.filterMode = cudaFilterModePoint;
|
||||
texi.normalized = false;
|
||||
CUDA_SAFE_CALL(cudaBindTexture(NULL,&texi,_array,&channel));
|
||||
}
|
||||
|
||||
/// Output the vector (debugging)
|
||||
inline void print(std::ostream &out) { print (out, numel()); }
|
||||
|
||||
// Output first n elements of vector
|
||||
inline void print(std::ostream &out, const int n) {
|
||||
numtyp *t=new numtyp[n];
|
||||
copy_to_host(t,n);
|
||||
for (int i=0; i<n; i++)
|
||||
out << t[i] << " ";
|
||||
delete []t;
|
||||
}
|
||||
|
||||
private:
|
||||
numtyp *_array, *_end;
|
||||
size_t _row_bytes, _row_size, _rows, _cols;
|
||||
};
|
||||
|
||||
/// 2D Matrix on device (can have extra column storage to get correct alignment)
|
||||
template <class numtyp>
|
||||
class NVC_Mat {
|
||||
public:
|
||||
NVC_Mat() { _rows=0; }
|
||||
~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); }
|
||||
|
||||
// Row major matrix on device
|
||||
// - Coalesced access using adjacent cols on same row
|
||||
// - NVC_Mat(row,col) given by array[row*row_size()+col]
|
||||
inline void safe_alloc(const size_t rows, const size_t cols) {
|
||||
_rows=rows;
|
||||
_cols=cols;
|
||||
CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch,
|
||||
cols*sizeof(numtyp),rows));
|
||||
_row_size=_pitch/sizeof(numtyp);
|
||||
_end=_array+_row_size*cols;
|
||||
}
|
||||
|
||||
/// Free any memory associated with device
|
||||
inline void clear()
|
||||
{ if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0, _pitch*_rows)); }
|
||||
|
||||
inline numtyp * begin() { return _array; }
|
||||
inline const numtyp * begin() const { return _array; }
|
||||
inline numtyp * end() { return _end; }
|
||||
inline const numtyp * end() const { return _end; }
|
||||
|
||||
|
||||
inline size_t numel() const { return _cols*_rows; }
|
||||
inline size_t rows() const { return _rows; }
|
||||
inline size_t cols() const { return _cols; }
|
||||
inline size_t row_size() const { return _row_size; }
|
||||
inline size_t row_bytes() const { return _pitch; }
|
||||
|
||||
/// Copy from host (elements not bytes)
|
||||
inline void copy_from_host(const numtyp *host_p, const size_t numel)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice)); }
|
||||
|
||||
/// Asynchronous copy from host (elements not bytes)
|
||||
inline void copy_from_host(const numtyp *host_p, const size_t numel,
|
||||
cudaStream_t &stream)
|
||||
{ CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice, stream)); }
|
||||
|
||||
/// Asynchronous Copy from Host
|
||||
/** \note Used when the number of columns/rows allocated on host smaller than
|
||||
* on device **/
|
||||
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
|
||||
const size_t cols, cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp),
|
||||
cols*sizeof(numtyp), rows,
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
private:
|
||||
numtyp *_array, *_end;
|
||||
size_t _pitch, _row_size, _rows, _cols;
|
||||
};
|
||||
|
||||
/// Const 2D Matrix on device (requires texture binding)
|
||||
template <class numtyp>
|
||||
class NVC_ConstMat {
|
||||
public:
|
||||
NVC_ConstMat() { _rows=0; }
|
||||
~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); }
|
||||
|
||||
/// Row major matrix on device
|
||||
inline void safe_alloc(const size_t rows, const size_t cols) {
|
||||
_rows=rows;
|
||||
_cols=cols;
|
||||
|
||||
NVC::cuda_gb_get_channel<numtyp>(_channel);
|
||||
CUDA_SAFE_CALL(cudaMallocArray(&_array, &_channel, cols, rows));
|
||||
}
|
||||
|
||||
/// Bind to texture
|
||||
template <class texture>
|
||||
inline void bind_texture(texture &texi) {
|
||||
texi.addressMode[0] = cudaAddressModeClamp;
|
||||
texi.addressMode[1] = cudaAddressModeClamp;
|
||||
texi.filterMode = cudaFilterModePoint;
|
||||
texi.normalized = false;
|
||||
CUDA_SAFE_CALL(cudaBindTextureToArray(&texi,_array,&_channel));
|
||||
}
|
||||
|
||||
/// Free any memory associated with device
|
||||
inline void clear()
|
||||
{ if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFreeArray(_array)); } }
|
||||
|
||||
inline size_t numel() const { return _cols*_rows; }
|
||||
inline size_t rows() const { return _rows; }
|
||||
inline size_t cols() const { return _cols; }
|
||||
inline size_t row_size() const { return _cols; }
|
||||
inline size_t row_bytes() const { return _cols*sizeof(numtyp); }
|
||||
|
||||
/// Copy from Host
|
||||
inline void copy_from_host(const numtyp *host_p) {
|
||||
CUDA_SAFE_CALL(cudaMemcpyToArray(_array, 0, 0, host_p,
|
||||
numel()*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
/// Copy from Host
|
||||
/** \note Used when the number of columns/rows allocated on host smaller than
|
||||
* on device **/
|
||||
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
|
||||
const size_t cols) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array, 0, 0, host_p,
|
||||
cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
/// Asynchronous Copy from Host
|
||||
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
|
||||
numel()*sizeof(numtyp),
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
/// Asynchronous Copy from Host
|
||||
/** \note Used when the number of columns/rows allocated on host smaller than
|
||||
* on device **/
|
||||
inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
|
||||
const size_t cols, cudaStream_t &stream) {
|
||||
CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
|
||||
cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
|
||||
cudaMemcpyHostToDevice,stream));
|
||||
}
|
||||
|
||||
/// Cast buffer to numtyp in host_write and copy to array
|
||||
template <class numtyp2>
|
||||
inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
|
||||
int n=numel();
|
||||
for (int i=0; i<n; i++) {
|
||||
host_write[i]=static_cast<numtyp>(*buffer); buffer++;
|
||||
}
|
||||
copy_from_host(host_write.begin());
|
||||
}
|
||||
|
||||
/// Cast buffer to numtyp in host_write and copy to array
|
||||
/** \note Used when the number of columns/rows allocated on host smaller than
|
||||
* on device **/
|
||||
template <class numtyp2>
|
||||
inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write,
|
||||
const size_t rows, const size_t cols) {
|
||||
int n=rows*cols;
|
||||
for (int i=0; i<n; i++) {
|
||||
host_write[i]=static_cast<numtyp>(*buffer); buffer++;
|
||||
}
|
||||
copy_2Dfrom_host(host_write.begin(),rows,cols);
|
||||
}
|
||||
|
||||
  /// Cast buffer to numtyp in host_write and copy to array asynchronously
  /** The cast into host_write happens synchronously on the CPU; only the
    * host-to-device transfer is enqueued on stream. **/
  template <class numtyp2>
  inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write,
                        cudaStream_t &stream) {
    int n=numel();
    for (int i=0; i<n; i++) {
      host_write[i]=static_cast<numtyp>(*buffer); buffer++;
    }
    copy_from_host(host_write.begin(),stream);
  }
|
||||
|
||||
 private:
  size_t _rows, _cols;             // Allocated matrix dimensions
  cudaArray *_array;               // Underlying CUDA array (texture-bindable)
  cudaChannelFormatDesc _channel;  // Channel format used when binding textures
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,83 @@
|
|||
/***************************************************************************
|
||||
nvc_timer.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Class for timing CUDA routines
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the NVC Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Feb 3 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef NVC_TIMER_H
|
||||
#define NVC_TIMER_H
|
||||
|
||||
#include "nvc_macros.h"

// NOTE(review): this macro compiles cudaEventDestroy calls out entirely, so
// NVCTimer's destructor intentionally leaks its event handles.  Presumably a
// workaround for destroy-after-context-teardown issues during static
// destruction — confirm before removing.
#define cudaEventDestroy(a)
|
||||
|
||||
/// Class for timing CUDA events
|
||||
class NVCTimer {
|
||||
public:
|
||||
NVCTimer() : _total_time(0.0f), initialized(false) { }
|
||||
|
||||
~NVCTimer() {
|
||||
if (initialized)
|
||||
{ cudaEventDestroy(start_event); cudaEventDestroy(stop_event); }
|
||||
}
|
||||
|
||||
inline void init() {
|
||||
if (!initialized) {
|
||||
initialized=true;
|
||||
CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
|
||||
CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );
|
||||
}
|
||||
}
|
||||
|
||||
/// Start timing
|
||||
inline void start() { cudaEventRecord(start_event,0); }
|
||||
|
||||
/// Stop timing and store event time
|
||||
inline void stop() { cudaEventRecord(stop_event,0); }
|
||||
|
||||
/// Set the time elapsed to zero (not the total_time)
|
||||
inline void zero()
|
||||
{ cudaEventRecord(start_event,0); cudaEventRecord(stop_event,0); }
|
||||
|
||||
/// Add time from previous start and stop to total
|
||||
/** Forces synchronization **/
|
||||
inline void add_to_total() { _total_time+=time(); }
|
||||
|
||||
/// Return the time (ms) of last start to stop - Forces synchronization
|
||||
inline double time() {
|
||||
float timer;
|
||||
cudaEventSynchronize(stop_event);
|
||||
CUDA_SAFE_CALL( cudaEventElapsedTime(&timer,start_event,stop_event) );
|
||||
return timer;
|
||||
}
|
||||
|
||||
/// Return the total time in ms
|
||||
inline double total_time() { return _total_time; }
|
||||
|
||||
/// Return the total time in seconds
|
||||
inline double total_seconds() { return _total_time/1000.0; }
|
||||
|
||||
private:
|
||||
cudaEvent_t start_event, stop_event;
|
||||
double _total_time;
|
||||
bool initialized;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,173 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_atom.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Memory routines for moving atom and force data between host and gpu
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "pair_gpu_atom.h"
|
||||
#define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
|
||||
|
||||
// Device bytes needed per atom: one numtyp per atom field (input data) plus
// one acctyp per answer field (energy/virial/force output).
template <class numtyp, class acctyp>
int PairGPUAtomT::bytes_per_atom() const {
  return atom_fields()*sizeof(numtyp)+ans_fields()*sizeof(acctyp);
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::init(const int max_atoms) {
|
||||
if (allocated)
|
||||
clear();
|
||||
|
||||
_max_atoms=max_atoms;
|
||||
|
||||
// Initialize timers for the selected GPU
|
||||
time_atom.init();
|
||||
time_answer.init();
|
||||
|
||||
// Device matrices for atom and force data
|
||||
dev_x.safe_alloc(atom_fields(),max_atoms);
|
||||
x_bind_texture<numtyp>(dev_x);
|
||||
ans.safe_alloc(ans_fields(),max_atoms);
|
||||
|
||||
// Get a host write only buffer
|
||||
host_write.safe_alloc_w(max_atoms*4);
|
||||
// Get a host read/write buffer
|
||||
host_read.safe_alloc_rw(ans.row_size()*ans_fields());
|
||||
|
||||
allocated=true;
|
||||
}
|
||||
|
||||
// Release all host and device memory and unbind the position texture.
// Safe to call when nothing is allocated.
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
  if (!allocated)
    return;
  allocated=false;

  // Texture must be unbound before the backing array is freed
  x_unbind_texture<numtyp>();
  ans.clear();
  host_write.clear();
  host_read.clear();
  dev_x.clear();
}
|
||||
|
||||
// Estimate of host memory used for max_atoms atoms: the write staging
// buffer, the read staging buffer, and the object itself.
template <class numtyp, class acctyp>
double PairGPUAtomT::host_memory_usage(const int max_atoms) const {
  return max_atoms*atom_fields()*sizeof(numtyp)+
         ans_fields()*(max_atoms)*sizeof(acctyp)+
         sizeof(PairGPUAtom<numtyp,acctyp>);
}
|
||||
|
||||
// Queue an asynchronous copy of the answer matrix from device to the host
// read buffer.  Only the rows actually produced are copied: 1 energy row is
// skipped when eflag is false and 6 virial rows when vflag is false.  The
// flags are latched for the later energy_virial()/add_forces() unpacking.
template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
                                cudaStream_t &s) {
  _eflag=eflag;
  _vflag=vflag;

  int csize=ans_fields();
  if (!eflag)
    csize--;          // no energy row
  if (!vflag)
    csize-=6;         // no virial rows

  host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s);
}
|
||||
|
||||
// Unpack energy and virial contributions from the host read buffer into
// LAMMPS arrays.  Layout of host_read (row-major, rows padded to
// ans.row_size()): [energy row if _eflag][6 virial rows if _vflag][forces].
// Leaves _read_loc pointing at the first force row for add_forces().
// Returns the accumulated vdW energy.
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom,
                                   const bool vflag_atom, double *eatom,
                                   double **vatom, double *virial) {
  double evdwl=0.0;
  // Padding after the _inum valid entries of the energy row
  int gap=ans.row_size()-_inum;

  acctyp *ap=host_read.begin();
  if (_eflag) {
    if (eflag_atom) {
      for (int i=0; i<_inum; i++) {
        evdwl+=*ap;
        eatom[ilist[i]]+=*ap*0.5;    // half of pair energy per atom
        ap++;
      }
    } else
      for (int i=0; i<_inum; i++) {
        evdwl+=*ap;
        ap++;
      }
    ap+=gap;                         // skip row padding
    evdwl*=0.5;                      // device sums count each pair twice
  }
  _read_loc=ap;                      // start of virial (or force) rows
  gap=ans.row_size();                // full row stride from here on
  if (_vflag) {
    if (vflag_atom) {
      for (int ii=0; ii<_inum; ii++) {
        int i=ilist[ii];
        ap=_read_loc+ii;             // column ii, walk down the 6 rows
        for (int j=0; j<6; j++) {
          vatom[i][j]+=*ap*0.5;
          virial[j]+=*ap;
          ap+=gap;
        }
      }
    } else {
      for (int ii=0; ii<_inum; ii++) {
        ap=_read_loc+ii;
        for (int j=0; j<6; j++) {
          virial[j]+=*ap;
          ap+=gap;
        }
      }
    }
    for (int j=0; j<6; j++)
      virial[j]*=0.5;                // same double-count correction
    _read_loc+=gap*6;                // forces follow the 6 virial rows
  }

  return evdwl;
}
|
||||
|
||||
// Accumulate forces from the host read buffer into the LAMMPS force array.
// Assumes _read_loc points at the first of 3 force rows (x, y, z), as left
// by energy_virial() after copy_answers().
template <class numtyp, class acctyp>
void PairGPUAtomT::add_forces(const int *ilist, double **f) {
  int gap=ans.row_size();            // stride between component rows
  for (int ii=0; ii<_inum; ii++) {
    acctyp *ap=_read_loc+ii;         // column ii of the force block
    int i=ilist[ii];
    f[i][0]+=*ap;
    ap+=gap;
    f[i][1]+=*ap;
    ap+=gap;
    f[i][2]+=*ap;
  }
}
|
||||
|
||||
// Accumulate torques from the host read buffer into a LAMMPS array.
// Advances _read_loc past the 3 force rows first, so this must be called
// after add_forces()' data has been laid out (i.e. torques follow forces).
// n is the number of entries to unpack (may differ from _inum).
template <class numtyp, class acctyp>
void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) {
  int gap=ans.row_size();
  _read_loc+=gap*3;                  // skip the 3 force rows
  for (int ii=0; ii<n; ii++) {
    acctyp *ap=_read_loc+ii;
    int i=ilist[ii];
    tor[i][0]+=*ap;
    ap+=gap;
    tor[i][1]+=*ap;
    ap+=gap;
    tor[i][2]+=*ap;
  }
}
|
||||
|
||||
template class PairGPUAtom<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,151 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_atom.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Memory routines for moving atom and force data between host and gpu
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_GPU_ATOM_H
|
||||
#define PAIR_GPU_ATOM_H
|
||||
|
||||
// PRECISION - Precision for rsq, energy, force, and torque calculation
|
||||
// ACC_PRECISION - Precision for accumulation of energies, forces, and torques
|
||||
// Single precision compute with double precision accumulation
#ifdef _SINGLE_DOUBLE
#define PRECISION float
#define ACC_PRECISION double
#endif

// Full double precision
#ifdef _DOUBLE_DOUBLE
#define PRECISION double
#define ACC_PRECISION double
#endif

// Default: single precision compute, double precision accumulation
#ifndef PRECISION
#define PRECISION float
#define ACC_PRECISION double
#endif

// Hard limit on atoms handled per allocation
#define MAX_ATOMS 65536
|
||||
#include "nvc_timer.h"
|
||||
#include "pair_gpu_texture.h"
|
||||
|
||||
/// Manages atom input data and per-atom answers (energy/virial/force/torque)
/// moving between LAMMPS host memory and the GPU.
/** numtyp is the compute precision, acctyp the accumulation precision. **/
template <class numtyp, class acctyp>
class PairGPUAtom {
 public:
  // Defaults: 4 atom fields (x,y,z,type), 10 answer fields
  // (1 energy + 6 virial + 3 force)
  PairGPUAtom() : _atom_fields(4), _ans_fields(10), allocated(false) {}
  ~PairGPUAtom() { clear(); }

  // Accessors
  inline int atom_fields() const { return _atom_fields; }
  inline int ans_fields() const { return _ans_fields; }
  inline int max_atoms() const { return _max_atoms; }
  inline int nall() const { return _nall; }
  inline int inum() const { return _inum; }

  /// Set number of atoms for future copy operations
  inline void nall(const int n) { _nall=n; }
  /// Set number of inum for future copy operations
  inline void inum(const int n) { _inum=n; }
  /// Set the number of atom fields (x, y, z, type, etc)
  inline void atom_fields(const int n) { _atom_fields=n; }
  /// Set the number of answer fields (energy, virial, force, etc.)
  inline void ans_fields(const int n) { _ans_fields=n; }

  /// Memory usage per atom in this class
  /** \note atom_fields and ans_fields should be set for correct answer **/
  int bytes_per_atom() const;

  /// Must be called once to allocate host and device memory
  /** \note atom_fields and ans_fields should be set first if not default **/
  void init(const int max_atoms);

  /// Free all memory on host and device
  void clear();

  /// Return the total amount of host memory used by class
  double host_memory_usage(const int max_atoms) const;


  // -------------------------COPY TO GPU ----------------------------------

  /// Reset the write buffer pointer (Start copying new atom data)
  inline void reset_write_buffer() { _write_loc=host_write.begin(); }

  /// Add a row to write buffer with unit stride
  /** Copies nall() elements **/
  template<class cpytyp>
  inline void add_atom_data(const cpytyp *host_ptr)
    { for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i]; _write_loc++; } }

  /// Add a row to write buffer with non-unit stride
  /** Copies nall() elements **/
  template<class cpytyp>
  inline void add_atom_data(const cpytyp *hostptr, const int stride) {
    int t=_nall*stride;
    for (int i=0; i<t; i+=stride) { *_write_loc=hostptr[i]; _write_loc++; }
  }

  /// Copy num_rows x nall() write buffer to x in GPU
  /** num_rows<=atom_fields() **/
  inline void copy_atom_data(const int num_rows, cudaStream_t &stream)
    { dev_x.copy_2Dfrom_host(host_write.begin(),num_rows,nall(),stream); }


  // -------------------------COPY FROM GPU -------------------------------

  /// Copy answers from GPU into read buffer
  void copy_answers(const bool eflag, const bool vflag, cudaStream_t &s);

  /// Copy energy and virial data into LAMMPS memory
  double energy_virial(const int *ilist, const bool eflag_atom,
                       const bool vflag_atom, double *eatom, double **vatom,
                       double *virial);

  /// Add forces from the GPU into a LAMMPS pointer
  void add_forces(const int *ilist, double **f);

  /// Add torques from the GPU into a LAMMPS pointer
  void add_torques(const int *ilist, double **tor, const int n);

  // ------------------------------ DATA ----------------------------------

  // atom_fields() x n (example: rows 1-3 position, 4 is type)
  NVC_ConstMatT dev_x;
  // ans_fields() x n Storage for Forces, etc.
  // example: if (eflag and vflag) 1 is energy, 2-7 is virial, 8-10 is force
  // example: if (!eflag) 1-3 is force
  NVC_Mat<acctyp> ans;

  // Buffer for moving floating point data to GPU
  NVC_HostT host_write;
  // Buffer for moving floating point data to CPU
  NVC_Host<acctyp> host_read;

  // Timing Stuff
  NVCTimer time_atom, time_answer;

 private:
  bool allocated, _eflag, _vflag;  // allocation state and latched flags
  int _atom_fields, _ans_fields;   // row counts for input/answer matrices
  int _max_atoms, _nall, _inum;    // capacity and current atom counts
  numtyp * _write_loc;             // cursor into host_write
  acctyp * _read_loc;              // cursor into host_read during unpacking
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,80 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_nbor.cu
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Neighbor memory operations for LAMMPS GPU Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "pair_gpu_nbor.h"
|
||||
|
||||
// Device bytes needed per atom for neighbor storage.  Packing mode reserves
// a second copy of the neighbor list (for coalesced re-packing) plus the
// 4-row per-atom header; otherwise one list plus the 3-row header.
int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
  if (_use_packing)
    return (max_nbors*2+4)*sizeof(int);
  else
    return (max_nbors+3)*sizeof(int);
}
|
||||
|
||||
// Allocate device neighbor storage and the host staging buffer.
// ij_size is the host staging capacity; max_nbors the per-atom neighbor cap.
// Re-entrant: frees any previous allocation first.
void PairGPUNbor::init(const int ij_size, const int max_atoms,
                       const int max_nbors) {
  if (allocated)
    clear();

  // Initialize timers for the selected GPU
  time_nbor.init();

  // Packing mode needs room for the re-packed neighbor lists in dev_nbor;
  // otherwise dev_nbor only holds the 3-row per-atom header (see header)
  if (_use_packing)
    dev_nbor.safe_alloc(max_nbors+4,max_atoms);
  else
    dev_nbor.safe_alloc(3,max_atoms);

  ij.safe_alloc(max_nbors*max_atoms);
  host_ij.safe_alloc_w(ij_size);

  allocated=true;
}
|
||||
|
||||
// Release all host and device neighbor memory.  Safe when unallocated.
void PairGPUNbor::clear() {
  if (!allocated)
    return;
  allocated=false;

  ij.clear();
  host_ij.clear();
  dev_nbor.clear();
}
|
||||
|
||||
// Host memory used by this class.
// NOTE(review): reports the compile-time IJ_SIZE constant, while init() sizes
// host_ij from its ij_size argument — these agree only when callers pass
// IJ_SIZE; confirm against call sites.
double PairGPUNbor::host_memory_usage() const {
  return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor);
}
|
||||
|
||||
// Reset neighbor data for a (re)build: upload the atom indices, then the
// per-atom neighbor counts and their starting offsets into the ij list.
// ij_total is reset so subsequent add() calls append from the start.
void PairGPUNbor::reset(const int inum, int *ilist, const int *numj,
                        cudaStream_t &s) {
  ij_total=0;

  // Row 0 of dev_nbor: the i indices for this build (blocking copy)
  dev_nbor.copy_from_host(ilist,inum);
  int acc=0;
  for (int i=0; i<inum; i++) {
    host_ij[i]=numj[ilist[i]];       // row 1: number of neighbors of i
    host_ij[i+inum]=acc;             // row 2: start offset of i's neighbors
    acc+=numj[ilist[i]];
  }

  // Copy the 2 staged rows into dev_nbor starting at its second row
  host_ij.copy_to_2Ddevice(dev_nbor.begin()+dev_nbor.row_size(),
                           dev_nbor.row_size(),2,inum, s);
}
|
|
@ -0,0 +1,91 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_nbor.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Neighbor memory operations for LAMMPS GPU Library
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Aug 4 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_GPU_NBOR_H
|
||||
#define PAIR_GPU_NBOR_H
|
||||
|
||||
#include "nvc_macros.h"
|
||||
#include "nvc_timer.h"
|
||||
#include "nvc_memory.h"
|
||||
|
||||
// Threads per block for 1D neighbor kernels
#define BLOCK_1D 64
// Capacity (ints) of the host ij staging buffer
#define IJ_SIZE 131072
|
||||
|
||||
/// Manages neighbor-list data moving between LAMMPS host memory and the GPU.
class PairGPUNbor {
 public:
  PairGPUNbor() : _use_packing(false), allocated(false) {}
  ~PairGPUNbor() { clear(); }

  /// Determine whether neighbor packing should be used
  /** If true, twice as much memory is reserved to allow packing neighbors by
    * atom for coalesced access after cutoff evaluation. This can be used
    * for expensive potentials where it is more efficient to evaluate the
    * cutoff separately from the potential in order to reduce thread divergence
    * for expensive routines **/
  void packing(const bool use_packing) { _use_packing=use_packing; }

  /// Called once to allocate memory
  void init(const int ij_size, const int max_atoms, const int max_nbors);

  /// Free all memory on host and device
  void clear();

  /// Bytes per atom used on device
  int bytes_per_atom(const int max_nbors) const;
  /// Total host memory used by class
  double host_memory_usage() const;

  /// Reset neighbor data (first time or from a rebuild)
  void reset(const int inum, int *ilist, const int *numj, cudaStream_t &s);
  /// Add neighbor data from host
  /** Appends num_ij entries from host_ij to the device ij list. **/
  inline void add(const int num_ij, cudaStream_t &s)
    { host_ij.copy_to_device(ij.begin()+ij_total,num_ij,s); ij_total+=num_ij; }

  /// Pack neighbors satisfying cutoff by atom for coalesced access
  void pack_nbors(const int GX, const int BX, const int start,
                  const int inum, const int form_low, const int form_high);


  // ------------------------------- Data -------------------------------

  // Store IJ interactions on device
  NVC_VecI ij;
  // Buffer for moving ij data to GPU
  NVC_HostI host_ij;

  // --------------- Atom neighbors
  // 3 x n
  // - 1st row is i
  // - 2nd row is numj (number of neighbors)
  // - 3rd row is starting address in host_ij of neighbors
  NVC_MatI dev_nbor;

  // --------------- Timing Stuff
  NVCTimer time_nbor;

  // Number of ij entries currently stored on device (advanced by add())
  int ij_total;
 private:
  bool allocated, _use_packing;
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,292 @@
|
|||
/***************************************************************************
|
||||
pair_gpu_texture.h
|
||||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Tricks for templating textures
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS GPU Library
|
||||
__________________________________________________________________________
|
||||
|
||||
begin : Tue Jun 23 2009
|
||||
copyright : (C) 2009 by W. Michael Brown
|
||||
email : wmbrown@sandia.gov
|
||||
***************************************************************************/
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
#include "nvc_memory.h"
|
||||
|
||||
#ifndef PAIR_GPU_TEXTURE_H
|
||||
#define PAIR_GPU_TEXTURE_H
|
||||
|
||||
// Maps a scalar type to its 2-component CUDA vector type (float->float2,
// double->double2) so templated device code can name the paired type.
template <class numtyp> class cu_vec_traits;
template <> class cu_vec_traits<float> { public: typedef float2 vec2; };
template <> class cu_vec_traits<double> { public: typedef double2 vec2; };
|
||||
|
||||
// ------------------------------- x ------------------------------------
// Per-quantity pattern used throughout this header: a float texture plus an
// int2/int4 texture for doubles (doubles are stored as integer pairs and
// reassembled with __hiloint2double, since texture fetches of double are not
// supported), bind/unbind helpers templated on the scalar type, and a
// device-side fetch function.  Double fetch specializations are only
// compiled under GB_GPU_DOUBLE.

static texture<float, 2, cudaReadModeElementType> x_float_tex;
static texture<int2, 2, cudaReadModeElementType> x_double_tex;
template <class numtyp> inline void x_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(x_float_tex); }

template <> inline void x_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(x_double_tex); }
template <class numtyp> inline void x_unbind_texture()
  { cudaUnbindTexture(x_float_tex); }
template <> inline void x_unbind_texture<double>()
  { cudaUnbindTexture(x_double_tex); }
// Fetch atom datum at (i,j).  NOTE(review): argument order is (i,j) here but
// (j,i) in the per-type tables below — presumably x is stored transposed
// relative to them; confirm against the kernels.
template <class numtyp>
static __inline__ __device__ numtyp _x_(const int i, const int j) {
  return tex2D(x_float_tex,i,j);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _x_<double>(const int i,const int j) {
  int2 t=tex2D(x_double_tex,i,j);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- form ------------------------------------
// Interaction form table (int); no double variant needed.

static texture<int, 2, cudaReadModeElementType> form_tex;
inline void form_bind_texture(NVC_ConstMatI &mat)
  { mat.bind_texture(form_tex); }
inline void form_unbind_texture()
  { cudaUnbindTexture(form_tex); }
static __inline__ __device__ int _form_(const int i, const int j) {
  return tex2D(form_tex,i,j);
}

// ------------------------------- lshape ------------------------------------
// Per-type 1D table; 1D textures need an explicit channel descriptor.

static texture<float, 1, cudaReadModeElementType> lshape_float_tex;
static texture<int2, 1, cudaReadModeElementType> lshape_double_tex;
static cudaChannelFormatDesc channel_lshape;
template <class numtyp> inline void lshape_bind_texture(NVC_VecT &vec)
  { vec.bind_texture(lshape_float_tex,channel_lshape); }
template <> inline void lshape_bind_texture<double>(NVC_VecD &vec)
  { vec.bind_texture(lshape_double_tex,channel_lshape); }
template <class numtyp> inline void lshape_unbind_texture()
  { cudaUnbindTexture(lshape_float_tex); }
template <> inline void lshape_unbind_texture<double>()
  { cudaUnbindTexture(lshape_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _lshape_(const int i)
  { return tex1Dfetch(lshape_float_tex,i); }
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _lshape_<double>(const int i) {
  int2 t=tex1Dfetch(lshape_double_tex,i);
  return __hiloint2double(t.y, t.x);
}
#endif
|
||||
|
||||
// ------------------------------- shape ------------------------------------
// 2D per-type-pair tables (shape, well, sigma, epsilon, cutsq) all follow the
// same float/int2 texture pattern described at the x section above.  Fetches
// use tex2D(...,j,i) — column/row swapped relative to _x_.

static texture<float, 2, cudaReadModeElementType> shape_float_tex;
static texture<int2, 2, cudaReadModeElementType> shape_double_tex;
template <class numtyp> inline void shape_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(shape_float_tex); }
template <> inline void shape_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(shape_double_tex); }
template <class numtyp> inline void shape_unbind_texture()
  { cudaUnbindTexture(shape_float_tex); }
template <> inline void shape_unbind_texture<double>()
  { cudaUnbindTexture(shape_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _shape_(const int i, const int j) {
  return tex2D(shape_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _shape_<double>(const int i, const int j) {
  int2 t=tex2D(shape_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- well ------------------------------------

static texture<float, 2, cudaReadModeElementType> well_float_tex;
static texture<int2, 2, cudaReadModeElementType> well_double_tex;
template <class numtyp> inline void well_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(well_float_tex); }
template <> inline void well_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(well_double_tex); }
template <class numtyp> inline void well_unbind_texture()
  { cudaUnbindTexture(well_float_tex); }
template <> inline void well_unbind_texture<double>()
  { cudaUnbindTexture(well_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _well_(const int i, const int j)
  { return tex2D(well_float_tex,j,i); }
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _well_<double>(const int i,const int j) {
  int2 t=tex2D(well_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- sigma ------------------------------------

static texture<float, 2, cudaReadModeElementType> sigma_float_tex;
static texture<int2, 2, cudaReadModeElementType> sigma_double_tex;
template <class numtyp> inline void sigma_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(sigma_float_tex); }
template <> inline void sigma_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(sigma_double_tex); }
template <class numtyp> inline void sigma_unbind_texture()
  { cudaUnbindTexture(sigma_float_tex); }
template <> inline void sigma_unbind_texture<double>()
  { cudaUnbindTexture(sigma_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _sigma_(const int i, const int j) {
  return tex2D(sigma_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _sigma_<double>(const int i,const int j) {
  int2 t=tex2D(sigma_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- epsilon ------------------------------------

static texture<float, 2, cudaReadModeElementType> epsilon_float_tex;
static texture<int2, 2, cudaReadModeElementType> epsilon_double_tex;
template <class numtyp> inline void epsilon_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(epsilon_float_tex); }
template <> inline void epsilon_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(epsilon_double_tex); }
template <class numtyp> inline void epsilon_unbind_texture()
  { cudaUnbindTexture(epsilon_float_tex); }
template <> inline void epsilon_unbind_texture<double>()
  { cudaUnbindTexture(epsilon_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _epsilon_(const int i, const int j) {
  return tex2D(epsilon_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _epsilon_<double>(const int i,const int j) {
  int2 t=tex2D(epsilon_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif

// ------------------------------- cutsq ------------------------------------

static texture<float, 2, cudaReadModeElementType> cutsq_float_tex;
static texture<int2, 2, cudaReadModeElementType> cutsq_double_tex;
template <class numtyp> inline void cutsq_bind_texture(NVC_ConstMatT &mat)
  { mat.bind_texture(cutsq_float_tex); }
template <> inline void cutsq_bind_texture<double>(NVC_ConstMatD &mat)
  { mat.bind_texture(cutsq_double_tex); }
template <class numtyp> inline void cutsq_unbind_texture()
  { cudaUnbindTexture(cutsq_float_tex); }
template <> inline void cutsq_unbind_texture<double>()
  { cudaUnbindTexture(cutsq_double_tex); }
template <class numtyp>
static __inline__ __device__ numtyp _cutsq_(const int i, const int j) {
  return tex2D(cutsq_float_tex,j,i);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _cutsq_<double>(const int i,const int j) {
  int2 t=tex2D(cutsq_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif
|
||||
|
||||
// ------------------------------- lj1 ------------------------------------
|
||||
|
||||
static texture<float2, 2, cudaReadModeElementType> lj1_float_tex;
|
||||
static texture<int4, 2, cudaReadModeElementType> lj1_double_tex;
|
||||
template <class numtyp> inline void lj1_bind_texture(NVC_ConstMatT &mat)
|
||||
{ mat.bind_texture(lj1_float_tex); }
|
||||
template <> inline void lj1_bind_texture<double2>(NVC_ConstMatD2 &mat)
|
||||
{ mat.bind_texture(lj1_double_tex); }
|
||||
template <class numtyp> inline void lj1_unbind_texture()
|
||||
{ cudaUnbindTexture(lj1_float_tex); }
|
||||
template <> inline void lj1_unbind_texture<double2>()
|
||||
{ cudaUnbindTexture(lj1_double_tex); }
|
||||
template <class numtyp>
|
||||
static __inline__ __device__
|
||||
typename cu_vec_traits<numtyp>::vec2 _lj1_(const int i, const int j) {
|
||||
return tex2D(lj1_float_tex,j,i);
|
||||
}
|
||||
#ifdef GB_GPU_DOUBLE
|
||||
template <>
|
||||
static __inline__ __device__ double2 _lj1_<double>(const int i,const int j) {
|
||||
int4 t=tex2D(lj1_double_tex,j,i);
|
||||
double2 ans;
|
||||
ans.x=__hiloint2double(t.y, t.x);
|
||||
ans.y=__hiloint2double(t.w, t.z);
|
||||
return ans;
|
||||
}
|
||||
#endif
|
||||
|
||||
// ------------------------------- lj3 ------------------------------------

// Textures holding the per-type-pair lj3/lj4 coefficient vectors; same
// float2 / int4-encoded-double2 scheme as the lj1 section above.
static texture<float2, 2, cudaReadModeElementType> lj3_float_tex;
static texture<int4, 2, cudaReadModeElementType> lj3_double_tex;

// Bind the constant coefficient matrix to the texture for the active
// precision.
template <class numtyp> inline void lj3_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(lj3_float_tex); }
template <> inline void lj3_bind_texture<double2>(NVC_ConstMatD2 &mat)
{ mat.bind_texture(lj3_double_tex); }

// Release the texture binding for the active precision.
template <class numtyp> inline void lj3_unbind_texture()
{ cudaUnbindTexture(lj3_float_tex); }
template <> inline void lj3_unbind_texture<double2>()
{ cudaUnbindTexture(lj3_double_tex); }

// Fetch the lj3[i][j] 2-vector from texture memory; tex2D is addressed
// (x=j, y=i).
template <class numtyp>
static __inline__ __device__
typename cu_vec_traits<numtyp>::vec2 _lj3_(const int i, const int j) {
  return tex2D(lj3_float_tex,j,i);
}

#ifdef GB_GPU_DOUBLE
// Double-precision specialization: reassemble both doubles from the
// hi/lo 32-bit words of the fetched int4.
template <>
static __inline__ __device__ double2 _lj3_<double>(const int i,const int j) {
  int4 t=tex2D(lj3_double_tex,j,i);
  double2 ans;
  ans.x=__hiloint2double(t.y, t.x);
  ans.y=__hiloint2double(t.w, t.z);
  return ans;
}
#endif
|
||||
|
||||
// ------------------------------- offset ------------------------------------

// Textures holding the scalar per-type-pair energy offset; doubles are
// stored as int2 hi/lo word pairs.
static texture<float, 2, cudaReadModeElementType> offset_float_tex;
static texture<int2, 2, cudaReadModeElementType> offset_double_tex;

// Bind the constant offset matrix to the texture for the active
// precision.
template <class numtyp> inline void offset_bind_texture(NVC_ConstMatT &mat)
{ mat.bind_texture(offset_float_tex); }
template <> inline void offset_bind_texture<double>(NVC_ConstMatD &mat)
{ mat.bind_texture(offset_double_tex); }

// Release the texture binding for the active precision.
template <class numtyp> inline void offset_unbind_texture()
{ cudaUnbindTexture(offset_float_tex); }
template <> inline void offset_unbind_texture<double>()
{ cudaUnbindTexture(offset_double_tex); }

// Fetch offset[i][j] from texture memory; tex2D is addressed (x=j, y=i).
template <class numtyp>
static __inline__ __device__ numtyp _offset_(const int i, const int j) {
  return tex2D(offset_float_tex,j,i);
}

#ifdef GB_GPU_DOUBLE
// Double-precision specialization: rebuild the double from the hi/lo
// 32-bit words of the fetched int2.
template <>
static __inline__ __device__ double _offset_<double>(const int i,const int j) {
  int2 t=tex2D(offset_double_tex,j,i);
  return __hiloint2double(t.y, t.x);
}
#endif
|
||||
|
||||
#endif
|
|
@ -0,0 +1,34 @@
|
|||
# Install/unInstall package classes in LAMMPS
# edit Makefile.package to include/exclude GPU library
# Invoked by the LAMMPS package machinery with $1 = 1 (install the GPU
# package sources into src/) or $1 = 0 (remove them again).

if ($1 == 1) then

# sed -i 's/\S*gpu //' ../Makefile.package
# sed -i 's|^PKGPATH =\s*|&-L../../lib/gpu |' ../Makefile.package
# sed -i 's|^PKGLIB =\s*|&-lpair_gpu_lib |' ../Makefile.package

  # the LJ-Cut GPU pair style is always installed
  cp -f pair_lj_cut_gpu.h ../
  cp -f pair_lj_cut_gpu.cpp ../

  # the Gay-Berne GPU style requires the CPU gayberne style to be
  # present; otherwise strip its entries from the style header
  if (-e ../pair_gayberne.cpp) then
    cp -f style_gpu.h ../
    cp -f pair_gayberne_gpu.h ../
    cp -f pair_gayberne_gpu.cpp ../
  else
    grep -v erne style_gpu.h > ../style_gpu.h
  endif

else if ($1 == 0) then

# sed -i 's/\S*gpu //' ../Makefile.package

  # leave an empty style header behind and remove installed sources
  rm -f ../style_gpu.h
  touch ../style_gpu.h

  rm -f ../pair_lj_cut_gpu.h
  rm -f ../pair_lj_cut_gpu.cpp

  rm -f ../pair_gayberne_gpu.h
  rm -f ../pair_gayberne_gpu.cpp

endif
|
|
@ -0,0 +1,369 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Mike Brown (SNL)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "math.h"
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "string.h"
|
||||
#include "pair_gayberne_gpu.h"
|
||||
#include "math_extra.h"
|
||||
#include "atom.h"
|
||||
#include "atom_vec.h"
|
||||
#include "comm.h"
|
||||
#include "force.h"
|
||||
#include "neighbor.h"
|
||||
#include "neigh_list.h"
|
||||
#include "integrate.h"
|
||||
#include "memory.h"
|
||||
#include "error.h"
|
||||
#include "neigh_request.h"
|
||||
#include "universe.h"
|
||||
|
||||
#ifdef GB_GPU_OMP
|
||||
#include "omp.h"
|
||||
#endif
|
||||
|
||||
#define MIN(a,b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// External functions from cuda library for atom decomposition
|
||||
int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
|
||||
const double upsilon, const double mu, double **shape,
|
||||
double **well, double **cutsq, double **sigma,
|
||||
double **epsilon, double *host_lshape, int **form,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **offset, double *special_lj,
|
||||
const int max_nbors, const int thread, const int gpu_id);
|
||||
void gb_gpu_clear(const int thread);
|
||||
int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
|
||||
int *ilist, const int *numj, const int *type,
|
||||
const int thread, bool &success);
|
||||
void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread);
|
||||
void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type,
|
||||
const bool rebuild, const int thread);
|
||||
void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
|
||||
const int thread);
|
||||
double gb_gpu_forces(double **f, double **tor, const int *ilist,
|
||||
const bool eflag, const bool vflag, const bool eflag_atom,
|
||||
const bool vflag_atom, double *eatom, double **vatom,
|
||||
double *virial, const int thread);
|
||||
std::string gb_gpu_name(const int i, const int max_nbors);
|
||||
void gb_gpu_time(const int thread);
|
||||
int gb_gpu_num_devices();
|
||||
double gb_gpu_bytes();
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Construct the GPU Gay-Berne style; GPU resources themselves are
// acquired later in init_style().
PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), my_thread(0),
                                                omp_chunk(0), nthreads(1),
                                                multi_gpu_mode(ONE_NODE),
                                                multi_gpu_param(0)
{
  // Fix: initialize every per-thread staging-buffer pointer, not just
  // slot 0.  init_style() and the destructor test ij_new[my_thread]
  // against NULL for thread indices up to MAX_GPU_THREADS, so leaving
  // slots 1..MAX_GPU_THREADS-1 uninitialized was undefined behavior in
  // the OpenMP (GB_GPU_OMP) build.
  for (int i = 0; i < MAX_GPU_THREADS; i++)
    ij_new[i] = NULL;
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
free all arrays
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Tear down: print accumulated GPU timing (rank 0 only), release the
// device state for each thread, and free the host staging buffers.
PairGayBerneGPU::~PairGayBerneGPU()
{
  if (comm->me == 0 && screen) {
    printf("\n\n-------------------------------------");
    printf("--------------------------------\n");
    printf(" GPU Time Stamps: ");
    printf("\n-------------------------------------");
    printf("--------------------------------\n");
    gb_gpu_time(my_thread);
    std::cout << "Procs: " << universe->nprocs << std::endl;
    printf("-------------------------------------");
    printf("--------------------------------\n\n");
  }
  #pragma omp parallel
  {
    #ifdef GB_GPU_OMP
    // shadow the member: each OpenMP thread tears down its own device
    int my_thread=omp_get_thread_num();
    #endif
    gb_gpu_clear(my_thread);
    if (ij_new[my_thread]!=NULL) {
      // Fix: the original set the pointer to NULL *before* delete[],
      // which deleted NULL and leaked the buffer.  Free first, then
      // clear the pointer.
      delete [] ij_new[my_thread];
      ij_new[my_thread]=NULL;
    }
  }
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Compute Gay-Berne forces/torques for this step on the GPU(s).  With
// GB_GPU_OMP each OpenMP thread drives its own device on a contiguous
// chunk of the neighbor list; otherwise one host thread drives one GPU.
void PairGayBerneGPU::compute(int eflag, int vflag)
{
  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = vflag_fdotr = 0;
  // Per-atom virial accumulation is not implemented in the GPU kernels.
  if (vflag_atom)
    error->all("Per-atom virial not available with GPU Gay-Berne.");

  int nlocal = atom->nlocal;
  int nall = nlocal + atom->nghost;
  int inum = list->inum;

  // Neighbor data is repacked and re-uploaded only when the neighbor
  // lists were rebuilt since the previous call.
  bool rebuild=false;
  if (neighbor->ncalls > last_neighbor) {
    last_neighbor=neighbor->ncalls;
    rebuild=true;
  }

  #pragma omp parallel
  {

  bool success=true;
  #ifdef GB_GPU_OMP
  // shadow the member: each thread works on its own chunk/device
  int my_thread=omp_get_thread_num();
  if (rebuild) {
    omp_chunk=static_cast<int>(ceil(static_cast<double>(inum)/nthreads));
    // last thread takes the (possibly smaller) remainder chunk
    if (my_thread==nthreads-1)
      thread_inum[my_thread]=inum-(nthreads-1)*omp_chunk;
    else
      thread_inum[my_thread]=omp_chunk;
    olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal,
                                        thread_inum[my_thread],
                                        list->ilist+omp_chunk*my_thread,
                                        list->numneigh, atom->type, my_thread,
                                        success);
  }
  #else
  if (rebuild)
    olist[my_thread]=gb_gpu_reset_nbors(nall, atom->nlocal, inum, list->ilist,
                                        list->numneigh, atom->type, my_thread,
                                        success);
  #endif
  if (!success)
    error->one("Total # of atoms exceeds maximum allowed per GPGPU.\n");

  // copy atom data to GPU
  gb_gpu_atom(atom->x,atom->quat,atom->type,rebuild,my_thread);

  int i,j,ii,jj,jnum;
  double factor_lj;  // NOTE(review): unused here; special_lj is applied on the GPU
  int *jlist;

  if (rebuild==true) {
    int num_ij = 0;

    // loop over neighbors of my atoms, staging neighbor indices in the
    // host buffer and flushing each full batch to the device
    int *ijp=ij_new[my_thread];
    #ifdef GB_GPU_OMP
    int mgo=my_thread*omp_chunk;
    int mgot=mgo+thread_inum[my_thread];
    #else
    int mgo=0, mgot=inum;
    #endif
    for (ii = mgo; ii<mgot; ii++) {
      i = olist[my_thread][ii];
      jlist = list->firstneigh[i];
      jnum = list->numneigh[i];

      for (jj = 0; jj < jnum; jj++) {
        j = jlist[jj];

        *ijp=j;
        ijp++;
        num_ij++;

        // staging buffer full -- ship this batch to the device
        if (num_ij==ij_size) {
          memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int));
          gb_gpu_nbors(num_ij,eflag,my_thread);
          ijp=ij_new[my_thread];
          num_ij=0;
        }
      }
    }
    // flush any remaining partial batch
    if (num_ij>0) {
      memcpy(ij[my_thread],ij_new[my_thread],num_ij*sizeof(int));
      gb_gpu_nbors(num_ij,eflag,my_thread);
    }
  }

  // launch the Gay-Berne kernel, then copy forces/torques/energy back
  gb_gpu_gayberne(eflag,vflag,rebuild,my_thread);
  double lvirial[6];
  for (int i=0; i<6; i++) lvirial[i]=0.0;
  double my_eng=gb_gpu_forces(atom->f,atom->torque,olist[my_thread],eflag,vflag,
                              eflag_atom, vflag_atom, eatom, vatom, lvirial,
                              my_thread);
  // serialize accumulation of per-thread energy/virial into the globals
  #pragma omp critical
  {
    eng_vdwl+=my_eng;
    virial[0]+=lvirial[0];
    virial[1]+=lvirial[1];
    virial[2]+=lvirial[2];
    virial[3]+=lvirial[3];
    virial[4]+=lvirial[4];
    virial[5]+=lvirial[5];
  }

  } //End parallel

  if (vflag_fdotr) virial_compute();
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
global settings
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairGayBerneGPU::settings(int narg, char **arg)
|
||||
{
|
||||
if (narg != 4 && narg != 6) error->all("Illegal pair_style command");
|
||||
|
||||
// Set multi_gpu_mode to one_node for multiple gpus on 1 node
|
||||
// -- param is starting gpu id
|
||||
// Set multi_gpu_mode to one_gpu to select the same gpu id on every node
|
||||
// -- param is id of gpu
|
||||
// Set multi_gpu_mode to multi_gpu to get ma
|
||||
// -- param is number of gpus per node
|
||||
multi_gpu_mode=ONE_NODE;
|
||||
multi_gpu_param=0;
|
||||
if (narg==6) {
|
||||
multi_gpu_param=atoi(arg[5]);
|
||||
if (strcmp("one_node",arg[4])==0)
|
||||
multi_gpu_mode=ONE_NODE;
|
||||
else if (strcmp("one_gpu",arg[4])==0)
|
||||
multi_gpu_mode=ONE_GPU;
|
||||
else if (strcmp("multi_gpu",arg[4])==0) {
|
||||
multi_gpu_mode=MULTI_GPU;
|
||||
if (multi_gpu_param<1)
|
||||
error->all("Illegal pair_style command");
|
||||
} else
|
||||
error->all("Illegal pair_style command");
|
||||
narg-=2;
|
||||
}
|
||||
|
||||
PairGayBerne::settings(narg,arg);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
init specific to this pair style
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Per-run initialization: pick a GPU per rank/thread, validate the atom
// style, precompute per-type shape factors, initialize the CUDA library
// on each device, and request a full neighbor list.
void PairGayBerneGPU::init_style()
{
  // Set the GPU ID for this MPI rank according to the selected mode.
  int my_gpu=comm->me+multi_gpu_param;
  int ngpus=universe->nprocs;
  if (multi_gpu_mode==ONE_GPU) {
    my_gpu=multi_gpu_param;
    ngpus=1;
  } else if (multi_gpu_mode==MULTI_GPU) {
    ngpus=multi_gpu_param;
    my_gpu=comm->me%ngpus;
    if (ngpus>universe->nprocs)
      ngpus=universe->nprocs;
  }

  if (!atom->quat_flag || !atom->torque_flag || !atom->avec->shape_type)
    error->all("Pair gayberne requires atom attributes quat, torque, shape");
  if (atom->radius_flag)
    error->all("Pair gayberne cannot be used with atom attribute diameter");

  int irequest = neighbor->request(this);

  // per-type shape precalculations (squared semi-axes and lshape factor)
  for (int i = 1; i <= atom->ntypes; i++) {
    if (setwell[i]) {
      double *one = atom->shape[i];
      shape[i][0] = one[0]*one[0];
      shape[i][1] = one[1]*one[1];
      shape[i][2] = one[2]*one[2];
      lshape[i] = (one[0]*one[1]+one[2]*one[2])*sqrt(one[0]*one[1]);
    }
  }

  // Repeat cutsq calculation because done after call to init_style
  double cut;
  for (int i = 1; i <= atom->ntypes; i++)
    for (int j = i; j <= atom->ntypes; j++) {
      cut = init_one(i,j);
      cutsq[i][j] = cutsq[j][i] = cut*cut;
    }

  // If compiled with OpenMP and only 1 proc, try to use multiple GPUs w/threads
  #ifdef GB_GPU_OMP
  if (multi_gpu_mode!=ONE_GPU)
    nthreads=ngpus=gb_gpu_num_devices();
  else
    nthreads=ngpus=1;
  if (nthreads>MAX_GPU_THREADS)
    nthreads=MAX_GPU_THREADS;
  omp_set_num_threads(nthreads);
  #endif

  // Each thread initializes its own device and (re)allocates its
  // host-side neighbor staging buffer.
  #pragma omp parallel firstprivate(my_gpu)
  {
    #ifdef GB_GPU_OMP
    int my_thread = omp_get_thread_num();
    if (multi_gpu_mode!=ONE_GPU)
      my_gpu=my_thread;
    if (multi_gpu_mode==ONE_NODE)
      my_gpu+=multi_gpu_param;
    #endif

    ij[my_thread]=gb_gpu_init(ij_size, atom->ntypes+1, gamma, upsilon, mu,
                              shape, well, cutsq, sigma, epsilon, lshape, form,
                              lj1, lj2, lj3, lj4, offset, force->special_lj,
                              neighbor->oneatom, my_thread, my_gpu);
    if (ij[my_thread]==0)
      error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU.");

    if (ij_new[my_thread]!=NULL)
      delete [] ij_new[my_thread];
    ij_new[my_thread]=new int[ij_size];
  }

  last_neighbor = -1;
  // GPU kernels need a full (non-half) neighbor list and no newton pair
  neighbor->requests[irequest]->half = 0;
  neighbor->requests[irequest]->full = 1;
  if (force->newton_pair)
    error->all("Cannot use newton with GPU GayBerne pair style.");

  // Report the devices in use (rank 0 only).
  if (comm->me == 0 && screen) {
    printf("\n-------------------------------------");
    printf("-------------------------------------\n");
    printf("- Using GPGPU acceleration for Gay-Berne:\n");
    printf("-------------------------------------");
    printf("-------------------------------------\n");

    for (int i=0; i<ngpus; i++) {
      int gpui=my_gpu;
      if (multi_gpu_mode==ONE_NODE)
        gpui=i+multi_gpu_param;
      else if (multi_gpu_mode==MULTI_GPU)
        gpui=i;
      std::string gpu_string=gb_gpu_name(gpui,neighbor->oneatom);
      printf("GPU %d: %s\n",gpui,gpu_string.c_str());
    }
    printf("-------------------------------------");
    printf("-------------------------------------\n\n");
  }
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
double PairGayBerneGPU::memory_usage()
|
||||
{
|
||||
double bytes=Pair::memory_usage()+nthreads*ij_size*sizeof(int);
|
||||
return bytes+gb_gpu_bytes();
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_GPU_H
|
||||
#define PAIR_GPU_H
|
||||
|
||||
#include "pair_gayberne.h"
|
||||
#define MAX_GPU_THREADS 4
|
||||
|
||||
namespace LAMMPS_NS {

// GPU-accelerated Gay-Berne pair style: inherits coefficient handling
// from the CPU PairGayBerne and offloads the force/torque computation
// to one or more CUDA devices (one per OpenMP thread with GB_GPU_OMP).
class PairGayBerneGPU : public PairGayBerne {
 public:
  PairGayBerneGPU(LAMMPS *lmp);
  ~PairGayBerneGPU();
  void compute(int, int);
  void settings(int, char **);
  void init_style();
  double memory_usage();

  // GPU-to-rank assignment modes selected in settings().
  enum { ONE_NODE, ONE_GPU, MULTI_GPU };

 private:
  int ij_size;  // capacity (in ints) of each host neighbor staging buffer
  // per-thread device neighbor buffer, host staging buffer, and the
  // ordered atom list returned by the CUDA library
  int *ij[MAX_GPU_THREADS], *ij_new[MAX_GPU_THREADS], *olist[MAX_GPU_THREADS];

  // OpenMP bookkeeping: this thread's id, thread count, per-thread list
  // sizes, and the chunk size used to split the neighbor list
  int my_thread, nthreads, thread_inum[MAX_GPU_THREADS], omp_chunk;

  // neighbor-rebuild tracking and GPU-assignment configuration
  int last_neighbor, multi_gpu_mode, multi_gpu_param;
};

}
|
||||
#endif
|
|
@ -0,0 +1,270 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Mike Brown (SNL)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "math.h"
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "string.h"
|
||||
#include "pair_lj_cut_gpu.h"
|
||||
#include "math_extra.h"
|
||||
#include "atom.h"
|
||||
#include "atom_vec.h"
|
||||
#include "comm.h"
|
||||
#include "force.h"
|
||||
#include "neighbor.h"
|
||||
#include "neigh_list.h"
|
||||
#include "integrate.h"
|
||||
#include "memory.h"
|
||||
#include "error.h"
|
||||
#include "neigh_request.h"
|
||||
#include "universe.h"
|
||||
|
||||
#define MIN(a,b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// External functions from cuda library for force decomposition
|
||||
int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,
|
||||
double **sigma, double **epsilon, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||
double **offset, double *special_lj, const int max_nbors,
|
||||
const int gpu_id);
|
||||
void lj_gpu_clear();
|
||||
bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
|
||||
const int *numj);
|
||||
void lj_gpu_nbors(const int num_ij);
|
||||
void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild);
|
||||
void lj_gpu(const bool eflag, const bool vflag, const bool rebuild);
|
||||
double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
|
||||
const bool vflag, const bool eflag_atom,
|
||||
const bool vflag_atom, double *eatom, double **vatom,
|
||||
double *virial);
|
||||
std::string lj_gpu_name(const int gpu_id, const int max_nbors);
|
||||
void lj_gpu_time();
|
||||
int lj_gpu_num_devices();
|
||||
double lj_gpu_bytes();
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Construct the GPU LJ-Cut style; device resources are acquired later
// in init_style().
PairLJCutGPU::PairLJCutGPU(LAMMPS *lmp) : PairLJCut(lmp), multi_gpu_mode(0)
{
  // rRESPA inner/outer loops are not supported by the GPU kernel
  respa_enable = 0;
  // host staging buffer is allocated in init_style()
  ij_new = NULL;
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
free all arrays
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Tear down: print accumulated GPU timing (rank 0 only), release the
// device state, and free the host staging buffer.
PairLJCutGPU::~PairLJCutGPU()
{
  if (comm->me == 0 && screen) {
    printf("\n\n-------------------------------------");
    printf("--------------------------------\n");
    printf(" GPU Time Stamps: ");
    printf("\n-------------------------------------");
    printf("--------------------------------\n");
    lj_gpu_time();
    std::cout << "Procs: " << universe->nprocs << std::endl;
    printf("-------------------------------------");
    printf("--------------------------------\n\n");
  }
  lj_gpu_clear();
  if (ij_new!=NULL) {
    // Fix: the original set ij_new to NULL *before* delete[], which
    // deleted NULL and leaked the buffer.  Free first, then clear.
    delete [] ij_new;
    ij_new=NULL;
  }
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void PairLJCutGPU::compute(int eflag, int vflag)
|
||||
{
|
||||
if (eflag || vflag) ev_setup(eflag,vflag);
|
||||
else evflag = vflag_fdotr = 0;
|
||||
if (vflag_atom)
|
||||
error->all("Per-atom virial not available with GPU Gay-Berne.");
|
||||
|
||||
int nlocal = atom->nlocal;
|
||||
int nall = nlocal + atom->nghost;
|
||||
int inum = list->inum;
|
||||
int *ilist = list->ilist;
|
||||
|
||||
bool rebuild=false;
|
||||
if (neighbor->ncalls > last_neighbor) {
|
||||
last_neighbor=neighbor->ncalls;
|
||||
rebuild=true;
|
||||
}
|
||||
|
||||
// copy nbors to GPU
|
||||
if (rebuild)
|
||||
if (!lj_gpu_reset_nbors(nall, inum, ilist, list->numneigh))
|
||||
error->one("Total # of atoms exceed maximum allowed per GPGPU.\n");
|
||||
|
||||
// copy atom data to GPU
|
||||
lj_gpu_atom(atom->x,atom->type,rebuild);
|
||||
|
||||
int i,j,ii,jj,jnum;
|
||||
double factor_lj;
|
||||
int *jlist;
|
||||
|
||||
if (rebuild==true) {
|
||||
int num_ij = 0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
int *ijp=ij_new;
|
||||
for (ii = 0; ii<inum; ii++) {
|
||||
i = ilist[ii];
|
||||
jlist = list->firstneigh[i];
|
||||
jnum = list->numneigh[i];
|
||||
|
||||
for (jj = 0; jj < jnum; jj++) {
|
||||
j = jlist[jj];
|
||||
|
||||
*ijp=j;
|
||||
ijp++;
|
||||
num_ij++;
|
||||
|
||||
if (num_ij==ij_size) {
|
||||
memcpy(ij,ij_new,num_ij*sizeof(int));
|
||||
lj_gpu_nbors(num_ij);
|
||||
ijp=ij_new;
|
||||
num_ij=0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (num_ij>0) {
|
||||
memcpy(ij,ij_new,num_ij*sizeof(int));
|
||||
lj_gpu_nbors(num_ij);
|
||||
}
|
||||
}
|
||||
|
||||
lj_gpu(eflag,vflag,rebuild);
|
||||
eng_vdwl=lj_gpu_forces(atom->f,ilist,eflag,vflag, eflag_atom, vflag_atom,
|
||||
eatom, vatom, virial);
|
||||
|
||||
if (vflag_fdotr) virial_compute();
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
global settings
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairLJCutGPU::settings(int narg, char **arg)
|
||||
{
|
||||
if (narg != 1 && narg != 3) error->all("Illegal pair_style command");
|
||||
|
||||
// Set multi_gpu_mode to one_node for multiple gpus on 1 node
|
||||
// -- param is starting gpu id
|
||||
// Set multi_gpu_mode to one_gpu to select the same gpu id on every node
|
||||
// -- param is id of gpu
|
||||
// Set multi_gpu_mode to multi_gpu to get ma
|
||||
// -- param is number of gpus per node
|
||||
multi_gpu_mode=ONE_NODE;
|
||||
multi_gpu_param=0;
|
||||
if (narg==3) {
|
||||
multi_gpu_param=atoi(arg[2]);
|
||||
if (strcmp("one_node",arg[1])==0)
|
||||
multi_gpu_mode=ONE_NODE;
|
||||
else if (strcmp("one_gpu",arg[1])==0)
|
||||
multi_gpu_mode=ONE_GPU;
|
||||
else if (strcmp("multi_gpu",arg[1])==0) {
|
||||
multi_gpu_mode=MULTI_GPU;
|
||||
if (multi_gpu_param<1)
|
||||
error->all("Illegal pair_style command");
|
||||
} else
|
||||
error->all("Illegal pair_style command");
|
||||
narg-=2;
|
||||
}
|
||||
|
||||
PairLJCut::settings(narg,arg);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
init specific to this pair style
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Per-run initialization: pick a GPU for this rank, recompute cutoffs,
// initialize the CUDA library on the device, allocate the host staging
// buffer, and request a full neighbor list.
void PairLJCutGPU::init_style()
{
  // Set the GPU ID for this MPI rank according to the selected mode.
  int my_gpu=comm->me+multi_gpu_param;
  int ngpus=universe->nprocs;
  if (multi_gpu_mode==ONE_GPU) {
    my_gpu=multi_gpu_param;
    ngpus=1;
  } else if (multi_gpu_mode==MULTI_GPU) {
    ngpus=multi_gpu_param;
    my_gpu=comm->me%ngpus;
    if (ngpus>universe->nprocs)
      ngpus=universe->nprocs;
  }

  int irequest = neighbor->request(this);
  cut_respa=NULL;  // rRESPA cutoffs unused by the GPU path

  // Repeat cutsq calculation because done after call to init_style
  double cut;
  for (int i = 1; i <= atom->ntypes; i++)
    for (int j = i; j <= atom->ntypes; j++) {
      cut = init_one(i,j);
      cutsq[i][j] = cutsq[j][i] = cut*cut;
    }

  // Initialize the device and obtain the device-side neighbor buffer.
  ij=lj_gpu_init(ij_size, atom->ntypes+1, cutsq, sigma, epsilon, lj1, lj2, lj3,
                 lj4, offset, force->special_lj, neighbor->oneatom, my_gpu);
  if (ij==0)
    error->one("AT LEAST ONE PROCESS COULD NOT ALLOCATE A CUDA-ENABLED GPU.");

  // (Re)allocate the host-side staging buffer to match ij_size.
  if (ij_new!=NULL)
    delete [] ij_new;
  ij_new=new int[ij_size];

  last_neighbor = -1;
  // GPU kernel needs a full (non-half) neighbor list and no newton pair
  neighbor->requests[irequest]->half = 0;
  neighbor->requests[irequest]->full = 1;
  if (force->newton_pair)
    error->all("Cannot use newton with GPU LJCut pair style.");

  // Report the devices in use (rank 0 only).
  if (comm->me == 0 && screen) {
    printf("\n-------------------------------------");
    printf("-------------------------------------\n");
    printf("- Using GPGPU acceleration for LJ-Cut:\n");
    printf("-------------------------------------");
    printf("-------------------------------------\n");

    for (int i=0; i<ngpus; i++) {
      int gpui=my_gpu;
      if (multi_gpu_mode==ONE_NODE)
        gpui=i+multi_gpu_param;
      else if (multi_gpu_mode==MULTI_GPU)
        gpui=i;
      std::string gpu_string=lj_gpu_name(gpui,neighbor->oneatom);
      printf("GPU %d: %s\n",gpui,gpu_string.c_str());
    }
    printf("-------------------------------------");
    printf("-------------------------------------\n\n");
  }
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
double PairLJCutGPU::memory_usage()
|
||||
{
|
||||
double bytes=Pair::memory_usage()+ij_size*sizeof(int);
|
||||
return bytes+lj_gpu_bytes();
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_LJ_CUT_GPU_H
|
||||
#define PAIR_LJ_CUT_GPU_H
|
||||
|
||||
#include "pair_lj_cut.h"
|
||||
|
||||
namespace LAMMPS_NS {

// GPU-accelerated LJ-Cut pair style: inherits coefficient handling
// from the CPU PairLJCut and offloads the force computation to a
// single CUDA device per MPI rank.
class PairLJCutGPU : public PairLJCut {
 public:
  PairLJCutGPU(LAMMPS *lmp);
  ~PairLJCutGPU();
  void compute(int, int);
  void settings(int, char **);
  void init_style();
  double memory_usage();

  // GPU-to-rank assignment modes selected in settings().
  enum { ONE_NODE, ONE_GPU, MULTI_GPU };

 private:
  int ij_size;  // capacity (in ints) of the host neighbor staging buffer
  // device-side neighbor buffer and host-side staging buffer
  int *ij, *ij_new;

  // neighbor-rebuild tracking and GPU-assignment configuration
  int last_neighbor, multi_gpu_mode, multi_gpu_param;
};

}
|
||||
#endif
|
|
@ -0,0 +1,41 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// Style registration stubs for the GPU package.  LAMMPS includes every
// style_*.h file once per category macro; all sections are empty here
// except the Pair sections, which register this package's two styles.

#ifdef AtomInclude
#endif

#ifdef AtomClass
# endif

#ifdef ComputeInclude
#endif

#ifdef ComputeClass
#endif

#ifdef FixInclude
#endif

#ifdef FixClass
#endif

#ifdef PairInclude
#include "pair_lj_cut_gpu.h"
#include "pair_gayberne_gpu.h"
#endif

#ifdef PairClass
PairStyle(lj/cut/gpu,PairLJCutGPU)
PairStyle(gayberne/gpu,PairGayBerneGPU)
#endif
|
||||
|
Loading…
Reference in New Issue