Updating lib/gpu to version 2.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3785 f3b2605a-c512-4ea7-a41b-209d697bcdaa
pscrozi 2010-02-04 21:33:18 +00:00
parent 5ce854b780
commit 4ae4792b00
27 changed files with 1602 additions and 573 deletions

lib/gpu/Makefile.cyg Normal file
View File

@ -0,0 +1,72 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = /cygdrive/c/CUDA/bin/nvcc -I/cygdrive/c/CUDA/include -O3 -DWINDLL -DUNIX -Xptxas -v --use_fast_math
CUDA_ARCH = -arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/cygdrive/c/CUDA/lib -lcudart $(CUDA_LIB)
CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)
CUDA_LIB = $(OBJ_DIR)/gpu.dll
# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
# Dependencies for the Texture Tar
TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu
ALL_H = $(NVC_H) $(PAIR_H)
EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.obj $(OBJ_DIR)/pair_gpu_nbor.obj \
$(OBJ_DIR)/pair_tex_tar.obj $(OBJ_DIR)/pair_gpu_cell.obj
all: $(CUDA_LIB) $(EXECS)
$(OBJ_DIR)/nvc_device.obj : nvc_device.cu $(NVC_H)
$(CUDA) -o $@ -c nvc_device.cu
$(OBJ_DIR)/pair_gpu_nbor.obj: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
$(CUDA) -o $@ -c pair_gpu_nbor.cu
$(OBJ_DIR)/pair_tex_tar.obj: $(TAR_H)
$(CUDA) -o $@ -c pair_tex_tar.cu
$(OBJ_DIR)/pair_gpu_cell.obj: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
$(CUDA) -o $@ -c pair_gpu_cell.cu
$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.obj
$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.obj
$(CUDA_LIB): $(OBJS) $(TAR_H)
$(CUDA) -o $@ -shared $(OBJS)
clean:
rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.exe *.exp *.lib *.dll *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo

View File

@ -1,25 +1,29 @@
#***************************************************************************
# Makefile
# -------------------
# W. Michael Brown
#
# _________________________________________________________________________
# Build for the LAMMPS GPU Force Library
#
# _________________________________________________________________________
#
# begin : Tue June 23 2009
# copyright : (C) 2009 by W. Michael Brown
# email : wmbrown@sandia.gov
# ***************************************************************************/
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
CUDA_ARCH = -arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/usr/local/cuda/lib64 -lcudart $(CUDA_LIB)
CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB)
CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)
@ -28,7 +32,7 @@ CUDA_LIB = $(OBJ_DIR)/libgpu.a
# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
# Dependencies for the Texture Tar
TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
@ -38,7 +42,7 @@ ALL_H = $(NVC_H) $(PAIR_H)
EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/pair_gpu_nbor.cu_o \
$(OBJ_DIR)/pair_tex_tar.cu_o
$(OBJ_DIR)/pair_tex_tar.cu_o $(OBJ_DIR)/pair_gpu_cell.cu_o
all: $(CUDA_LIB) $(EXECS)
@ -51,6 +55,9 @@ $(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor
$(OBJ_DIR)/pair_tex_tar.cu_o: $(TAR_H)
$(CUDA) -o $@ -c pair_tex_tar.cu
$(OBJ_DIR)/pair_gpu_cell.cu_o: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
$(CUDA) -o $@ -c pair_gpu_cell.cu
$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o
$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o

View File

@ -1,29 +1,25 @@
/***************************************************************************
README
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
README for building LAMMPS GPU Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
GENERAL NOTES
This library, pair_gpu_lib.a, provides routines for GPGPU acceleration
This library, libgpu.a, provides routines for GPU acceleration
of LAMMPS pair styles. Currently, only CUDA enabled GPUs are
supported. Compilation of this library requires installing the CUDA
GPU driver and CUDA toolkit for your operating system. In addition to
@ -33,14 +29,14 @@ devices on your system.
NOTE: Installation of the CUDA SDK is not required.
Current pair styles supporting GPU Accelartion:
Current pair styles supporting GPU acceleration:
1. lj/cut/gpu
2. gayberne/gpu
MULTIPLE LAMMPS PROCESSES
When using GPGPU acceleration, you are restricted to one physical GPU
When using GPU acceleration, you are restricted to one physical GPU
per LAMMPS process. This can be multiple GPUs on a single node or
across multiple nodes. Instructions on GPU assignment can be found in
the LAMMPS documentation.
@ -66,6 +62,9 @@ the CUDA_PREC variable:
CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations
CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
NOTE: For the lj/cut pair style, only single precision will be used, even
if double precision is specified.
NOTE: Double precision is only supported on certain GPUs (with
compute capability >= 1.3).
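A minimal sketch of how these flags could map onto the PRECISION and ACC_PRECISION types used by the kernels (illustration only; the actual typedefs live in the library headers and are not reproduced here):
/* Hypothetical mapping, for illustration only. */
#ifdef _SINGLE_SINGLE
typedef float  PRECISION;      /* coordinates, forces in single          */
typedef float  ACC_PRECISION;  /* accumulation also in single            */
#elif defined(_SINGLE_DOUBLE)
typedef float  PRECISION;      /* coordinates, forces in single          */
typedef double ACC_PRECISION;  /* accumulation of forces, etc. in double */
#elif defined(_DOUBLE_DOUBLE)
typedef double PRECISION;      /* double precision for all calculations  */
typedef double ACC_PRECISION;
#endif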
@ -74,14 +73,14 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
package has been installed before installing the GPU package in LAMMPS.
GPU MEMORY
Upon initialization of the pair style, the library will reserve memory
for 64K atoms per GPU or 70% of each cards GPU memory, whichever value
Upon initialization of the gayberne/gpu pair style, the library will reserve
memory for 64K atoms per GPU or 70% of each card's GPU memory, whichever value
is limiting. The value of 70% can be changed by editing the
PERCENT_GPU_MEMORY definition in the source file. The value of 64K
cannot be increased and is the maximum number of atoms allowed per
PERCENT_GPU_MEMORY definition in the source file. For gayberne/gpu, the value
of 64K cannot be increased and is the maximum number of atoms allowed per
GPU. Using the 'neigh_modify one' modifier in your LAMMPS input script
can help to increase the maximum number of atoms per GPU for cards with
limited memory.

View File

@ -1,27 +1,21 @@
/***************************************************************************
gb_gpu.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Gay-Berne anisotropic potential GPU calcultation
*** Force decomposition by Atom Version ***
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
@ -190,30 +184,30 @@ inline string gb_gpu_toa(const t& in) {
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
string gb_gpu_name(const int id, const int max_nbors) {
string name=GBMF[0].gpu.name(id)+", "+
EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
string sname=GBMF[0].gpu.name(id)+", "+
gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+
gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
max_nbors))+" Atoms";
return name;
strcpy(name,sname.c_str());
}
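A minimal caller sketch for the revised C-style interface above (hypothetical host code; the buffer size and max_nbors value are assumptions, not taken from the library):
// Hypothetical usage of the exported gb_gpu_name() above.
char gpu_name[256];              // assumed buffer size; must fit the string
gb_gpu_name(0, 300, gpu_name);   // device 0, assumed max_nbors of 300
std::cout << gpu_name << std::endl;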
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int max_nbors, const int thread, const int gpu_id) {
EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int max_nbors, const int thread, const int gpu_id) {
assert(thread<MAX_GPU_THREADS);
GBMF[thread].gpu.init();
if (GBMF[thread].gpu.num_devices()==0)
return false;
@ -227,7 +221,7 @@ bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear(const int thread) {
EXTERN void gb_gpu_clear(const int thread) {
GBMF[thread].clear();
}
@ -262,7 +256,7 @@ inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
atom.time_atom.stop();
}
void gb_gpu_atom(double **host_x, double **host_quat,
EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
const int *host_type, const bool rebuild, const int thread) {
_gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
GBMF[thread].pair_stream);
@ -327,7 +321,7 @@ int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
return ilist;
}
int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
EXTERN int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
int *ilist, const int *numj, const int *type,
const int thread, bool &success) {
return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
@ -340,7 +334,7 @@ int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
const bool eflag) {
const bool eflag) {
gbm.nbor.time_nbor.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
@ -350,8 +344,8 @@ void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
gbm.nbor.time_nbor.stop();
}
void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
const int thread) {
EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
const int thread) {
_gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
}
@ -453,7 +447,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
}
}
void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
const int thread) {
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
}
@ -490,7 +484,7 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
return evdw;
}
double gb_gpu_forces(double **f, double **tor, const int *ilist,
EXTERN double gb_gpu_forces(double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial, const int thread) {
@ -499,7 +493,7 @@ double gb_gpu_forces(double **f, double **tor, const int *ilist,
vflag_atom,eatom,vatom,virial);
}
void gb_gpu_time(const int i) {
EXTERN void gb_gpu_time(const int i) {
cout.precision(4);
cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds()
<< " s.\n"
@ -515,10 +509,10 @@ void gb_gpu_time(const int i) {
<< " s.\n";
}
int gb_gpu_num_devices() {
EXTERN int gb_gpu_num_devices() {
return GBMF[0].gpu.num_devices();
}
double gb_gpu_bytes() {
EXTERN double gb_gpu_bytes() {
return GBMF[0].host_memory_usage();
}

View File

@ -1,25 +1,21 @@
/***************************************************************************
gb_gpu_extra.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Inline GPU kernel routines ala math_extra for the CPU.
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H

View File

@ -1,26 +1,21 @@
/***************************************************************************
gb_gpu_kernel.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Routines that actually perform the force/torque computation
*** Force Decomposition by Atom Version ***
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL

View File

@ -1,25 +1,21 @@
/***************************************************************************
gb_gpu_memory.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Global variables for GPU Gayberne Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "gb_gpu_memory.h"
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>

View File

@ -1,25 +1,21 @@
/***************************************************************************
gb_gpu_memory.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Global variables for GPU Gayberne Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H

View File

@ -1,25 +1,21 @@
/***************************************************************************
lj_gpu.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Lennard-Jones potential GPU calcultation
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
@ -27,18 +23,39 @@
#include "nvc_timer.h"
#include "nvc_device.h"
#include "pair_gpu_texture.h"
#include "pair_gpu_cell.h"
#include "lj_gpu_memory.cu"
#include "lj_gpu_kernel.h"
#ifdef WINDLL
#include <windows.h>
BOOL APIENTRY DllMain(HANDLE hModule, DWORD dwReason, LPVOID lpReserved)
{
return TRUE;
}
#endif
#ifdef WINDLL
#define EXTERN extern "C" __declspec(dllexport)
#else
#define EXTERN
#endif
using namespace std;
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
static float kernelTime = 0.0;
static int ncell1D;
static float *energy, *d_energy;
static float3 *d_force, *f_temp, *v_temp, *d_virial;
static cell_list cell_list_gpu;
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
template <class t>
inline string lj_gpu_toa(const t& in) {
ostringstream o;
@ -50,113 +67,54 @@ inline string lj_gpu_toa(const t& in) {
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
string lj_gpu_name(const int id, const int max_nbors) {
string name=LJMF.gpu.name(id)+", "+
EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name) {
string sname=LJMF.gpu.name(id)+", "+
lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+
lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id),
max_nbors))+" Atoms";
return name;
lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ";
strcpy(name,sname.c_str());
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, const int max_nbors, const int gpu_id) {
LJMF.gpu.init();
EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, double *boxlo, double *boxhi,
double cell_size, double skin,
const int max_nbors, const int gpu_id) {
LJMF.gpu.init();
if (LJMF.gpu.num_devices()==0)
return false;
ij_size=IJ_SIZE;
return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
ncell1D = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
init_cell_list_const(cell_size, skin, boxlo, boxhi);
return ret;
}
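A worked example of the cell-count arithmetic in lj_gpu_init above, using assumed box values (comments only):
// Assumed for illustration: boxlo[0] = 0.0, boxhi[0] = 20.0, cell_size = 2.5.
// ncell1D = ceil((20.0 - 0.0 + 2.0*2.5) / 2.5) = ceil(10.0) = 10
//           (the 2.0*cell_size term adds one ghost-cell layer per side)
// ncell   = ncell1D^3 = 1000, the value later passed to init_cell_list().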
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void lj_gpu_clear() {
EXTERN void lj_gpu_clear() {
free(energy);
free(v_temp);
cudaFreeHost(f_temp);
cudaFree(d_force);
cudaFree(d_energy);
cudaFree(d_virial);
clear_cell_list(cell_list_gpu);
LJMF.clear();
}
// ---------------------------------------------------------------------------
// copy atom positions and optionally types to device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void _lj_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
const int *host_type, const bool rebuild,
cudaStream_t &stream) {
atom.time_atom.start();
atom.reset_write_buffer();
// First row of dev_x is x position, second is y, third is z
atom.add_atom_data(host_x[0],3);
atom.add_atom_data(host_x[0]+1,3);
atom.add_atom_data(host_x[0]+2,3);
int csize=3;
// If a rebuild occured, copy type data
if (rebuild) {
atom.add_atom_data(host_type);
csize++;
}
atom.copy_atom_data(csize,stream);
atom.time_atom.stop();
}
void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) {
_lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream);
}
// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// ---------------------------------------------------------------------------
template <class LJMTyp>
bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum,
int *ilist, const int *numj) {
if (nall>ljm.max_atoms)
return false;
ljm.nbor.time_nbor.start();
ljm.atom.nall(nall);
ljm.atom.inum(inum);
ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream);
ljm.nbor.time_nbor.stop();
return true;
}
bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
const int *numj) {
return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj);
}
// ---------------------------------------------------------------------------
// Copy a set of ij_size ij interactions to device and compute energies,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
template <class LJMTyp>
void _lj_gpu_nbors(LJMTyp &ljm, const int *ij, const int num_ij) {
ljm.nbor.time_nbor.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
memcpy(ljm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
ljm.nbor.time_nbor.start();
ljm.nbor.add(num_ij,ljm.pair_stream);
ljm.nbor.time_nbor.stop();
}
void lj_gpu_nbors(const int *ij, const int num_ij) {
_lj_gpu_nbors(LJMF,ij,num_ij);
}
// ---------------------------------------------------------------------------
// Calculate energies and forces for all ij interactions
@ -169,6 +127,7 @@ void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));
ljm.time_pair.start();
if (ljm.shared_types)
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
@ -181,47 +140,279 @@ void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
vflag, ljm.atom.inum(), ljm.atom.nall());
ljm.time_pair.stop();
}
EXTERN void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
_lj_gpu<PRECISION,ACC_PRECISION>(LJMF, eflag,vflag,rebuild);
}
template <class numtyp, class acctyp>
double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
double **host_x, int *host_type, const int inum,
const int nall, const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
ljm.atom.nall(nall);
ljm.atom.inum(inum);
ljm.nbor.time_nbor.start();
ljm.nbor.time_nbor.stop();
double evdwl=0.0;
static int buffer = CELL_SIZE;
static int ncell = (int)pow((float)ncell1D,3);
static int first_call = 1;
// allocate memory on CPU and GPU
if (first_call) {
energy = (float*) malloc(inum*sizeof(float));
v_temp = (float3*)malloc(inum*2*sizeof(float3));
cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
cudaMalloc((void**)&d_force, inum*sizeof(float3));
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
init_cell_list(cell_list_gpu, nall, ncell, buffer);
first_call = 0;
}
if (!first_call && ago == 0) {
free(energy);
free(v_temp);
cudaFreeHost(f_temp);
cudaFree(d_force);
cudaFree(d_energy);
cudaFree(d_virial);
energy = (float*) malloc(inum*sizeof(float));
v_temp = (float3*)malloc(inum*2*sizeof(float3));
cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
cudaMalloc((void**)&d_force, inum*sizeof(float3));
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
clear_cell_list(cell_list_gpu);
init_cell_list(cell_list_gpu, nall, ncell, buffer);
}
// build cell-list on GPU
ljm.atom.time_atom.start();
build_cell_list(host_x[0], host_type, cell_list_gpu,
ncell, ncell1D, buffer, inum, nall, ago);
ljm.atom.time_atom.stop();
ljm.time_pair.start();
#ifdef TIMING
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
// call the cell-list force kernel
const int BX=BLOCK_1D;
dim3 GX(ncell1D, ncell1D*ncell1D);
if (eflag == 0 && vflag == 0) {
kernel_lj_cell<false,false><<<GX, BX, 0>>>
(d_force, d_energy, d_virial,
cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.type,
cell_list_gpu.natom,
inum, nall, ncell);
} else {
kernel_lj_cell<true,true><<<GX, BX, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES>>>
(d_force, d_energy, d_virial,
cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.type,
cell_list_gpu.natom,
inum, nall, ncell);
}
#ifdef TIMING
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float kTime;
cudaEventElapsedTime(&kTime, start, stop);
kernelTime += kTime;
printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
// copy results from GPU to CPU
cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
if (eflag) {
cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < inum; i++) {
evdwl += energy[i];
}
evdwl *= 0.5f;
}
if (vflag) {
cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
for (int i = 0; i < inum; i++) {
virial[0] += v_temp[2*i].x;
virial[1] += v_temp[2*i].y;
virial[2] += v_temp[2*i].z;
virial[3] += v_temp[2*i+1].x;
virial[4] += v_temp[2*i+1].y;
virial[5] += v_temp[2*i+1].z;
}
for (int i = 0; i < 6; i++)
virial[i] *= 0.5f;
}
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
ljm.time_pair.stop();
}
void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
_lj_gpu<PRECISION,ACC_PRECISION>(LJMF,eflag,vflag,rebuild);
}
// ---------------------------------------------------------------------------
// Get energies and forces to host
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial) {
double evdw;
ljm.atom.time_answer.start();
ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream);
ljm.atom.time_atom.add_to_total();
ljm.nbor.time_nbor.add_to_total();
ljm.time_pair.add_to_total();
CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream));
evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
ljm.atom.add_forces(ilist,f);
ljm.atom.time_answer.stop();
ljm.atom.time_answer.add_to_total();
return evdw;
return evdwl;
}
double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial) {
return _lj_gpu_forces<PRECISION,ACC_PRECISION>
(LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial);
EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
return _lj_gpu_cell<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
ago, eflag, vflag, boxlo, boxhi);
}
void lj_gpu_time() {
template <class numtyp, class acctyp>
double _lj_gpu_n2(LJMT &ljm, double **force, double *virial,
double **host_x, int *host_type, const int inum, const int nall, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
ljm.atom.nall(nall);
ljm.atom.inum(inum);
ljm.nbor.time_nbor.start();
ljm.nbor.time_nbor.stop();
double evdwl=0.0;
#ifdef NOUSE
static int first_call = 1;
if (first_call) {
energy = (float*) malloc(inum*sizeof(float));
v_temp = (float3*) malloc(inum*2*sizeof(float3));
cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
cudaMalloc((void**)&d_force, inum*sizeof(float3));
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
cudaMalloc((void**)&d_pos, nall*sizeof(float3));
cudaMalloc((void**)&d_type, nall*sizeof(int));
first_call = 0;
}
ljm.atom.time_atom.start();
double *atom_pos = host_x[0];
for (int i = 0; i < 3*nall; i+=3) {
pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
}
cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
cudaMemcpy(d_type, host_type, nall*sizeof(int), cudaMemcpyHostToDevice);
ljm.atom.time_atom.stop();
ljm.time_pair.start();
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
dim3 GX(static_cast<int>(ceil(static_cast<double>(inum)/BX)));
#ifdef TIMING
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
// N^2 force kernel
kernel_lj_n2<numtyp, acctyp><<<GX, BX>>>(d_force, d_energy, d_virial,
d_pos, d_type, eflag, vflag, inum, nall);
#ifdef TIMING
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float kTime;
cudaEventElapsedTime(&kTime, start, stop);
kernelTime += kTime;
printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
// copy results from GPU to CPU
cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
if (eflag) {
cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < inum; i++) {
evdwl += energy[i];
}
evdwl *= 0.5f;
}
if (vflag) {
cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
for (int i = 0; i < inum; i++) {
virial[0] += v_temp[2*i].x;
virial[1] += v_temp[2*i].y;
virial[2] += v_temp[2*i].z;
virial[3] += v_temp[2*i+1].x;
virial[4] += v_temp[2*i+1].y;
virial[5] += v_temp[2*i+1].z;
}
for (int i = 0; i < 6; i++)
virial[i] *= 0.5f;
}
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
#endif
ljm.time_pair.stop();
ljm.atom.time_atom.add_to_total();
ljm.nbor.time_nbor.add_to_total();
ljm.time_pair.add_to_total();
return evdwl;
}
EXTERN double lj_gpu_n2(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
return _lj_gpu_n2<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
eflag, vflag, boxlo, boxhi);
}
EXTERN void lj_gpu_time() {
cout.precision(4);
cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n";
cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
@ -229,10 +420,10 @@ void lj_gpu_time() {
cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n";
}
int lj_gpu_num_devices() {
EXTERN int lj_gpu_num_devices() {
return LJMF.gpu.num_devices();
}
double lj_gpu_bytes() {
EXTERN double lj_gpu_bytes() {
return LJMF.host_memory_usage();
}
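Under the WINDLL build, the EXTERN macro gives these entry points C linkage and exports them from gpu.dll. A rough sketch of matching host-side declarations (hypothetical caller; signatures copied from the definitions above, import decoration omitted):
// Hypothetical host-side declarations for a few exported entry points.
extern "C" {
  void   lj_gpu_name(const int id, const int max_nbors, char *name);
  int    lj_gpu_num_devices();
  double lj_gpu_bytes();
}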

View File

@ -1,29 +1,220 @@
/***************************************************************************
lj_gpu_kernel.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Routines that actually perform the force computation
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
/* Cell list version of LJ kernel */
template<bool eflag, bool vflag>
__global__ void kernel_lj_cell(float3 *force3,
float *energy, float3 *virial,
float3 *cell_list, unsigned int *cell_idx,
int *cell_type, int *cell_atom,
const int inum, const int nall, const int ncell)
{
// calculate 3D block idx from 2d block
int bx = blockIdx.x;
int by = blockIdx.y % gridDim.x;
int bz = blockIdx.y / gridDim.x;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
__shared__ int typeSh[CELL_SIZE];
__shared__ float posSh[CELL_SIZE*3];
__shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
extern __shared__ float smem[];
__shared__ float *lj3Sh;
__shared__ float *lj4Sh;
__shared__ float *offsetSh;
// load force parameters into shared memory
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
cutsqSh[i] = _cutsq_<float>(itype,jtype);
lj1Sh[i] = _lj1_<float>(itype,jtype).x;
lj2Sh[i] = _lj1_<float>(itype,jtype).y;
}
// Only allocate shared memory when needed,
// this reduces shared memory limitation on occupancy
if (eflag || vflag) {
lj3Sh = smem;
lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
lj4Sh[i] = _lj3_<float>(itype,jtype).y;
offsetSh[i]= _offset_<float>(itype,jtype);
}
}
__syncthreads();
int nborz0 = max(bz-1,0), nborz1 = min(bz+1, gridDim.x-1),
nbory0 = max(by-1,0), nbory1 = min(by+1, gridDim.x-1),
nborx0 = max(bx-1,0), nborx1 = min(bx+1, gridDim.x-1);
for (int ii = 0; ii < ceil((float)(cell_atom[cid])/BLOCK_1D); ii++) {
float3 f = {0.0f, 0.0f, 0.0f};
float ener = 0.0f;
float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
int itype;
float ix, iy, iz;
int i = tid + ii*BLOCK_1D;
unsigned int answer_pos = cell_idx[cid*CELL_SIZE+i];
// load current cell atom position and type into sMem
for (int j = tid; j < cell_atom[cid]; j += BLOCK_1D) {
int pid = cid*CELL_SIZE + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ CELL_SIZE] = pos.y;
posSh[j+2*CELL_SIZE] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
if (answer_pos < inum) {
itype = typeSh[i];
ix = posSh[i ];
iy = posSh[i+ CELL_SIZE];
iz = posSh[i+2*CELL_SIZE];
// compute force from current cell
for (int j = 0; j < cell_atom[cid]; j++) {
if (j == i) continue;
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ CELL_SIZE];
float delz = iz - posSh[j+2*CELL_SIZE];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
if (r2inv < cutsqSh[mtype]) {
r2inv = 1.0f/r2inv;
float r6inv = r2inv * r2inv * r2inv;
float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
f.x += delx * force;
f.y += dely * force;
f.z += delz * force;
if (eflag) {
float e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
ener += (e - offsetSh[mtype]);
}
if (vflag) {
v0.x += delx*delx*force;
v0.y += dely*dely*force;
v0.z += delz*delz*force;
v1.x += delx*dely*force;
v1.y += delx*delz*force;
v1.z += dely*delz*force;
}
}
}
}
__syncthreads();
// compute force from neighboring cells
for (int nborz = nborz0; nborz <= nborz1; nborz++) {
for (int nbory = nbory0; nbory <= nbory1; nbory++) {
for (int nborx = nborx0; nborx <= nborx1; nborx++) {
if (nborz == bz && nbory == by && nborx == bx) continue;
// compute cell id
int cid_nbor = nborx + INT_MUL(nbory,gridDim.x) +
INT_MUL(nborz,gridDim.x*gridDim.x);
// load neighbor cell position and type into smem
for (int j = tid; j < cell_atom[cid_nbor]; j += BLOCK_1D) {
int pid = INT_MUL(cid_nbor,CELL_SIZE) + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ CELL_SIZE] = pos.y;
posSh[j+2*CELL_SIZE] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
// compute force
if (answer_pos < inum) {
for (int j = 0; j < cell_atom[cid_nbor]; j++) {
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ CELL_SIZE];
float delz = iz - posSh[j+2*CELL_SIZE];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
if (r2inv < cutsqSh[mtype]) {
r2inv = 1.0f/r2inv;
float r6inv = r2inv * r2inv * r2inv;
float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
f.x += delx * force;
f.y += dely * force;
f.z += delz * force;
if (eflag) {
float e=r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
ener += (e-offsetSh[mtype]);
}
if (vflag) {
v0.x += delx*delx*force;
v0.y += dely*dely*force;
v0.z += delz*delz*force;
v1.x += delx*dely*force;
v1.y += delx*delz*force;
v1.z += dely*delz*force;
}
}
}
}
__syncthreads();
}
}
}
if (answer_pos < inum) {
force3[answer_pos] = f;
if (eflag)
energy[answer_pos] = ener;
if (vflag) {
virial[2*answer_pos] = v0;
virial[2*answer_pos+1] = v1;
}
}
}
}
/* Neighbor list version of LJ kernel */
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch, acctyp *ans,
@ -36,7 +227,6 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x);
__syncthreads();
if (ii<inum) {
@ -158,7 +348,6 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
}
}
ii+=INT_MUL(blockIdx.x,blockDim.x);
__syncthreads();
if (ii<inum) {
@ -184,9 +373,11 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
int j= *list;
if (j < nall)
factor_lj = 1.0;
else {
@ -247,4 +438,116 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
} // if ii
}
/* Brute force O(N^2) version of LJ kernel */
template<class numtyp, class acctyp>
__global__ void kernel_lj_n2(float3 *force3,
float *energy, float3 *virial,
float3 *pos, int *type,
const bool eflag, const bool vflag, const int inum, const int nall)
{
int gid = threadIdx.x + INT_MUL(blockIdx.x, blockDim.x);
int tid = threadIdx.x;
__shared__ float posSh[BLOCK_1D*3];
__shared__ int typeSh[BLOCK_1D];
__shared__ numtyp cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj3Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj4Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp offsetSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=tid/MAX_SHARED_TYPES;
int jtype=tid%MAX_SHARED_TYPES;
cutsqSh[tid]=_cutsq_<numtyp>(itype,jtype);
lj1Sh[tid]=_lj1_<numtyp>(itype,jtype).x;
lj2Sh[tid]=_lj1_<numtyp>(itype,jtype).y;
lj3Sh[tid]=_lj3_<numtyp>(itype,jtype).x;
lj4Sh[tid]=_lj3_<numtyp>(itype,jtype).y;
offsetSh[tid]=_offset_<numtyp>(itype,jtype);
}
__syncthreads();
float3 f = {0.0f, 0.0f, 0.0f};
float ener = 0.0f;
float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
int itype, jtype;
int mtype;
numtyp ix, iy, iz;
if (gid < inum) {
ix = pos[gid].x;
iy = pos[gid].y;
iz = pos[gid].z;
itype = type[gid];
}
int pid = tid;
int nIter = ceil((float)nall/BLOCK_1D);
for (int jj = 0; jj < nIter; jj++, pid += BLOCK_1D) {
if (pid < nall) {
posSh[tid ] = pos[pid].x;
posSh[tid+ BLOCK_1D] = pos[pid].y;
posSh[tid+2*BLOCK_1D] = pos[pid].z;
typeSh[tid] = type[pid];
}
__syncthreads();
if (gid < inum) {
int pid_j = jj*BLOCK_1D;
for (int j = 0; j < BLOCK_1D; j++, pid_j++) {
if (jj == blockIdx.x && tid == j) continue;
if (pid_j < nall) {
numtyp delx = ix - posSh[j ];
numtyp dely = iy - posSh[j+ BLOCK_1D];
numtyp delz = iz - posSh[j+2*BLOCK_1D];
jtype = typeSh[j];
mtype = itype + jtype*MAX_SHARED_TYPES;
numtyp r2inv = delx * delx + dely * dely + delz * delz;
if (r2inv < cutsqSh[mtype]) {
r2inv = (numtyp)1.0/r2inv;
numtyp r6inv = r2inv * r2inv * r2inv;
numtyp force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
f.x += delx * force;
f.y += dely * force;
f.z += delz * force;
if (eflag) {
numtyp e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
ener +=(e-offsetSh[mtype]);
}
if (vflag) {
v0.x += delx*delx*force;
v0.y += dely*dely*force;
v0.z += delz*delz*force;
v1.x += delx*dely*force;
v1.y += delx*delz*force;
v1.z += dely*delz*force;
}
}
}
}
}
__syncthreads();
}
if (gid < inum) {
if (eflag)
energy[gid] = ener;
if (vflag) {
virial[2*gid ] = v0;
virial[2*gid+1] = v1;
}
force3[gid] = f;
}
}
#endif
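Both kernels in this file visit each pair twice (once from atom i, once from atom j), which is why the host code halves the accumulated energy and virial. A two-atom illustration, in comments:
// Pair (i,j) with pair energy e: the thread for i adds e and the thread
// for j adds e, so the host-side sum is 2e.  The 0.5f factor applied in
// _lj_gpu_cell and _lj_gpu_n2 recovers the physical value e; the same
// argument holds for each of the six accumulated virial components.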

View File

@ -1,25 +1,21 @@
/***************************************************************************
lj_gpu_memory.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Global variables for GPU Lennard-Jones Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>

View File

@ -1,25 +1,21 @@
/***************************************************************************
lj_gpu_memory.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Global variables for GPU Lennard-Jones Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_MEMORY_H
#define LJ_GPU_MEMORY_H
@ -30,8 +26,10 @@
#include "pair_gpu_nbor.h"
#define BLOCK_1D 64
#define CELL_SIZE 64
#define MAX_SHARED_TYPES 8
#define PERCENT_GPU_MEMORY 0.7
#define BIG_NUMBER 100000000
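Given these defines, the dynamic shared-memory request used when launching kernel_lj_cell with energy/virial enabled works out as follows (comments only; the numbers follow from the defines above and the launch in lj_gpu.cu):
// 3 * sizeof(float) * MAX_SHARED_TYPES * MAX_SHARED_TYPES
//   = 3 * 4 * 8 * 8 = 768 bytes per block,
// covering the lj3Sh, lj4Sh and offsetSh tables that kernel_lj_cell carves
// out of its extern __shared__ buffer when eflag or vflag is set.  The
// statically declared arrays (posSh, typeSh, cutsqSh, lj1Sh, lj2Sh) come on
// top of this and scale with CELL_SIZE (64) and MAX_SHARED_TYPES (8).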
template <class numtyp, class acctyp>
class LJ_GPU_Memory {

View File

@ -1,25 +1,21 @@
/***************************************************************************
nvc_device.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Utilities for dealing with cuda devices
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include <stdlib.h>
#include <stdio.h>

View File

@ -1,25 +1,21 @@
/***************************************************************************
nvc_device.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Utilities for dealing with cuda devices
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef NVC_DEVICE
#define NVC_DEVICE

View File

@ -1,25 +1,21 @@
/***************************************************************************
nvc_get_devices.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
List properties of cuda devices
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "nvc_device.h"

View File

@ -1,3 +1,22 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef NVC_MACROS_H
#define NVC_MACROS_H

View File

@ -1,25 +1,21 @@
/***************************************************************************
nvc_memory.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Routines for memory management on CUDA devices
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef NVC_MEMORY_H
#define NVC_MEMORY_H

View File

@ -1,25 +1,21 @@
/***************************************************************************
nvc_timer.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Class for timing CUDA routines
__________________________________________________________________________
This file is part of the NVC Library
__________________________________________________________________________
begin : Tue Feb 3 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef NVC_TIMER_H
#define NVC_TIMER_H

View File

@ -1,25 +1,21 @@
/***************************************************************************
nvc_texture_traits.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Tricks for templating textures
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef NVC_TEXTURE_TRAITS_H
#define NVC_TEXTURE_TRAITS_H

View File

@ -1,25 +1,21 @@
/***************************************************************************
pair_gpu_atom.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Memory routines for moving atom and force data between host and gpu
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "pair_gpu_texture.h"
#include "pair_gpu_atom.h"

View File

@ -1,25 +1,21 @@
/***************************************************************************
pair_gpu_atom.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Memory routines for moving atom and force data between host and gpu
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H

454
lib/gpu/pair_gpu_cell.cu Normal file
View File

@ -0,0 +1,454 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "lj_gpu_memory.h"
#include "pair_gpu_cell.h"
static __constant__ float d_boxlo[3];
static __constant__ float d_boxhi[3];
static __constant__ float d_cell_size[1];
static __constant__ float d_skin[1];
void init_cell_list_const(double cell_size, double skin,
double *boxlo, double *boxhi)
{
float cell_size1 = cell_size;
float skin1 = skin;
float boxlo1[3], boxhi1[3];
for (int i = 0; i < 3; i++) {
boxlo1[i] = boxlo[i];
boxhi1[i] = boxhi[i];
}
cudaMemcpyToSymbol(d_cell_size, &cell_size1, sizeof(float),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_boxlo, boxlo1, 3*sizeof(float),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_boxhi, boxhi1, 3*sizeof(float),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_skin, &skin1, sizeof(float),
0, cudaMemcpyHostToDevice);
}
__global__ void kernel_set_cell_list(unsigned int *cell_idx)
{
unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
cell_idx[gid] = BIG_NUMBER;
}
// build the cell list
__global__ void kernel_build_cell_list(float3 *cell_list,
unsigned int *cell_idx,
int *cell_type,
int *cell_atom,
float3 *pos,
int *type,
const int inum,
const int nall)
{
unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
float cSize = d_cell_size[0];
int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
if (gid < nall) {
float3 p = pos[gid];
p.x = fmaxf(p.x, d_boxlo[0]-cSize);
p.x = fminf(p.x, d_boxhi[0]+cSize);
p.y = fmaxf(p.y, d_boxlo[1]-cSize);
p.y = fminf(p.y, d_boxhi[1]+cSize);
p.z = fmaxf(p.z, d_boxlo[2]-cSize);
p.z = fminf(p.z, d_boxhi[2]+cSize);
int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncell1D
+ (int)(p.z/cSize + 1.0) * ncell1D * ncell1D;
int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
int pid = cell_id*CELL_SIZE + atom_pos;
cell_list[pid] = pos[gid];
cell_type[pid] = type[gid];
cell_idx [pid] = gid;
}
}
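For reference, a minimal host-side sketch (illustrative values, not part of this commit) of the cell-index mapping used in kernel_build_cell_list above: positions are clamped to the box plus one cell-size margin, the +1 offset shifts indices into a one-cell ghost shell, and ncell1D counts the interior cells plus two ghost layers per dimension. Note that the index arithmetic divides the raw coordinate by the cell size, which appears to assume the lower box corner sits at the origin.
// Sketch: reproduces the kernel's cell-index arithmetic on the CPU.
// All numeric values here are assumptions for illustration only.
#include <cmath>
#include <cstdio>
int main() {
  const float boxlo = 0.0f, boxhi = 10.0f;   // assumed box bounds
  const float cell_size = 2.5f;              // assumed cell size
  const int ncell1D = (int)std::ceil(((boxhi - boxlo) + 2.0f*cell_size) / cell_size);  // 6
  const float x = 3.1f, y = 0.2f, z = 9.9f;  // sample atom position
  const int ix = (int)(x/cell_size + 1.0f);  // 2
  const int iy = (int)(y/cell_size + 1.0f);  // 1
  const int iz = (int)(z/cell_size + 1.0f);  // 4
  const int cell_id = ix + iy*ncell1D + iz*ncell1D*ncell1D;  // 2 + 6 + 144 = 152
  printf("ncell1D=%d cell_id=%d\n", ncell1D, cell_id);
  return 0;
}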
__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebuild)
{
float cSize = d_cell_size[0];
int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
// calculate 3D block idx from 2d block
int bx = blockIdx.x;
int by = blockIdx.y % gridDim.x;
int bz = blockIdx.y / gridDim.x;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
float skin = d_skin[0];
float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin;
float hix = lowx + cSize + skin;
float lowy = d_boxlo[1] + (by-1)*cSize - 0.5*skin;
float hiy = lowy + cSize + skin;
float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin;
float hiz = lowz + cSize + skin;
for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
int pid = pbase + i;
float3 p = cell_list[pid];
p.x = fmaxf(p.x, d_boxlo[0]-cSize);
p.x = fminf(p.x, d_boxhi[0]+cSize);
p.y = fmaxf(p.y, d_boxlo[1]-cSize);
p.y = fminf(p.y, d_boxhi[1]+cSize);
p.z = fmaxf(p.z, d_boxlo[2]-cSize);
p.z = fminf(p.z, d_boxhi[2]+cSize);
if (p.x < lowx || p.x > hix || p.y < lowy || p.y > hiy || p.z < lowz || p.z > hiz) {
*rebuild = 1;
}
}
}
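A small numeric check of the rebuild criterion above (a sketch with assumed values, not part of the commit): an atom may drift up to half the neighbor skin outside its cell before a rebuild is flagged.
// Sketch: evaluates the same per-cell bounds as kernel_test_rebuild for one coordinate.
// boxlo, cell_size, skin, and the positions are illustrative assumptions.
#include <cstdio>
int main() {
  const float boxlo = 0.0f, cell_size = 2.5f, skin = 0.3f;
  const int bx = 2;                                          // cell index along x (ghost offset included)
  const float lowx = boxlo + (bx - 1)*cell_size - 0.5f*skin; // 2.35
  const float hix  = lowx + cell_size + skin;                // 5.15
  const float x_ok = 5.0f, x_out = 5.2f;
  printf("x=%.2f rebuild=%d\n", x_ok,  (x_ok  < lowx || x_ok  > hix));   // 0
  printf("x=%.2f rebuild=%d\n", x_out, (x_out < lowx || x_out > hix));   // 1
  return 0;
}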
__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell)
{
unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
if (gid < ncell) {
if (cell_atom[gid] > CELL_SIZE)
*overflow = 1;
}
}
__global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos)
{
// calculate 3D block idx from 2d block
int bx = blockIdx.x;
int by = blockIdx.y % gridDim.x;
int bz = blockIdx.y / gridDim.x;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
int pid = pbase + i;
cell_list[pid] = pos[cell_idx[pid]];
}
}
__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit);
void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size)
{
int i = 0;
const unsigned int bitSize = sizeof(unsigned int)*8;
const unsigned int bitStep = 4;
const int gSize = size/BLOCK_1D;
while (bitSize > i*bitStep) {
radixSortBlocks<<<gSize, BLOCK_1D, 2*BLOCK_1D*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
i++;
}
}
#ifdef __DEVICE_EMULATION__
#define __SYNC __syncthreads();
#else
#define __SYNC
#endif
#define WARP_SIZE 32
template<class T, int maxlevel>
__device__ T scanwarp(T val, T* sData)
{
// The following is the same as 2 * WARP_SIZE * warpId + threadInWarp =
// 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE - 1))
int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
sData[idx] = 0;
idx += WARP_SIZE;
sData[idx] = val; __SYNC
#ifdef __DEVICE_EMULATION__
T t = sData[idx - 1]; __SYNC
sData[idx] += t; __SYNC
t = sData[idx - 2]; __SYNC
sData[idx] += t; __SYNC
t = sData[idx - 4]; __SYNC
sData[idx] += t; __SYNC
t = sData[idx - 8]; __SYNC
sData[idx] += t; __SYNC
t = sData[idx - 16]; __SYNC
sData[idx] += t; __SYNC
#else
if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } __SYNC
if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } __SYNC
if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } __SYNC
if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } __SYNC
if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } __SYNC
#endif
return sData[idx] - val; // convert inclusive -> exclusive
}
__device__ unsigned int scan(unsigned int idata)
{
extern __shared__ unsigned int ptr[];
unsigned int idx = threadIdx.x;
unsigned int val = idata;
val = scanwarp<unsigned int, 4>(val, ptr);
__syncthreads();
if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
{
ptr[idx >> 5] = val + idata;
}
__syncthreads();
#ifndef __DEVICE_EMULATION__
if (idx < WARP_SIZE)
#endif
{
ptr[idx] = scanwarp<unsigned int, 2>(ptr[idx], ptr);
}
__syncthreads();
val += ptr[idx >> 5];
return val;
}
__device__ unsigned int rank(unsigned int preds)
{
unsigned int address = scan(preds);
__shared__ unsigned int numtrue;
if (threadIdx.x == BLOCK_1D - 1)
{
numtrue = address + preds;
}
__syncthreads();
unsigned int rank;
unsigned int idx = threadIdx.x;
rank = (preds) ? address : numtrue + idx - address;
return rank;
}
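For orientation, scan() and rank() together implement the stable split that a binary radix pass needs: keys whose current bit is 0 keep their relative order at the front of the block, keys with a 1 follow. A minimal CPU sketch of that split (illustrative only, not part of the library):
// One stable split over a block of keys on bit `shift`, mirroring what rank() computes per thread.
#include <cstdio>
#include <vector>
static void split_on_bit(std::vector<unsigned> &keys, unsigned shift) {
  std::vector<unsigned> out(keys.size());
  unsigned numtrue = 0;
  for (unsigned k : keys) numtrue += !((k >> shift) & 1u);   // count of zero-bit keys
  unsigned zero_pos = 0, one_pos = numtrue;                  // 1-bit keys start at numtrue
  for (unsigned k : keys) {
    if (!((k >> shift) & 1u)) out[zero_pos++] = k;           // rank = exclusive scan of predicates
    else                      out[one_pos++]  = k;           // rank = numtrue + idx - scan
  }
  keys.swap(out);
}
int main() {
  std::vector<unsigned> keys = {5, 2, 7, 4};   // illustrative keys
  split_on_bit(keys, 0);                       // bit 0: even keys first -> 2 4 5 7
  for (unsigned k : keys) printf("%u ", k);
  printf("\n");
  return 0;
}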
__device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit)
{
extern __shared__ unsigned int sMem1[];
__shared__ float sMem2[BLOCK_1D];
__shared__ int sMem3[BLOCK_1D];
int tid = threadIdx.x;
for(unsigned int shift = startbit; shift < (startbit + nbits); ++shift) {
unsigned int lsb;
lsb = !(((*key) >> shift) & 0x1);
unsigned int r;
r = rank(lsb);
// scatter each key to its new rank within the block ...
sMem1[r] = *key;
__syncthreads();
// ... then read back in thread order, completing one stable split pass
*key = sMem1[tid];
__syncthreads();
sMem2[r] = (*value1).x;
__syncthreads();
(*value1).x = sMem2[tid];
__syncthreads();
sMem2[r] = (*value1).y;
__syncthreads();
(*value1).y = sMem2[tid];
__syncthreads();
sMem2[r] = (*value1).z;
__syncthreads();
(*value1).z = sMem2[tid];
__syncthreads();
sMem3[r] = *value2;
__syncthreads();
*value2 = sMem3[tid];
__syncthreads();
}
}
__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit)
{
extern __shared__ unsigned int sMem[];
int gid = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int key;
float3 value1;
int value2;
key = keys[gid];
value1 = values1[gid];
value2 = values2[gid];
__syncthreads();
radixSortBlock(&key, &value1, &value2, nbits, startbit);
keys[gid] = key;
values1[gid] = value1;
values2[gid] = value2;
}
static float3 *d_pos, *pos_temp;
static int *d_type;
static int *d_overflow, *d_rebuild;
void init_cell_list(cell_list &cell_list_gpu,
const int nall,
const int ncell,
const int buffer)
{
cudaMalloc((void**)&(cell_list_gpu.pos), ncell*buffer*sizeof(float3));
cudaMalloc((void**)&(cell_list_gpu.idx), ncell*buffer*sizeof(unsigned int));
cudaMalloc((void**)&(cell_list_gpu.type), ncell*buffer*sizeof(int));
cudaMalloc((void**)&(cell_list_gpu.natom), ncell*sizeof(int));
cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
cudaMalloc((void**)&d_pos, nall*sizeof(float3));
cudaMalloc((void**)&d_type, nall*sizeof(int));
cudaMalloc((void**)&d_overflow, sizeof(int));
cudaMalloc((void**)&d_rebuild, sizeof(int));
cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));
cudaMemset(cell_list_gpu.pos, 0, ncell*buffer*sizeof(float3));
}
void clear_cell_list(cell_list &cell_list_gpu)
{
cudaFree(cell_list_gpu.pos);
cudaFree(cell_list_gpu.idx);
cudaFree(cell_list_gpu.natom);
cudaFree(cell_list_gpu.type);
cudaFreeHost(pos_temp);
cudaFree(d_pos);
cudaFree(d_type);
cudaFree(d_overflow);
cudaFree(d_rebuild);
}
void build_cell_list(double *atom_pos, int *atom_type,
cell_list &cell_list_gpu,
const int ncell, const int ncell1D, const int buffer,
const int inum, const int nall, const int ago)
{
cudaMemset(d_overflow, 0, sizeof(int));
cudaMemset(d_rebuild, 0, sizeof(int));
// copy position and type to GPU
for (int i = 0; i < 3*nall; i+=3) {
pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
}
cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
cudaMemcpy(d_type, atom_type, nall*sizeof(int), cudaMemcpyHostToDevice);
static int first_build = 1;
int rebuild = 0;
// copy the last built cell-list and test whether it needs to be rebuilt
if (!first_build) {
dim3 block(BLOCK_1D);
dim3 grid(ncell1D, ncell1D*ncell1D);
kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.natom, d_pos);
cudaMemset(d_rebuild, 0, sizeof(int));
kernel_test_rebuild<<<grid, block>>>(cell_list_gpu.pos,
cell_list_gpu.natom,
d_rebuild);
cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);
}
if (ago == 0) rebuild = 1;
// build cell-list for the first time
if (first_build || rebuild) {
first_build = 0;
// cout << "Building cell list..." << endl;
cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));
// initialize d_cell_idx for the sorting routine
kernel_set_cell_list<<<ncell, buffer>>>(cell_list_gpu.idx);
// build cell list
dim3 blockDim(128);
dim3 gridDim(static_cast<int>(ceil(static_cast<double>(nall)/blockDim.x)));
kernel_build_cell_list<<<gridDim, blockDim>>>(cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.type,
cell_list_gpu.natom,
d_pos, d_type, inum, nall);
// check cell list overflow
int overflow;
int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/BLOCK_1D));
kernel_test_overflow<<<gDimCell, BLOCK_1D>>>(cell_list_gpu.natom,
d_overflow, ncell);
cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
if (overflow > 0) {
printf("\n\nCELL_SIZE too small for cell list, please increase it!\n\n");
exit(1);
}
// sort atoms within each cell by atom index so the summation order is deterministic (avoids floating-point associativity differences)
sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos,
cell_list_gpu.type, ncell*buffer);
cudaThreadSynchronize();
}
}

62
lib/gpu/pair_gpu_cell.h Normal file
View File

@ -0,0 +1,62 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_CELL_H
#define PAIR_GPU_CELL_H
typedef struct {
float3 *pos;
unsigned int *idx;
int *type;
int *natom;
} cell_list;
__global__ void kernel_set_cell_list(unsigned int *cell_idx);
__global__ void kernel_build_cell_list(float3 *cell_list,
unsigned int *cell_idx,
int *cell_type,
int *cell_atom,
float3 *pos,
int *type,
const int inum,
const int nall);
__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebuild);
__global__ void kernel_copy_list(float3 *cell_list,
unsigned int *cell_idx,
int *cell_atom,
float3 *pos);
__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell);
void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size);
void init_cell_list_const(double cell_size, double skin,
double *boxlo, double *boxhi);
void init_cell_list(cell_list &cell_list_gpu,
const int nall,
const int ncell,
const int buffer);
void build_cell_list(double *atom_pos, int *atom_type,
cell_list &cell_list_gpu,
const int ncell, const int ncell1D, const int buffer,
const int inum, const int nall, const int ago);
void clear_cell_list(cell_list &cell_list_gpu);
#endif
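For context, a minimal host-side sketch of the expected call order for this API (an illustration under assumed box, skin, and buffer values, not part of the commit; the local buffer constant stands in for CELL_SIZE from lj_gpu_memory.h):
// Sketch of a driver using the cell-list interface declared above (compile as .cu).
#include <cmath>
#include "pair_gpu_cell.h"
void example_driver(double *atom_pos, int *atom_type, int inum, int nall, int ago) {
  double boxlo[3] = {0.0, 0.0, 0.0}, boxhi[3] = {10.0, 10.0, 10.0};  // assumed box
  double cell_size = 2.5, skin = 0.3;                                // assumed values
  const int buffer = 64;                      // assumed per-cell capacity (matches CELL_SIZE)
  int ncell1D = (int)std::ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size)/cell_size);
  int ncell   = ncell1D*ncell1D*ncell1D;
  init_cell_list_const(cell_size, skin, boxlo, boxhi);  // push constants to the GPU
  cell_list cl;
  init_cell_list(cl, nall, ncell, buffer);              // allocate device buffers
  build_cell_list(atom_pos, atom_type, cl, ncell, ncell1D, buffer, inum, nall, ago);
  // ... launch force kernels that read cl.pos / cl.type / cl.idx / cl.natom ...
  clear_cell_list(cl);                                  // free device buffers
}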

View File

@ -1,25 +1,21 @@
/***************************************************************************
pair_gpu_nbor.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Neighbor memory operations for LAMMPS GPU Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "pair_gpu_nbor.h"

View File

@ -1,25 +1,21 @@
/***************************************************************************
pair_gpu_nbor.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Neighbor memory operations for LAMMPS GPU Library
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Aug 4 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_NBOR_H
#define PAIR_GPU_NBOR_H

View File

@ -1,25 +1,21 @@
/***************************************************************************
pair_gpu_texture.h
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Tricks for templating textures
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "nvc_traits.h"
#include "nvc_memory.h"

View File

@ -1,26 +1,21 @@
/***************************************************************************
pair_tex_tar.cu
-------------------
W. Michael Brown
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
"Tar" of header and source files that need texture reference definitions
within file scope.
__________________________________________________________________________
This file is part of the LAMMPS GPU Library
__________________________________________________________________________
begin : Tue Jun 23 2009
copyright : (C) 2009 by W. Michael Brown
email : wmbrown@sandia.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
----------------------------------------------------------------------- */
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "pair_gpu_atom.cu"
#include "lj_gpu.cu"