git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15268 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2016-07-06 23:26:44 +00:00
parent fd27214f7d
commit 65c1e16401
174 changed files with 0 additions and 22457 deletions

View File

@ -1,4 +0,0 @@
# Makefile for liblammpscuda.a (USER-CUDA support library).
# No need to modify anything here!  The CUDA path is inserted into Makefile.common,
# and all build logic lives in Makefile.cudalib.
include Makefile.cudalib

View File

@ -1,123 +0,0 @@
#Common commandline argument interpreter for compilation with lammpscuda (USER-CUDA) installed
# make options:
# emu=1 switch to cuda emulation mode (otherwise: use gpu)
# dbg=1 print a lot of debugging output during runtime
# verbose=1 output nvcc command line during compilation
# keep=1 do not delete temporary compilation files (.ii, .cubin, ...)
# cufft=1 use cuda's fast fourier transformation lib "cufft" where possible (otherwise: use cpu fftw)
# binning=1 create virtual particle grid (neighbor-lists otherwise); currently this is not supported
# precision=1 single precision (global setting)
# precision=2 double precision (global setting)
# precision=3 positions in double precision, everything else single
# precision=4 positions and velocities in double precision, everything else single
SHELL = /bin/sh
# System-specific settings
CUDA_INSTALL_PATH = /usr/local/cuda
#CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
# e.g. in Gentoo
# CUDA_INSTALL_PATH = /opt/cuda
#//////////////////////////////////////////////////////////////////////////////////////////////
# no need to change anything below this line
#//////////////////////////////////////////////////////////////////////////////////////////////
#use CPU FFT if cufft=0 is requested.
FALLBACK_FFT = 1
#default settings for compiler switches
# COMPILELIB is defined by Makefile.cudalib when the library itself is built
# (defaults file is local); otherwise this file is included from a LAMMPS
# src-level build and the defaults live in ../../lib/cuda.
ifdef COMPILELIB
include Makefile.defaults
else
include ../../lib/cuda/Makefile.defaults
endif
#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer}
CUDA_FLAGS := -I${CUDA_INSTALL_PATH}/include -DUNIX
CUDA_USRLIB_CONDITIONAL := -L${CUDA_INSTALL_PATH}/lib -L${CUDA_INSTALL_PATH}/lib64
# debug setting
# dbg=1: define _DEBUG for host/device code and compile nvcc with -g -G;
# otherwise build optimized (-O3, no strict aliasing).
ifeq ($(strip $(dbg)), 1)
CUDA_FLAGS += -D_DEBUG -g
NVCC_FLAGS += -g -G
else
NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O3
endif
# skip timing on Mac and Windows manually
ifeq ($(strip $(prec_timer)), 0)
CUDA_FLAGS += -DNO_PREC_TIMING
endif
# set fft routine
# With FALLBACK_FFT = 1 (set above), cufft=0 silently keeps the host FFT
# settings; only with the fallback disabled is FFT compiled out via FFT_NONE.
ifeq ($(strip $(cufft)), 0)
ifneq ($(FALLBACK_FFT), 1)
FFT_INC = -DFFT_NONE
FFT_PATH =
FFT_LIB =
CUDA_FLAGS += -DFFT_NONE
endif
else
CUDA_FLAGS += -DFFT_CUFFT
CUDA_USRLIB_CONDITIONAL += -lcufft
endif
# make global precision setting
# precision=1 -> all single; 3 -> positions double; 4 -> positions+velocities
# double; any other value (including the default, 2) -> all double.
ifeq ($(strip $(precision)), 1)
CUDA_FLAGS += -DCUDA_PRECISION=1
else
ifeq ($(strip $(precision)), 3)
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2
else
ifeq ($(strip $(precision)), 4)
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2
else
CUDA_FLAGS += -DCUDA_PRECISION=2
endif
endif
endif
# make architecture settings
# NOTE(review): arch values 21/30/35 generate sm_21/sm_30/sm_35 code but still
# define -DCUDA_ARCH=20, and an unrecognized arch defines CUDA_ARCH=99 while
# falling back to sm_13 -- confirm this is intended before changing.
ifeq ($(strip $(arch)), 13)
CUDA_FLAGS += -DCUDA_ARCH=13
SMVERSIONFLAGS := -arch sm_13
else
ifeq ($(strip $(arch)), 20)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_20
else
ifeq ($(strip $(arch)), 21)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_21
else
ifeq ($(strip $(arch)), 30)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_30
else
ifeq ($(strip $(arch)), 35)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_35
else
CUDA_FLAGS += -DCUDA_ARCH=99
SMVERSIONFLAGS := -arch sm_13
endif
endif
endif
endif
endif
CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \
-I$(CUDA_INSTALL_PATH)/include

View File

@ -1,87 +0,0 @@
# Makefile for liblammpscuda.a
# No need to modify anything here!  The CUDA path is inserted into Makefile.common.
.DEFAULT: lib

# Tell Makefile.common that the library itself is being built (it then
# includes Makefile.defaults from the current directory).
COMPILELIB := 1

SHELL = /bin/sh

CUDA_SRC_DIR = ../cuda
CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake
CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) )
include $(CUDA_TEMP)

# Every *_kernel.cu source yields one *_cu.o object in this directory.
CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu)
CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o)
CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO))
CUDA_DEP = $(CUDA_OBJ:.o=.d)
NVCC_FLAGS :=
VPATH = $(CUDA_SRC_DIR)

#rewriting default settings if new ones are specified
ifdef precision
tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults)
endif
ifdef arch
tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults)
endif
ifdef cufft
tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults)
endif
ifdef dbg
tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults)
endif
ifdef prec_timer
tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults)
endif

include Makefile.common

# Regenerate lines 2-3 of Makefile.lammps with the flags chosen above so the
# LAMMPS src build imports settings consistent with this library build.
tmp := $(shell sed -i '2 d' Makefile.lammps)
tmp := $(shell sed -i '2 d' Makefile.lammps)
tmp := $(shell sed -i '1a CUDA_FLAGS := ${CUDA_FLAGS}' Makefile.lammps)
tmp := $(shell sed -i '2a CUDA_USRLIB_CONDITIONAL := ${CUDA_USRLIB_CONDITIONAL}' Makefile.lammps)

# verbose nvcc output during compilation
ifeq ($(verbose), 1)
VERBOSE :=
NVCC_FLAGS += --ptxas-options=-v
else
VERBOSE := @
endif

# keep temporary compilation files of nvcc
ifeq ($(keep), 1)
NVCC_FLAGS += -keep -Xptxas="--verbose"
endif

NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
CUDA_INCLUDES = -I./ -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA
CUDA_USRLIB =

# 'lib' and 'clean' are commands, not files: mark them phony so a stray file
# of the same name cannot shadow them.
.PHONY: lib clean

# Link target
lib: $(CUDA_OBJ)
	$(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a

# -f: do not fail when there is nothing to remove yet
clean:
	rm -f $(CUDA_SRC_DIR)/*.o
	rm -f liblammpscuda.a

# Cuda compilation rules
%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h
	$(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) $(SMVERSIONFLAGS) -o $@ -c $<

View File

@ -1,19 +0,0 @@
# Default build settings for the USER-CUDA library.  The assignment lines below
# are rewritten in place by sed commands in Makefile.cudalib when the user
# passes precision=/arch=/cufft=/dbg=/prec_timer= on the make command line, so
# keep their exact "name ?= value" layout.
#precision setting: 1 single, 2 double, 3 positions in double, 4 positions and velocities in double
precision ?= 2
#verbose setting: 0 no, 1 yes
verbose ?= 1
#GPU architecture (compute capability): 13, 20, 21, 30, 35
arch ?= 21
#Using cufft (should not be changed)
cufft ?= 1
#Using dbg mode
dbg ?= 0
#On mac machines set this to 0 in order to avoid usage of linux specific precision timer
prec_timer ?= 1

View File

@ -1,7 +0,0 @@
# Settings that the LAMMPS build will import when this package library is used
CUDA_FLAGS := -I/usr/local/cuda/include -DUNIX -DFFT_CUFFT -DCUDA_PRECISION=2 -DCUDA_ARCH=20
CUDA_USRLIB_CONDITIONAL := -L/usr/local/cuda/lib -L/usr/local/cuda/lib64
# NOTE: the two lines above are deleted and regenerated via sed by
# Makefile.cudalib on every library build; do not edit them by hand and do not
# insert lines before them (the sed scripts address them by line number).
user-cuda_SYSINC = ${CUDA_FLAGS}
user-cuda_SYSLIB = -lcufft -lcuda -lcudart
user-cuda_SYSPATH = $(CUDA_USRLIB_CONDITIONAL)

View File

@ -1,59 +0,0 @@
USER-CUDA library
Christian Trott, crtrott at sandia.gov
-------------------------------------------------------------------
This directory has source files to build a library that LAMMPS links
against when using the USER-CUDA package.
This library must be built before LAMMPS is built, so LAMMPS can link
against it. The build process also writes settings into the
Makefile.lammps file which are used when files in the src/USER-CUDA
package are compiled.
Thus if you re-build this library (e.g. for a different precision),
you MUST re-compile the src/USER-CUDA files as well. You can force
this to happen by uninstalling, then re-installing the USER-CUDA
package (make no-user-cuda; make yes-user-cuda) before doing
a LAMMPS build.
Build this library in two steps. First type:
make OPTIONS
where OPTIONS is one or more of the following settings:
precision=N to set the precision level
N = 1 for single precision
N = 2 for double precision (default)
N = 3 for positions in double precision
N = 4 for positions and velocities in double precision
arch=M to set GPU compute capability
M = 20 for CC2.0 (GF100/110, e.g. C2050,GTX580,GTX470)
M = 21 for CC2.1 (GF104/114, e.g. GTX560, GTX460, GTX450) (default)
M = 13 for CC1.3 (GF200, e.g. C1060, GTX285)
prec_timer=0/1 to use hi-precision timers
0 = do not use them
1 = use these timers (default)
this is usually only useful for Mac machines
dbg=0/1 to activate debug mode
0 = no debug mode (default)
1 = yes debug mode
this is only useful for developers
cufft=0/1 to determine usage of the CUDA FFT library
0 = no CUFFT support
1 = use CUFFT (default)
in the future other CUDA-enabled FFT libraries might be supported
This will write settings to the Makefile.defaults file.
Then type "make" with no arguments to build the library with the
new settings.
After the second make, two files should exist in this directory:
liblammpscuda.a the library LAMMPS will link against
Makefile.lammps settings the LAMMPS Makefile will import
Makefile.lammps is created by the make command and will have settings
consistent with the OPTIONS you selected. It is used by the LAMMPS
build, both for compile-time and link-time settings.

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Per-atom property masks for the "angle" atom style.  Each wrapper below
// instantiates a generic Cuda_AtomVecCuda_* template with the mask naming the
// properties that travel with an atom for that communication step.
const unsigned int ANGLE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
// Exchange (atom migration between ranks): full state except forces.
const unsigned int ANGLE_EXCHANGE_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
// Border (ghost atoms): identity + position; the _VEL variant adds velocities.
const unsigned int ANGLE_BORDER_MASK = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
const unsigned int ANGLE_BORDER_VEL_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;

#include "atom_vec_angle_cuda_cu.h"

// Publish device symbols / box geometry for this atom style.
void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata)
{
  return Cuda_AtomVecCuda_Init<ANGLE_DATA_MASK>(sdata);
}

int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  return Cuda_AtomVecCuda_PackExchangeList<ANGLE_EXCHANGE_MASK>(sdata, n, dim, buf_send);
}

int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_PackExchange<ANGLE_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_UnpackExchange<ANGLE_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<ANGLE_BORDER_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

// The "Vel" variants reuse the plain border templates with V_MASK added.
int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<ANGLE_BORDER_VEL_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<ANGLE_BORDER_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<ANGLE_BORDER_VEL_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<ANGLE_BORDER_MASK>(sdata, n, first, buf_recv);
}

int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<ANGLE_BORDER_VEL_MASK>(sdata, n, first, buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_
#define ATOM_VEC_ANGLE_CUDA_CU_H_
// C-linkage entry points for the "angle" atom style, implemented in
// atom_vec_angle_cuda.cu as wrappers around the generic Cuda_AtomVecCuda_*
// templates.  Exchange* routines move whole atoms between processors,
// Border* routines copy ghost-atom data, and the *_Self variants handle
// communication within the same processor.
extern "C" void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
#endif /*ATOM_VEC_ANGLE_CUDA_CU_H_*/

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Per-atom property masks for the "atomic" atom style.  Each wrapper below
// instantiates a generic Cuda_AtomVecCuda_* template with the mask naming the
// properties that travel with an atom for that communication step.
const unsigned int ATOMIC_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
// Exchange (atom migration between ranks): full state except forces.
const unsigned int ATOMIC_EXCHANGE_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
// Border (ghost atoms): identity + position; the _VEL variant adds velocities.
const unsigned int ATOMIC_BORDER_MASK = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
const unsigned int ATOMIC_BORDER_VEL_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;

#include "atom_vec_atomic_cuda_cu.h"

// Publish device symbols / box geometry for this atom style.
void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata)
{
  return Cuda_AtomVecCuda_Init<ATOMIC_DATA_MASK>(sdata);
}

int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  return Cuda_AtomVecCuda_PackExchangeList<ATOMIC_EXCHANGE_MASK>(sdata, n, dim, buf_send);
}

int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_PackExchange<ATOMIC_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_UnpackExchange<ATOMIC_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<ATOMIC_BORDER_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

// The "Vel" variants reuse the plain border templates with V_MASK added.
int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<ATOMIC_BORDER_VEL_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<ATOMIC_BORDER_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<ATOMIC_BORDER_VEL_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<ATOMIC_BORDER_MASK>(sdata, n, first, buf_recv);
}

int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<ATOMIC_BORDER_VEL_MASK>(sdata, n, first, buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_
#define ATOM_VEC_ATOMIC_CUDA_CU_H_
// C-linkage entry points for the "atomic" atom style, implemented in
// atom_vec_atomic_cuda.cu as wrappers around the generic Cuda_AtomVecCuda_*
// templates.  Exchange* routines move whole atoms between processors,
// Border* routines copy ghost-atom data, and the *_Self variants handle
// communication within the same processor.
extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
extern "C" int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
#endif /*ATOM_VEC_ATOMIC_CUDA_CU_H_*/

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Per-atom property masks for the "charge" atom style.  Each wrapper below
// instantiates a generic Cuda_AtomVecCuda_* template with the mask naming the
// properties that travel with an atom for that communication step.
const unsigned int CHARGE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
// Exchange (atom migration between ranks): full state except forces.
const unsigned int CHARGE_EXCHANGE_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
// Border (ghost atoms): identity + position + charge; _VEL adds velocities.
const unsigned int CHARGE_BORDER_MASK = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
const unsigned int CHARGE_BORDER_VEL_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;

#include "atom_vec_charge_cuda_cu.h"

// Publish device symbols / box geometry for this atom style.
void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata)
{
  return Cuda_AtomVecCuda_Init<CHARGE_DATA_MASK>(sdata);
}

int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  return Cuda_AtomVecCuda_PackExchangeList<CHARGE_EXCHANGE_MASK>(sdata, n, dim, buf_send);
}

int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_PackExchange<CHARGE_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_UnpackExchange<CHARGE_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<CHARGE_BORDER_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

// The "Vel" variants reuse the plain border templates with V_MASK added.
int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<CHARGE_BORDER_VEL_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<CHARGE_BORDER_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<CHARGE_BORDER_VEL_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<CHARGE_BORDER_MASK>(sdata, n, first, buf_recv);
}

int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<CHARGE_BORDER_VEL_MASK>(sdata, n, first, buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_
#define ATOM_VEC_CHARGE_CUDA_CU_H_
// C-linkage entry points for the "charge" atom style, implemented in
// atom_vec_charge_cuda.cu as wrappers around the generic Cuda_AtomVecCuda_*
// templates.  Exchange* routines move whole atoms between processors,
// Border* routines copy ghost-atom data, and the *_Self variants handle
// communication within the same processor.
extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
#endif /*ATOM_VEC_CHARGE_CUDA_CU_H_*/

View File

@ -1,628 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX atom_vec_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "cuda_wrapper_cu.h"
#include "crm_cuda_utils.cu"
#include "atom_vec_cuda_kernel.cu"
// Count the number of per-atom scalar values selected by data_mask; used to
// size communication buffers (each item occupies one buffer slot).
int AtomVecCuda_CountDataItems(unsigned int data_mask)
{
  int n = 0;
  if(data_mask & X_MASK) n += 3;       // position vector
  if(data_mask & V_MASK) n += 3;       // velocity vector
  if(data_mask & F_MASK) n += 3;       // force vector
  if(data_mask & TAG_MASK) n++;        // global atom id
  if(data_mask & TYPE_MASK) n++;       // atom type
  if(data_mask & MASK_MASK) n++;       // group mask
  if(data_mask & IMAGE_MASK) n++;      // periodic image flags
  if(data_mask & Q_MASK) n++;          // charge
  if(data_mask & MOLECULE_MASK) n++;   // molecule id
  if(data_mask & RMASS_MASK) n++;      // per-atom mass
  if(data_mask & RADIUS_MASK) n++;     // particle radius
  if(data_mask & DENSITY_MASK) n++;    // particle density
  if(data_mask & OMEGA_MASK) n += 3;   // angular velocity vector
  // NOTE(review): torque only counts as one item here, while omega counts as
  // three -- verify against the pack/unpack kernels whether this should be 3.
  if(data_mask & TORQUE_MASK) n++;
  //if(data_mask & NSPECIAL_MASK) n+=3;
  return n;
}
// Ensure the shared device scratch buffer holds at least 'size' bytes, then
// publish the (possibly reallocated) buffer pointer to this compilation
// unit's device symbol.
void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
{
  if(sdata->buffersize < size) {
    // NOTE(review): message says "kB" but buffersize appears to be in bytes.
    MYDBG(printf("Cuda_AtomVecCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;   // signal other modules that the buffer pointer changed
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }
  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Re-publish per-atom array base pointers and counts to this module's device
// symbols.  Called whenever the atom arrays may have been reallocated (nmax
// grew).  The core arrays (x, v, f, tag, type, mask, image) are always
// copied; optional arrays only when selected by the compile-time data_mask.
// NOTE(review): several sizeof(...) third arguments below name a pointer type
// that may not match the array's element type (e.g. sizeof(int*) for radius/
// density/rmass/omega) -- harmless since only a pointer value is copied and
// all object pointers have the same size, but worth confirming.
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal  , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax    , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(tag)     , & sdata->atom.tag  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type)    , & sdata->atom.type .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(image)   , & sdata->atom.image.dev_data, sizeof(int*));
  if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q)     , & sdata->atom.q    .dev_data, sizeof(F_CFLOAT*));
  if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_AP(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*));
  if(data_mask & RADIUS_MASK) cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius.dev_data, sizeof(int*));
  if(data_mask & DENSITY_MASK) cudaMemcpyToSymbol(MY_AP(density) , & sdata->atom.density.dev_data, sizeof(int*));
  if(data_mask & RMASS_MASK) cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*));
  if(data_mask & OMEGA_MASK) cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega.dev_data, sizeof(int*));
  //if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_AP(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) );
  cudaMemcpyToSymbol(MY_AP(flag)    , & sdata->flag, sizeof(int*));
}
// Initialize / refresh this module's device symbols: per-atom array pointers
// (only when reallocated), nlocal, box dimensions and sub-domain bounds, and
// the device-side error-flag pointer.  Synchronizes before returning.
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
{
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n");)
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal  , sizeof(int));
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n");)
  // prd is passed directly (array decays to pointer); sublo/subhi by address
  // -- NOTE(review): assumes domain.sublo/subhi are X_CFLOAT[3]; confirm.
  cudaMemcpyToSymbol(MY_AP(prd)   , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(flag)  , & sdata->flag, sizeof(int*));
  cudaThreadSynchronize();
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n");)
}
// Forward communication: pack the data selected by data_mask for the n atoms
// listed in sendlist[iswap] into a send buffer, applying the periodic shift
// (dx,dy,dz).  With overlap_comm the kernel writes straight into the per-swap
// device send buffer; otherwise it fills the generic device buffer, which is
// then downloaded into host buf_send.  Returns the number of buffer items
// produced (n * items per atom).
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  my_times time1, time2;
  // Refresh device symbols if atom arrays were reallocated / nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal  , sizeof(int));
  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = (n * n_data_items) * sizeof(X_CFLOAT);
  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
  // Periodic image shift applied to positions crossing the boundary.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;
  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include tilt-factor (xy, xz, yz) contributions.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  if(sdata->atom.nlocal > 0) {
    cudaMemset(sdata->flag, 0, sizeof(int));   // clear device-side error flag
    my_gettime(CLOCK_REALTIME, &time1);
    // Destination: per-swap device buffer when comm overlap is on, else the
    // generic device buffer (downloaded to host below).
    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
    Cuda_AtomVecCuda_PackComm_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_pack +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
    if(not sdata->overlap_comm)
      cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
    //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_download +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
    // Check whether the kernel reported a problem via the device flag.
    int aflag;
    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
    if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
  }
  return n_data_items * n;
}
// Forward communication within the same processor: copy the data selected by
// data_mask for the n atoms listed in sendlist[iswap] directly into the
// ghost-atom slots starting at index 'first', applying the periodic shift
// (dx,dy,dz) on the device -- no host buffer is involved.  Returns the number
// of data items processed (n * items per atom).
// (Removed a leftover "static int count" debug counter that was incremented
// but never read.)
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");)
  my_times time1, time2;

  // Refresh device symbols if atom arrays were reallocated / nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal  , sizeof(int));

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = (n * n_data_items) * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // Periodic image shift applied to positions crossing the boundary.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include tilt-factor (xy, xz, yz) contributions.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed");
    Cuda_AtomVecCuda_PackComm_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_self +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel execution failed");
  }

  return n_data_items * n;
}
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
  // Forward-communication unpack for n ghost atoms starting at index 'first'.
  // Unless overlapping communication is active (and iswap >= 0), the host
  // receive buffer is first uploaded into sdata->buffer; the kernel then
  // scatters the masked per-atom fields out of the device buffer.
  my_times time1, time2;

  // Refresh device-side nmax/nlocal copies if the host values changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // Grow the shared device buffer if this transfer would not fit.
  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = (n * n_data_items) * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);

    // With overlapping comm the data already sits in a per-swap device
    // buffer, so the host-to-device upload is skipped.
    if(not sdata->overlap_comm || iswap < 0)
      cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_upload +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    // Select the device-side source: per-swap receive buffer when
    // overlapping comm is on, shared staging buffer otherwise.
    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
    Cuda_AtomVecCuda_UnpackComm_Kernel<data_mask> <<< grid, threads, 0>>>(n, first, buf);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_kernel_unpack +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed");
  }
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  // Build the list of local atoms that left this sub-domain along dimension
  // 'dim'.  The kernel stores the count (as int) in the first buffer slot and
  // the leaving atoms' indices (as doubles) after it; at most n-1 indices are
  // recorded.  Returns the number of leaving atoms found.
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... start dim %i \n", dim);)
  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: pre Kernel execution failed");
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  Cuda_AtomVecCuda_Init<data_mask>(sdata);

  int size = n * sizeof(double);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // Slot 0 doubles as the atomic counter of leaving atoms; clear it.
  cudaMemset((int*)(sdata->buffer), 0, sizeof(int));

  int3 layout = getgrid(sdata->atom.nlocal, sizeof(int), 256, true);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);

  // Shared memory: one int per thread plus one slot for the block's offset.
  Cuda_AtomVecCuda_PackExchangeList_Kernel <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (n - 1, dim);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed");

  my_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.comm_exchange_kernel_pack +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

  // Fetch the counter first (int stored in the first double-sized slot),
  // then -- if anything left -- the index list itself.
  cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost);
  int return_value = ((int*) buf_send)[0];

  if(n > 1 + return_value)
    cudaMemcpy(buf_send, sdata->buffer, (1 + return_value)*sizeof(double), cudaMemcpyDeviceToHost);

  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed");

  my_gettime(CLOCK_REALTIME, &time1);
  sdata->cuda_timings.comm_exchange_download +=
    time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n");)
  return return_value;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  // Pack the data of the nsend leaving atoms into buf_send and let the kernel
  // compact the local arrays by copying atoms from copylist into the holes.
  // Buffer layout (doubles): item m of entry k at buf[1 + k + m*nsend].
  // NOTE(review): the kernel reads each entry's atom index from the buffer
  // slot m == 0, presumably left there by the preceding PackExchangeList
  // call -- confirm the buffer is not reallocated in between.
  // Returns the number of doubles packed.
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... start \n");)

  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // +1: each atom carries its own index in addition to the masked fields.
  int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1;
  int size = (nsend * n_data_items + 1) * sizeof(double);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  cudaMemset((int*)(sdata->buffer), 0, sizeof(int));

  int3 layout = getgrid(nsend, 0);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);

  Cuda_AtomVecCuda_PackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(nsend, (int*) copylist);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed");

  my_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.comm_exchange_kernel_pack +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

  // Download the whole packed block to the host send buffer.
  cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);

  my_gettime(CLOCK_REALTIME, &time1);
  sdata->cuda_timings.comm_exchange_download +=
    time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... done\n");)
  return nsend * n_data_items + 1;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  // Unpack atoms received during an exchange: upload buf_send, then let the
  // kernel append the atoms that fall inside this sub-domain and record each
  // entry's destination index in copylist (-1 when rejected).  Returns the
  // number of accepted atoms, read back from the device flag counter.
  Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // +1: each packed atom carries its own index in addition to its fields.
  int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1;
  int size = (nsend * n_data_items + 1) * sizeof(double);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // _flag serves as the atomic counter of accepted atoms; publish and clear.
  cudaMemcpyToSymbol(MY_AP(flag) , & sdata->flag, sizeof(int*));
  cudaMemset((int*)(sdata->flag), 0, sizeof(int));

  if(nsend) {
    int3 layout = getgrid(nsend, 0);
    dim3 threads(layout.z, 1, 1);
    dim3 grid(layout.x, layout.y, 1);

    if(sdata->atom.nlocal > 0) {
      my_times time1, time2;
      my_gettime(CLOCK_REALTIME, &time1);

      cudaMemcpy(sdata->buffer, buf_send , size, cudaMemcpyHostToDevice);

      my_gettime(CLOCK_REALTIME, &time2);
      sdata->cuda_timings.comm_exchange_upload +=
        time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

      Cuda_AtomVecCuda_UnpackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(sdata->exchange_dim, nsend, (int*) copylist);
      cudaThreadSynchronize();

      my_gettime(CLOCK_REALTIME, &time1);
      sdata->cuda_timings.comm_exchange_kernel_unpack +=
        time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
      CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed");
    }
  }

  // Number of atoms the kernel actually accepted (0 if nothing ran).
  int naccept;
  cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
  return naccept;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  // Pack border-communication data for the nsend send-list atoms of swap
  // 'iswap' on the device and download the result into buf_send.
  // Returns the number of data items packed.
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  // Refresh device-side nmax/nlocal copies if the host values changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = nsend * n_data_items * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // Periodic displacement for positions.  NOTE(review): in the triclinic
  // branch pbc[] is used directly as the displacement (no prd/tilt factors)
  // -- presumably the caller supplies actual distances for triclinic border
  // comm; confirm against the calling comm code.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      dx = pbc[0];
      dy = pbc[1];
      dz = pbc[2];
    }
  }

  int3 layout = getgrid(nsend);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_times time1, time2;
    my_gettime(CLOCK_REALTIME, &time1);

    Cuda_AtomVecCuda_PackBorder_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, nsend, sdata->comm.maxlistlength, iswap, dx, dy, dz);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_border_kernel_pack +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    // Download the packed block to the host send buffer.
    cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed");

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_border_download +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
  }

  return nsend * n_data_items;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  // Border pack for a swap with self: copy the masked per-atom fields of the
  // n send-list atoms of swap 'iswap' straight into the ghost slots starting
  // at 'first', shifting positions by the periodic displacement if requested.
  // Returns the number of data items handled.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // Make sure the shared device buffer would be large enough.
  const int items_per_atom = AtomVecCuda_CountDataItems(data_mask);
  const int required_bytes = n * items_per_atom * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (required_bytes > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, required_bytes);

  // Periodic shift applied to positions; stays zero when pbc_flag == 0.
  X_CFLOAT shift_x = 0.0, shift_y = 0.0, shift_z = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      shift_x = pbc[0] * sdata->domain.prd[0];
      shift_y = pbc[1] * sdata->domain.prd[1];
      shift_z = pbc[2] * sdata->domain.prd[2];
    } else {
      shift_x = pbc[0];
      shift_y = pbc[1];
      shift_z = pbc[2];
    }
  }

  const int3 layout = getgrid(n);
  const dim3 block_dim(layout.z, 1, 1);
  const dim3 grid_dim(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_times t_begin, t_end;
    my_gettime(CLOCK_REALTIME, &t_begin);

    Cuda_AtomVecCuda_PackBorder_Self_Kernel<data_mask> <<< grid_dim, block_dim, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, shift_x, shift_y, shift_z, first);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &t_end);
    sdata->cuda_timings.comm_border_kernel_self +=
      t_end.tv_sec - t_begin.tv_sec + 1.0 * (t_end.tv_nsec - t_begin.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed");
  }

  return n * items_per_atom;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  // Upload received border data and scatter it into ghost slots starting at
  // 'first'.  Returns the device-set grow flag (non-zero when a destination
  // slot would exceed nmax, i.e. the host must enlarge the per-atom arrays).
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  // Refresh device-side nmax/nlocal copies if the host values changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = n * n_data_items * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_times time1, time2;
    my_gettime(CLOCK_REALTIME, &time1);

    // Clear the overflow flag, then upload the packed host data.
    cudaMemset((int*)(sdata->flag), 0, sizeof(int));
    cudaMemcpy(sdata->buffer, (void*)buf_recv, size, cudaMemcpyHostToDevice);

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_border_upload +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    Cuda_AtomVecCuda_UnpackBorder_Kernel<data_mask> <<< grid, threads, 0>>>(n, first);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_border_kernel_unpack +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

    // Read back the device flag set by the kernel on slot overflow.
    cudaMemcpy(&sdata->comm.grow_flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed");
  }

  return sdata->comm.grow_flag;
}
#include "atom_vec_angle_cuda.cu"
#include "atom_vec_atomic_cuda.cu"
#include "atom_vec_charge_cuda.cu"
#include "atom_vec_full_cuda.cu"
//#include "atom_vec_granular_cuda.cu"

View File

@ -1,512 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define RIMLARGER 1.000001
#define RIMSMALLER 0.999999
#define SMALL 1e-5
extern __shared__ int shared[];
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
  // Forward-comm pack: thread i gathers the masked fields of send-list atom
  // j = list[i] into 'buffer', field-major (item k of atom i is stored at
  // buffer[i + k*n]).  Positions are shifted by the periodic displacement.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    // Flag an out-of-range send-list entry for the host to report.
    if(j > _nmax) _flag[0] = 1;

    int k = 0;

    if(data_mask & X_MASK) {
      ((X_CFLOAT*) buffer)[i + k * n] = _x[j] + dx;
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy;
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz;
      k++;
    }

    if(data_mask & V_MASK) {
      ((X_CFLOAT*) buffer)[i + k * n] = _v[j];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _v[j + _nmax];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax];
      k++;
    }

    if(data_mask & OMEGA_MASK) {
      ((X_CFLOAT*) buffer)[i + k * n] = _omega[j];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _omega[j + _nmax];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax];
      k++;
    }

    // NOTE(review): the two k++ below execute even when the corresponding
    // mask bit is unset; the unpack kernel increments identically, so the
    // two layouts stay in sync either way.
    if(data_mask & RADIUS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _radius[j];

    k++;

    if(data_mask & RMASS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _rmass[j];

    k++;
  }
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  // Forward-comm copy for a self swap: thread i copies the masked fields of
  // send-list atom list[i] directly into ghost slot i+first (no staging
  // buffer).  Positions get the periodic shift (dx,dy,dz).
  //
  // Fix: removed the dead store `int j = i;` that was immediately
  // overwritten by `j = list[i];`.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];  // source atom index from the swap's send list

    if(data_mask & X_MASK) {
      _x[i + first] = _x[j] + dx;
      _x[i + first + _nmax] = _x[j + _nmax] + dy;
      _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
    }

    if(data_mask & V_MASK) {
      _v[i + first] = _v[j];
      _v[i + first + _nmax] = _v[j + _nmax];
      _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
    }

    if(data_mask & OMEGA_MASK) {
      _omega[i + first] = _omega[j];
      _omega[i + first + _nmax] = _omega[j + _nmax];
      _omega[i + first + 2 * _nmax] = _omega[j + 2 * _nmax];
    }

    if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j];

    if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j];
  }
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n, int first, void* buffer)
{
  // Forward-comm unpack: thread i scatters the masked fields of ghost atom
  // i+first out of 'buffer' (field-major: item k of atom i at buffer[i+k*n]).
  // The k increments mirror Cuda_AtomVecCuda_PackComm_Kernel exactly.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < n) {
    int k = 0;

    if(data_mask & X_MASK) {
      _x[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
    }

    if(data_mask & V_MASK) {
      _v[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _v[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _v[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
    }

    if(data_mask & OMEGA_MASK) {
      _omega[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _omega[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _omega[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
    }

    // NOTE(review): as in the pack kernel, these k++ run unconditionally so
    // the two buffer layouts stay in sync even when a mask bit is unset.
    if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) buffer)[i + k * n];

    k++;

    if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) buffer)[i + k * n];

    k++;
  }
}
__global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n, int dim)
{
  // Identify local atoms whose coordinate along 'dim' lies outside this
  // sub-domain.  _buffer[0] (read as int) accumulates the global count via
  // atomicAdd; the leaving atoms' indices are stored (as doubles) from
  // slot 1 on.  At most n indices are written.
  double* buf = (double*) _buffer;
  buf = &buf[1];
  //X_CFLOAT lo=slablo[iswap];
  //X_CFLOAT hi=slabhi[iswap];
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  bool add = false;

  if(i < _nlocal) {
    double xdim_tmp = static_cast <double>(_x[i + dim * _nmax]);

    // Atom leaves when it is below sublo or at/above subhi along 'dim'.
    if(xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim]) {
      add = true;
    }
  }

  // Block-local prefix count in shared memory: thread 0 serially assigns
  // each flagged thread its 1-based rank within the block, then reserves a
  // contiguous range in the global counter; shared[blockDim.x] holds the
  // block's base offset returned by atomicAdd.
  shared[threadIdx.x] = add ? 1 : 0;
  __syncthreads();
  int nsend = 0;

  if(threadIdx.x == 0) {
    for(int k = 0; k < blockDim.x; k++) {
      if(shared[k]) {
        nsend++;
        shared[k] = nsend;
      }
    }

    shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
  }

  __syncthreads();
  // Global (0-based) output slot for this thread's atom.
  nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

  // Only write while the slot still fits into the n-entry index area.
  if(add && nsend + 1 < n)
    buf[nsend] = i;
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist)
{
  // Thread k packs leaving atom i (whose index is pre-stored in this entry's
  // slot 0) into the exchange buffer, then fills atom i's place in the local
  // arrays with atom j = copylist[k].  Layout: item m of entry k lives at
  // ((double*)_buffer)[1 + k + m*nsend]; m == 0 is the atom index itself.
  double* buf = (double*) _buffer;
  int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(k >= nsend) return;

  buf = &buf[1 + k];
  int i = static_cast <int>(buf[0]);  // leaving atom's index (pre-stored)
  int j = copylist[k];                // atom moved in to fill the hole
  int m = 1;

  if(data_mask & X_MASK) {
    buf[(m++)*nsend] = static_cast <double>(_x[i]);
    buf[(m++)*nsend] = static_cast <double>(_x[i + _nmax]);
    buf[(m++)*nsend] = static_cast <double>(_x[i + 2 * _nmax]);
  }

  if(data_mask & V_MASK) {
    buf[(m++)*nsend] = _v[i];
    buf[(m++)*nsend] = _v[i + _nmax];
    buf[(m++)*nsend] = _v[i + 2 * _nmax];
  }

  if(data_mask & TAG_MASK) buf[(m++)*nsend] = _tag[i];

  if(data_mask & TYPE_MASK) buf[(m++)*nsend] = _type[i];

  if(data_mask & MASK_MASK) buf[(m++)*nsend] = _mask[i];

  if(data_mask & IMAGE_MASK) buf[(m++)*nsend] = _image[i];

  if(data_mask & Q_MASK) buf[(m++)*nsend] = _q[i];

  if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i];

  if(data_mask & RADIUS_MASK) buf[(m++)*nsend] = _radius[i];

  if(data_mask & DENSITY_MASK) buf[(m++)*nsend] = _density[i];

  if(data_mask & RMASS_MASK) buf[(m++)*nsend] = _rmass[i];

  if(data_mask & OMEGA_MASK) {
    buf[(m++)*nsend] = _omega[i];
    buf[(m++)*nsend] = _omega[i + _nmax];
    buf[(m++)*nsend] = _omega[i + 2 * _nmax];
  }

  /*  if(data_mask & NSPECIAL_MASK)
    {
      buf[(m++)*nsend] = _nspecial[i];
      buf[(m++)*nsend] = _nspecial[i+_nmax];
      buf[(m++)*nsend] = _nspecial[i+2* _nmax];
    }*/

  // Holes at or beyond nlocal need no compaction copy.
  if(i >= _nlocal) return;

  if(data_mask & X_MASK) {
    _x[i] = _x[j];
    _x[i + _nmax] = _x[j + _nmax];
    _x[i + 2 * _nmax] = _x[j + 2 * _nmax];
  }

  if(data_mask & V_MASK) {
    _v[i] = _v[j];
    _v[i + _nmax] = _v[j + _nmax];
    _v[i + 2 * _nmax] = _v[j + 2 * _nmax];
  }

  if(data_mask & TAG_MASK) _tag[i] = _tag[j];

  if(data_mask & TYPE_MASK) _type[i] = _type[j];

  if(data_mask & MASK_MASK) _mask[i] = _mask[j];

  if(data_mask & IMAGE_MASK) _image[i] = _image[j];

  if(data_mask & Q_MASK) _q[i] = _q[j];

  if(data_mask & MOLECULE_MASK) _molecule[i] = _molecule[j];

  if(data_mask & RADIUS_MASK) _radius[i] = _radius[j];

  if(data_mask & DENSITY_MASK) _density[i] = _density[j];

  if(data_mask & RMASS_MASK) _rmass[i] = _rmass[j];

  if(data_mask & OMEGA_MASK) {
    _omega[i] = _omega[j];
    _omega[i + _nmax] = _omega[j + _nmax];
    _omega[i + 2 * _nmax] = _omega[j + 2 * _nmax];
  }

  /*  if(data_mask & NSPECIAL_MASK)
    {
      _nspecial[i] = _nspecial[j];
      _nspecial[i+_nmax] = _nspecial[j+_nmax];
      _nspecial[i+2* _nmax] = _nspecial[j+2* _nmax];
    }*/
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim, int nsend, int* copylist)
{
  // Thread k inspects received atom k (layout: item m of entry k at
  // ((double*)_buffer)[1 + k + m*nsend]).  If its coordinate along 'dim'
  // lies inside this sub-domain (with SMALL tolerance), it atomically claims
  // a new local slot via _flag and unpacks all masked fields there.
  // copylist[k] records the destination index, or -1 if rejected.
  double* buf = (double*) _buffer;
  int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(k >= nsend) return;

  buf = &buf[1 + k];
  int i = -1;  // destination index; stays -1 when the atom is rejected
  // Item 1 holds the first position component; 1+dim selects the split axis.
  double xdim_tmp = buf[(1 + dim) * nsend];

  if(xdim_tmp >= _sublo[dim] - SMALL && xdim_tmp < _subhi[dim] + SMALL) {
    // _flag doubles as the atomic counter of accepted atoms.
    i = atomicAdd(_flag, 1) + _nlocal;

    int m = 1;

    if(data_mask & X_MASK) {
      _x[i] = buf[(m++) * nsend];
      _x[i + _nmax] = buf[(m++) * nsend];
      _x[i + 2 * _nmax] = buf[(m++) * nsend];
    }

    if(data_mask & V_MASK) {
      _v[i] = buf[(m++) * nsend];
      _v[i + _nmax] = buf[(m++) * nsend];
      _v[i + 2 * _nmax] = buf[(m++) * nsend];
    }

    if(data_mask & TAG_MASK) _tag[i] = buf[(m++) * nsend];

    if(data_mask & TYPE_MASK) _type[i] = buf[(m++) * nsend];

    if(data_mask & MASK_MASK) _mask[i] = buf[(m++) * nsend];

    if(data_mask & IMAGE_MASK) _image[i] = buf[(m++) * nsend];

    if(data_mask & Q_MASK) _q[i] = buf[(m++) * nsend];

    if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++) * nsend];

    if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++) * nsend];

    if(data_mask & DENSITY_MASK) _density[i] = buf[(m++) * nsend];

    if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++) * nsend];

    if(data_mask & OMEGA_MASK) {
      _omega[i] = buf[(m++) * nsend];
      _omega[i + _nmax] = buf[(m++) * nsend];
      _omega[i + 2 * _nmax] = buf[(m++) * nsend];
    }

    /*  if(data_mask & NSPECIAL_MASK)
      {
        _nspecial[i] = buf[(m++)*nsend];
        _nspecial[i+_nmax] = buf[(m++)*nsend];
        _nspecial[i+2*_nmax] = buf[(m++)*nsend];
      }*/
  }

  copylist[k] = i;
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz)
{
  // Border pack: thread i gathers the masked fields of send-list atom
  // j = list[i] into the shared device buffer, field-major (item m of atom i
  // at _buffer[i + m*n]).  Positions are shifted by the periodic
  // displacement (dx,dy,dz).
  //
  // BUGFIX: radius, density, rmass and omega were previously read with index
  // i (the slot in the send list) instead of j (the actual atom) -- unlike
  // every other field in this kernel and unlike the matching
  // Cuda_AtomVecCuda_PackBorder_Self_Kernel, which reads them via j.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];
    int m = 0;

    if(data_mask & X_MASK) {
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx;
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy;
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz;
    }

    if(data_mask & V_MASK) {
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax];
    }

    if(data_mask & TAG_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _tag[j];

    if(data_mask & TYPE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _type[j];

    if(data_mask & MASK_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _mask[j];

    if(data_mask & Q_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _q[j];

    if(data_mask & MOLECULE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _molecule[j];

    if(data_mask & RADIUS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _radius[j];

    if(data_mask & DENSITY_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _density[j];

    if(data_mask & RMASS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _rmass[j];

    if(data_mask & OMEGA_MASK) {
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[j];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[j + _nmax];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[j + 2 * _nmax];
    }
  }
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  // Border copy for a self swap: thread i copies the masked fields of
  // send-list atom j = list[i] directly into ghost slot i+first (no staging
  // buffer).  Positions are shifted by the periodic displacement (dx,dy,dz).
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    if(data_mask & X_MASK) {
      _x[i + first] = _x[j] + dx;
      _x[i + first + _nmax] = _x[j + _nmax] + dy;
      _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
    }

    if(data_mask & V_MASK) {
      _v[i + first] = _v[j];
      _v[i + first + _nmax] = _v[j + _nmax];
      _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
    }

    if(data_mask & TAG_MASK) _tag[i + first] = _tag[j];

    if(data_mask & TYPE_MASK) _type[i + first] = _type[j];

    if(data_mask & MASK_MASK) _mask[i + first] = _mask[j];

    if(data_mask & Q_MASK) _q[i + first] = _q[j];

    if(data_mask & MOLECULE_MASK) _molecule[i + first] = _molecule[j];

    if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j];

    if(data_mask & DENSITY_MASK) _density[i + first] = _density[j];

    if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j];

    if(data_mask & OMEGA_MASK) {
      _omega[i + first] = _omega[j];
      _omega[i + first + _nmax] = _omega[j + _nmax];
      _omega[i + first + 2 * _nmax] = _omega[j + 2 * _nmax];
    }
  }
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n, int first)
{
  // Border unpack: thread i scatters the masked fields of received atom i
  // from the device buffer (field-major: item m at _buffer[i + m*n]) into
  // ghost slot i+first.  If the slot would exceed _nmax, the overflow flag
  // is raised instead so the host can grow the per-atom arrays.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < n) {
    if(i + first < _nmax) {
      int m = 0;

      if(data_mask & X_MASK) {
        _x[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _x[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _x[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
      }

      if(data_mask & V_MASK) {
        _v[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _v[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _v[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
      }

      // Integer-valued fields travel as X_CFLOAT and are cast back here.
      if(data_mask & TAG_MASK) _tag[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);

      if(data_mask & TYPE_MASK) _type[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);

      if(data_mask & MASK_MASK) _mask[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);

      if(data_mask & Q_MASK) _q[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];

      if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);

      if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];

      if(data_mask & DENSITY_MASK) _density[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];

      if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];

      if(data_mask & OMEGA_MASK) {
        _omega[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _omega[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _omega[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
      }
    } else {
      // Destination slot out of range: signal the host to grow arrays.
      _flag[0] = 1;
    }
  }
}

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int FULL_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
#include "atom_vec_full_cuda_cu.h"
void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata)
{
  // Initialize device-side state for atom_style "full" with its full mask.
  Cuda_AtomVecCuda_Init<FULL_DATA_MASK>(sdata);
}
int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  // Exchange-list scan for atom_style "full": every field this style
  // exchanges (x, v, tag, type, mask, image, q, molecule).
  return Cuda_AtomVecCuda_PackExchangeList < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, dim, buf_send);
}
int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  // Exchange pack for atom_style "full" (same mask as the exchange-list scan).
  return Cuda_AtomVecCuda_PackExchange < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK > (sdata, nsend, buf_send, copylist);
}
int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  // Exchange unpack for atom_style "full" (mask mirrors the pack side).
  return Cuda_AtomVecCuda_UnpackExchange < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK > (sdata, nsend, buf_send, copylist);
}
int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  // Border pack for atom_style "full": positions plus identity, charge and
  // molecule id -- no velocities.
  return Cuda_AtomVecCuda_PackBorder < X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  // Border pack variant that also communicates velocities.
  return Cuda_AtomVecCuda_PackBorder < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  // Self-swap border pack for atom_style "full" (no velocities).
  return Cuda_AtomVecCuda_PackBorder_Self < X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, iswap, first, pbc, pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  // Self-swap border pack variant that also copies velocities.
  return Cuda_AtomVecCuda_PackBorder_Self < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, iswap, first, pbc, pbc_flag);
}
int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  // Border unpack for atom_style "full" (mask mirrors PackBorder).
  return Cuda_AtomVecCuda_UnpackBorder < X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, first, buf_recv);
}
int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  // Border unpack variant that also restores velocities (mirrors PackBorderVel).
  return Cuda_AtomVecCuda_UnpackBorder < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, first, buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_FULL_CUDA_CU_H_
#define ATOM_VEC_FULL_CUDA_CU_H_

// C-linkage entry points for the atom_style "full" CUDA communication
// routines (thin wrappers around the data_mask-templated Cuda_AtomVecCuda_*
// functions, implemented in atom_vec_full_cuda.cu).

extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
extern "C" int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
#endif /*ATOM_VEC_FULL_CUDA_CU_H_*/

View File

@ -1,539 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX comm_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "comm_cuda_cu.h"
#include "comm_cuda_kernel.cu"
#include <ctime>
// Grow the shared device staging buffer so it can hold n atoms * 3 X_CFLOAT
// values, then publish the (possibly new) pointer to this module's
// __constant__ symbol MY_AP(buffer).  Only grows, never shrinks.
void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata, int n)
{
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffersize < size) {
// NOTE(review): debug message says "kB" but buffersize is in bytes.
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// buffer_new signals other modules that the buffer pointer changed and
// their __constant__ copies must be refreshed.
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Refresh this module's __constant__ copies of the per-atom array pointers
// and sizes.  Must be called whenever the atom arrays were reallocated on
// the device (sdata->atom.update_nmax is the caller-side trigger).
void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
}
// One-time initialization of the comm module's device constants:
// atom-array pointers plus type count, box dimensions and the shared
// flag/debug scratch pointers.
void Cuda_CommCuda_Init(cuda_shared_data* sdata)
{
Cuda_CommCuda_UpdateNmax(sdata);
// +1 because LAMMPS type indices are 1-based.
int ntypesp = sdata->atom.ntypes + 1;
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &ntypesp, sizeof(int));
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata, sizeof(int*));
}
// Pack the coordinates of the n atoms in swap "iswap" into a forward-comm
// buffer (3 values per atom, struct-of-arrays layout: n x, then n y, then n z).
// pbc/pbc_flag describe the periodic image shift to apply while packing.
// If overlap_comm is enabled the data stays in a per-swap device buffer;
// otherwise it is downloaded to buf_send on the host.
// Returns the number of values packed (3*n).
int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
my_times time1, time2;
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
// Periodic shift added to the packed coordinates; zero when pbc_flag == 0.
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
dx = pbc[0] * sdata->domain.prd[0];
dy = pbc[1] * sdata->domain.prd[1];
dz = pbc[2] * sdata->domain.prd[2];
} else {
// Triclinic box: include the tilt factors xy, xz, yz.
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
dz = pbc[2] * sdata->domain.prd[2];
}
}
// One thread per atom to pack.
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
cudaMemset(sdata->flag, 0, sizeof(int));
my_gettime(CLOCK_REALTIME, &time1);
// With overlapped communication pack straight into the per-swap device
// send buffer; otherwise use the shared staging buffer.
void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
, sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
cudaThreadSynchronize();
my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_kernel_pack +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
// Only download to the host when comm is not overlapped with computation.
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_download +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
// The kernel sets *flag when a sendlist index exceeded nmax.
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
}
return 3 * n;
}
// Pack coordinates AND velocities of the n atoms in swap "iswap" into a
// forward-comm buffer (6 values per atom, struct-of-arrays layout:
// n x, n y, n z, n vx, n vy, n vz).  pbc/pbc_flag describe the periodic
// image shift applied to the coordinates only.
// Returns the number of values packed (6*n).
//
// FIX: this function previously launched Cuda_CommCuda_PackComm_Kernel,
// which packs only the 3 coordinate values, so the velocity half of the
// 6*n buffer was never written.  It now launches the matching
// Cuda_CommCuda_PackCommVel_Kernel.  It also previously requested a
// buffer sized for n*3 values via Cuda_CommCuda_UpdateBuffer(sdata, n);
// passing 2*n makes the allocation cover all 6*n values.
int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  my_times time1, time2;

  // Refresh device constants if atom arrays were reallocated or nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_CommCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int size = n * 6 * sizeof(X_CFLOAT);

  // UpdateBuffer allocates 3 X_CFLOAT per "atom"; request 2*n so the
  // buffer holds the full 6*n values needed here.
  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_CommCuda_UpdateBuffer(sdata, 2 * n);

  // Periodic shift added to the packed coordinates; zero when pbc_flag == 0.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include the tilt factors xy, xz, yz.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  // One thread per atom to pack.
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    cudaMemset(sdata->flag, 0, sizeof(int));
    my_gettime(CLOCK_REALTIME, &time1);
    // With overlapped communication pack straight into the per-swap device
    // send buffer; otherwise use the shared staging buffer.
    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
    Cuda_CommCuda_PackCommVel_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_pack +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel: Kernel execution failed");

    // Only download to the host when comm is not overlapped with computation.
    if(not sdata->overlap_comm)
      cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_download +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

    // The kernel sets *flag when a sendlist index exceeded nmax.
    int aflag;
    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);

    if(aflag != 0) printf("aflag PackComm: %i\n", aflag);

    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel: Kernel execution failed");
  }

  return 6 * n;
}
// On-device forward communication with self (sender == receiver): copy the
// coordinates of the n sendlist atoms of swap "iswap" directly to the ghost
// region starting at index "first", applying the periodic shift given by
// pbc/pbc_flag.  No host buffer is involved.
// Returns the number of values moved (3*n).
//
// FIX: removed a dead "static int count" that was incremented but never read.
int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
  my_times time1, time2;

  // Refresh device constants if atom arrays were reallocated or nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_CommCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int size = n * 3 * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_CommCuda_UpdateBuffer(sdata, n);

  // Periodic shift added to the copied coordinates; zero when pbc_flag == 0.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include the tilt factors xy, xz, yz.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  // One thread per atom to copy.
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);
    Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_self +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
  }

  return 3 * n;
}
// On-device forward communication with self, coordinates + velocities:
// copy the n sendlist atoms of swap "iswap" directly to the ghost region
// starting at "first" (coordinates get the pbc shift; velocities are copied
// unchanged).  Returns the number of values moved (6*n).
//
// FIX: this function previously launched Cuda_CommCuda_PackComm_Self_Kernel,
// which copies only coordinates, so ghost velocities were never updated.
// It now launches the matching Cuda_CommCuda_PackCommVel_Self_Kernel.
// Also requests a 6*n-value buffer (UpdateBuffer sizes 3 values per "atom",
// hence 2*n) and removes a dead "static int count".
int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  MYDBG(printf(" # CUDA: CommCuda_PackCommVel_Self\n");)
  my_times time1, time2;

  // Refresh device constants if atom arrays were reallocated or nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_CommCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int size = n * 6 * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_CommCuda_UpdateBuffer(sdata, 2 * n);

  // Periodic shift added to the copied coordinates; zero when pbc_flag == 0.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include the tilt factors xy, xz, yz.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  // One thread per atom to copy.
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);
    Cuda_CommCuda_PackCommVel_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_self +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel_Self: Kernel execution failed");
  }

  return 6 * n;
}
// Unpack n received ghost-atom coordinates into the x array starting at
// index "first".  With overlap_comm and a valid iswap the data is already
// in a per-swap device buffer; otherwise buf_recv is uploaded from the host
// into the shared staging buffer first.
void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
my_times time1, time2;
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
// One thread per atom to unpack.
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
my_gettime(CLOCK_REALTIME, &time1);
// Upload from host only when not using device-resident overlap buffers
// (iswap < 0 forces the host path).
if(not sdata->overlap_comm || iswap < 0)
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_upload +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
cudaThreadSynchronize();
my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_kernel_unpack +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
}
}
// Unpack n received ghost-atom coordinates AND velocities (6 values per
// atom) into x and v starting at index "first".  With overlap_comm and a
// valid iswap the data is already in a per-swap device buffer; otherwise
// buf_recv is uploaded from the host into the shared staging buffer first.
//
// FIX: this function previously launched Cuda_CommCuda_UnpackComm_Kernel,
// which unpacks only the 3 coordinate values, so ghost velocities were
// never written.  It now launches the matching
// Cuda_CommCuda_UnpackCommVel_Kernel.  It also previously requested a
// buffer sized for n*3 values; passing 2*n to UpdateBuffer covers 6*n.
void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
  my_times time1, time2;

  // Refresh device constants if atom arrays were reallocated or nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_CommCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int size = n * 6 * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_CommCuda_UpdateBuffer(sdata, 2 * n);

  // One thread per atom to unpack.
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);

    // Upload from host only when not using device-resident overlap buffers
    // (iswap < 0 forces the host path).
    if(not sdata->overlap_comm || iswap < 0)
      cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_upload +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
    Cuda_CommCuda_UnpackCommVel_Kernel <<< grid, threads, 0>>>(n, first, buf);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_kernel_unpack +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackCommVel: Kernel execution failed");
  }
}
// Pack the forces of n ghost atoms starting at index "first" into buf_send
// for reverse communication.  The force array is struct-of-arrays with
// stride nmax, so the three components are copied with three separate
// device-to-host memcpys.  Returns the number of values packed (3*n).
int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send)
{
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(F_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
F_CFLOAT* buf = (F_CFLOAT*)buf_send;
F_CFLOAT* f_dev = (F_CFLOAT*)sdata->atom.f.dev_data;
// x components of atoms [first, first+n)
f_dev += first;
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
// y components: advance one nmax stride in the SoA force array
buf += n;
f_dev += sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
// z components
buf += n;
f_dev += sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
return n * 3;
}
// Unpack reverse-communicated forces from buf_recv and ADD them to the
// forces of the sendlist atoms of swap "iswap" (reverse comm accumulates,
// it does not overwrite).
void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv)
{
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(F_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
// One thread per atom to accumulate.
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
// Upload the received forces into the shared staging buffer.
cudaMemcpy(sdata->buffer, buf_recv, size, cudaMemcpyHostToDevice);
Cuda_CommCuda_UnpackReverse_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
}
}
// On-device reverse communication with self: add the forces of the n ghost
// atoms starting at index "first" onto the forces of the corresponding
// sendlist atoms of swap "iswap".  No host buffer is involved.
void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first)
{
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
// One thread per atom to accumulate.
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
Cuda_CommCuda_UnpackReverse_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed");
}
}
// Build the sendlist for swap "iswap" on the device: scan the candidate
// atoms (range depends on bordergroup/ineed, mirroring Comm::borders in
// LAMMPS) and collect indices of atoms inside the swap's slab.
// style == 1 selects single-cutoff slabs; otherwise per-type (multi) slabs.
// The first int of sdata->buffer is used as a global atomic counter and
// holds the number of atoms appended; that count is returned.
int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap)
{
MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
my_times time1, time2;
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
// Only a tiny scratch area (the atomic counter) is needed here.
if(sdata->buffer_new or (80 > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, 10);
// Number of candidate atoms to scan; with bordergroup and ineed < 2 the
// grid must cover both the group prefix and the tail beyond nlocal.
int n;
if(!bordergroup || ineed >= 2)
n = nlast - nfirst + 1;
else {
n = atom_nfirst;
if(nlast - sdata->atom.nlocal + 1 > n) n = nlast - sdata->atom.nlocal + 1;
}
int3 layout = getgrid(n, 0, 512, true);
dim3 threads(layout.z, 1, 1);
// NOTE(review): grid.x + 1 launches one extra block row — presumably a
// safety margin for the rounding in getgrid; confirm against getgrid.
dim3 grid(layout.x + 1, layout.y, 1);
// Zero the atomic append counter at the start of the buffer.
cudaMemset((int*)(sdata->buffer), 0, sizeof(int));
my_gettime(CLOCK_REALTIME, &time1);
// Dynamic shared memory: one flag per thread plus one slot for the
// block's base offset.
if(style == 1)
Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.slablo.dev_data, (X_CFLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
else
Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.multilo.dev_data, (X_CFLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
cudaThreadSynchronize();
my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_border_kernel_buildlist +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed");
// Read back how many atoms the kernels appended to the sendlist.
int nsend;
cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
return nsend;
}

View File

@ -1,35 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
// Host-side entry points of the CUDA communication module, implemented in
// comm_cuda.cu.  extern "C" linkage lets the C++ LAMMPS classes call the
// nvcc-compiled definitions.
// Forward comm: pack positions (and, for *Vel, velocities) of swap iswap.
extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
// "_Self" variants copy on-device when sender == receiver.
extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
// Unpack received ghost data; iswap = -1 forces the host-buffer path.
extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
// Reverse comm: pack ghost forces / accumulate them onto owned atoms.
extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send);
extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv);
extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first);
// Build the device-side sendlist for swap iswap; returns the atom count.
extern "C" int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap);

View File

@ -1,394 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Pack positions of the n sendlist atoms of swap "iswap" into buffer,
// adding the periodic shift (dx,dy,dz).  Buffer layout is struct-of-arrays:
// n x-values, then n y-values, then n z-values.  _x has stride _nmax per
// component.  One thread per atom.
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
// sendlist holds maxlistlength indices per swap.
int* list = sendlist + iswap * maxlistlength;
if(i < n) {
int j = list[i];
// Report an out-of-range atom index via the global flag.
// NOTE(review): ">" rather than ">=" — last valid index is _nmax - 1;
// confirm whether this boundary is intentional.
if(j > _nmax) _flag[0] = 1;
((X_CFLOAT*) buffer)[i] = _x[j] + dx;
((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
}
}
// Pack positions AND velocities of the n sendlist atoms of swap "iswap"
// into buffer.  Positions get the periodic shift (dx,dy,dz); velocities
// are copied unchanged.  Buffer layout: n x, n y, n z, n vx, n vy, n vz.
// One thread per atom.
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
// sendlist holds maxlistlength indices per swap.
int* list = sendlist + iswap * maxlistlength;
if(i < n) {
int j = list[i];
// Report an out-of-range atom index via the global flag.
if(j > _nmax) _flag[0] = 1;
((X_CFLOAT*) buffer)[i] = _x[j] + dx;
((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
((X_CFLOAT*) buffer)[i + 3 * n] = _v[j];
((X_CFLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
((X_CFLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
}
}
// Self-copy forward comm kernel: copy positions of the n sendlist atoms of
// swap "iswap" directly into the ghost slots starting at "first", adding
// the periodic shift (dx,dy,dz).  _x is struct-of-arrays with stride _nmax.
// One thread per atom.
//
// FIX: removed the dead "int j = i;" initialization that was immediately
// overwritten by "j = list[i];".
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // sendlist holds maxlistlength indices per swap.
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    _x[i + first] = _x[j] + dx;
    _x[i + first + _nmax] = _x[j + _nmax] + dy;
    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
  }
}
// Self-copy forward comm kernel for positions + velocities: copy the n
// sendlist atoms of swap "iswap" into the ghost slots starting at "first".
// Positions get the periodic shift (dx,dy,dz); velocities are copied
// unchanged.  One thread per atom.
//
// FIX: removed the dead "int j = i;" initialization that was immediately
// overwritten by "j = list[i];".
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // sendlist holds maxlistlength indices per swap.
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    _x[i + first] = _x[j] + dx;
    _x[i + first + _nmax] = _x[j + _nmax] + dy;
    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
    _v[i + first] = _v[j];
    _v[i + first + _nmax] = _v[j + _nmax];
    _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
  }
}
// Unpack n received ghost-atom positions from buffer into _x starting at
// index "first".  Buffer layout is struct-of-arrays: n x-values, then
// n y-values, then n z-values; _x has stride _nmax per component.
__global__ void Cuda_CommCuda_UnpackComm_Kernel(int n, int first, void* buffer)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  X_CFLOAT* recv = (X_CFLOAT*) buffer;   // single typed view of the buffer
  const int dst = tid + first;

  _x[dst] = recv[tid];
  _x[dst + _nmax] = recv[tid + n];
  _x[dst + 2 * _nmax] = recv[tid + 2 * n];
}
// Unpack n received ghost-atom positions AND velocities from buffer into
// _x and _v starting at index "first".  Buffer layout: n x, n y, n z,
// n vx, n vy, n vz; _x/_v have stride _nmax per component.
__global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n, int first, void* buffer)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  X_CFLOAT* recv = (X_CFLOAT*) buffer;   // single typed view of the buffer
  const int dst = tid + first;

  _x[dst] = recv[tid];
  _x[dst + _nmax] = recv[tid + n];
  _x[dst + 2 * _nmax] = recv[tid + 2 * n];
  _v[dst] = recv[tid + 3 * n];
  _v[dst + _nmax] = recv[tid + 4 * n];
  _v[dst + 2 * _nmax] = recv[tid + 5 * n];
}
// Pack forces of n ghost atoms starting at index "first" into the shared
// staging buffer for reverse communication.  Output layout is
// struct-of-arrays: n fx, then n fy, then n fz.
__global__ void Cuda_CommCuda_PackReverse_Kernel(int n, int first)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  F_CFLOAT* out = (F_CFLOAT*) _buffer;   // single typed view of the buffer
  const int src = tid + first;

  out[tid] = _f[src];
  out[tid + n] = _f[src + _nmax];
  out[tid + 2 * n] = _f[src + 2 * _nmax];
}
// Accumulate reverse-communicated forces from the staging buffer onto the
// forces of the n sendlist atoms of swap "iswap".  Reverse comm ADDS to _f,
// it does not overwrite.  Buffer layout: n fx, then n fy, then n fz.
__global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist, int n, int maxlistlength, int iswap)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // sendlist holds maxlistlength indices per swap.
  int* list = sendlist + iswap * maxlistlength;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  F_CFLOAT* recv = (F_CFLOAT*) _buffer;   // single typed view of the buffer
  const int dst = list[tid];

  _f[dst] += recv[tid];
  _f[dst + _nmax] += recv[tid + n];
  _f[dst + 2 * _nmax] += recv[tid + 2 * n];
}
// Self-copy reverse comm kernel: add the forces of the n ghost atoms
// starting at index "first" onto the forces of the corresponding sendlist
// atoms of swap "iswap", entirely on-device.
__global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, int first)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // sendlist holds maxlistlength indices per swap.
  int* list = sendlist + iswap * maxlistlength;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  const int dst = list[tid];
  const int src = tid + first;

  _f[dst] += _f[src];
  _f[dst + _nmax] += _f[src + _nmax];
  _f[dst + 2 * _nmax] += _f[src + 2 * _nmax];
}
extern __shared__ int shared[];
// Build the sendlist for swap "iswap" with a single slab [lo, hi] per swap.
// Each thread tests one candidate atom; per block, thread 0 serially turns
// the shared-memory flags into local ranks, reserves a contiguous range in
// the global list via atomicAdd on the counter at _buffer[0], and then each
// flagged thread writes its atom index at base + rank - 1.
// With bordergroup and ineed < 2 the scan runs twice: once over the group
// prefix [0, atom_nfirst) and once over the tail starting at _nlocal.
// Shared memory: blockDim.x flag/rank slots + 1 slot for the block's base.
__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, int atom_nfirst,
int nfirst, int nlast, int dim, int iswap, X_CFLOAT* slablo, X_CFLOAT* slabhi, int* sendlist, int maxlistlength)
{
int* list = sendlist + iswap * maxlistlength;
X_CFLOAT lo = slablo[iswap];
X_CFLOAT hi = slabhi[iswap];
bool add = false;
if(!bordergroup || ineed >= 2) {
// Single pass over [nfirst, nlast).
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;
// Flag atoms whose coordinate along "dim" lies inside the slab.
if(i < nlast)
if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
add = true;
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
int nsend = 0;
// Thread 0: serial prefix count over the block's flags; shared[k]
// becomes the 1-based rank of flagged thread k.
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
// Reserve nsend slots in the global list; store the base offset.
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
// Global slot = block base + local rank - 1.
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
} else {
// Pass 1: the bordergroup prefix [0, atom_nfirst).
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < atom_nfirst)
if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
add = true;
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
int nsend = 0;
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
// Barrier before the shared flags are reused by pass 2.
__syncthreads();
// Pass 2: the tail starting at _nlocal, up to nlast.
add = false;
i += _nlocal;
if(i < nlast)
if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
add = true;
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
nsend = 0;
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
}
}
// Build the sendlist for swap "iswap" with per-atom-type slab bounds
// (comm_modify multi): atom i is selected when its coordinate along "dim"
// lies inside [mlo[type], mhi[type]].  Same block-level collection scheme
// as Cuda_CommCuda_BuildSendlist_Single: per-block serial prefix count in
// shared memory, atomicAdd on _buffer[0] to reserve global slots.
__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup, int ineed, int atom_nfirst
, int nfirst, int nlast, int dim, int iswap, X_CFLOAT* multilo, X_CFLOAT* multihi, int* sendlist, int maxlistlength)
{
int* list = sendlist + iswap * maxlistlength;
// Per-type bounds for this swap (cuda_ntypes entries per swap).
X_CFLOAT* mlo = &multilo[iswap * _cuda_ntypes];
X_CFLOAT* mhi = &multihi[iswap * _cuda_ntypes];
int itype = 0;
bool add = false;
if(!bordergroup || ineed >= 2) {
// Single pass over [nfirst, nlast).
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;
if(i < nlast) {
itype = _type[i];
if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
add = true;
}
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
int nsend = 0;
// Thread 0: serial prefix count; shared[k] becomes the 1-based rank of
// flagged thread k, shared[blockDim.x] the block's global base offset.
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
// Global slot = block base + local rank - 1.
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
} else {
// Pass 1: the bordergroup prefix [0, atom_nfirst).
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < atom_nfirst) {
itype = _type[i];
if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
add = true;
}
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
int nsend = 0;
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
// Barrier before the shared flags are reused by pass 2.
__syncthreads();
// Pass 2: the tail starting at _nlocal, up to nlast.
add = false;
i += _nlocal;
if(i < nlast) {
itype = _type[i];
if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
add = true;
}
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
nsend = 0;
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
}
}

View File

@ -1,126 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX compute_temp_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "compute_temp_cuda_cu.h"
#include "compute_temp_cuda_kernel.cu"
// Grow the shared device buffer so it can hold the per-block partial sums
// of the temperature reduction (6 ENERGY_CFLOAT per 64-thread block), then
// publish the (possibly new) pointer to MY_AP(buffer).  Only grows.
void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
{
// ceil(nlocal / 64) blocks * 6 tensor components per block.
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);
if(sdata->buffersize < size) {
// NOTE(review): debug message says "kB" but buffersize is in bytes.
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// Signal other modules that the buffer pointer changed.
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Refresh this module's __constant__ copies of the per-atom array pointers
// and sizes used by the temperature kernels (mask, masses, velocities,
// types).  Must be called after device-side atom arrays are reallocated.
void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
// Per-atom masses only exist when the atom style provides rmass.
if(sdata->atom.rmass_flag)
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
}
// One-time initialization: just seed the module's device constants.
void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata)
{
Cuda_ComputeTempCuda_UpdateNmax(sdata);
}
// Compute the 6-component kinetic energy tensor of all atoms in the group
// (groupbit mask) in two stages: a per-block reduction into the shared
// buffer, then a 6-block final reduction writing the result to t.
void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
//if(sdata->buffer_new)
Cuda_ComputeTempCuda_UpdateBuffer(sdata);
// One thread per local atom.
int3 layout = getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
// Stage 1: per-block partial sums of the 6 tensor components
// (6 ENERGY_CFLOAT of shared memory per thread).
Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed");
// Stage 2: one block per tensor component reduces the oldgrid partials.
int oldgrid = grid.x * grid.y;
grid.x = 6;
grid.y = 1;
threads.x = 512;
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed");
}
}
// Compute the scalar sum of m*v^2 over all atoms in the group (groupbit
// mask) in two stages: a per-block reduction into the shared buffer, then
// a single-block final reduction writing the result to t.
void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
//if(sdata->buffer_new)
Cuda_ComputeTempCuda_UpdateBuffer(sdata);
MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n", sdata->atom.nlocal);)
// One thread per local atom.
int3 layout = getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel");
// Stage 1: per-block partial sums (one ENERGY_CFLOAT of shared memory
// per thread).
Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed");
// Stage 2: a single 512-thread block reduces the oldgrid partials.
int oldgrid = grid.x * grid.y;
grid.x = 1;
grid.y = 1;
threads.x = 512;
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed");
}
}

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);
extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);

View File

@ -1,118 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_CFLOAT sharedmem[];
// One thread per local atom: accumulate m*|v|^2 for atoms selected by
// groupbit, reduce within the block via shared memory, and write one partial
// sum per block to the global buffer. Velocity components are stored with
// stride _nmax (x at i, y at i+_nmax, z at i+2*_nmax — same layout the
// RemoveBiasAll kernels use).
__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  sharedmem[threadIdx.x] = 0;  // inactive / non-group threads contribute 0

  if(i < _nlocal) {
    if(_rmass_flag) {
      // Per-atom masses.
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * _rmass[i];
    } else {
      // Per-type masses.
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * (_mass[_type[i]]);
    }
  }

  reduceBlock(sharedmem);  // block-wide sum into sharedmem[0]
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // One partial sum per block; folded later by the Reduce kernel.
    buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
  }
}
// One thread per local atom: accumulate the six components of the kinetic
// energy tensor (m*vx*vx, m*vy*vy, m*vz*vz, m*vx*vy, m*vx*vz, m*vy*vz) for
// atoms selected by groupbit. Requires 6*blockDim.x ENERGY_CFLOATs of
// dynamic shared memory; writes 6 partial sums per block to _buffer, each
// component in its own gridDim.x*gridDim.y-sized slice.
__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // Zero all six accumulators so inactive threads contribute nothing.
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 4 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 5 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      V_CFLOAT massone;

      if(_rmass_flag) massone = _rmass[i];     // per-atom mass
      else massone = _mass[_type[i]];          // per-type mass

      // Velocity components have stride _nmax: x at i, y at i+_nmax, z at i+2*_nmax.
      sharedmem[threadIdx.x] = massone * _v[i] * _v[i];
      sharedmem[threadIdx.x + blockDim.x] = massone * _v[i + _nmax] * _v[i + _nmax];
      sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax];
      sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax];
      sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax];
      sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax];
    }

  // Reduce each of the six component slices separately
  // (reduceBlock starts with __syncthreads, so no extra barrier is needed).
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  reduceBlock(&sharedmem[4 * blockDim.x]);
  reduceBlock(&sharedmem[5 * blockDim.x]);
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // Component c of block b goes to buffer[b + c*numblocks].
    buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 4 * gridDim.x * gridDim.y] = sharedmem[4 * blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 5 * gridDim.x * gridDim.y] = sharedmem[5 * blockDim.x];
  }
}
// Fold n per-block partial sums from _buffer into t[blockIdx.x]. Each block
// reduces one contiguous slice of n values (the host launches one block per
// output component: 1 for the scalar path, 6 for the tensor path).
__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  ENERGY_CFLOAT myforig = 0.0;  // running total, only meaningful on thread 0
  ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];   // this block's slice of partial sums

  // Consume the n inputs in chunks of blockDim.x.
  while(i < n) {
    sharedmem[threadIdx.x] = 0;  // pad the last, possibly short, chunk with 0

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);      // chunk sum ends up in sharedmem[0]
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    t[blockIdx.x] = myforig;
}

View File

@ -1,164 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX compute_temp_partial_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "compute_temp_partial_cuda_cu.h"
#include "compute_temp_partial_cuda_kernel.cu"
// Ensure the shared device-side reduction buffer is large enough for this
// compute's partial sums, growing (never shrinking) it if necessary, and
// publish its address to the device symbol MY_AP(buffer).
void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  // Six ENERGY_CFLOAT accumulators (vx^2, vy^2, vz^2, vx*vy, vx*vz, vy*vz)
  // per group of 64 atoms.
  // NOTE(review): getgrid() can choose 32-thread blocks for small systems,
  // which produces more per-block partials than this ceil(nlocal/64)-based
  // formula accounts for — confirm the buffer cannot be overrun in that case.
  int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);

  if(sdata->buffersize < size) {
    // BUGFIX: buffersize is a byte count; the old messages labeled it "kB".
    MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i bytes to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  // signal other computes that the buffer moved
    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
  }

  // Only the pointer value is copied (sizeof(int*) == pointer size).
  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Copy the current host-side device pointers and sizes into the kernels'
// constant symbols (MY_AP(...)). Must be re-run whenever the atom arrays may
// have been reallocated (nmax changed), since the cached device pointers
// would otherwise dangle.
void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)       , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mass)       , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));

  // rmass only exists when per-atom masses are in use.
  if(sdata->atom.rmass_flag)
    cudaMemcpyToSymbol(MY_AP(rmass)    , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));

  cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)       , & sdata->atom.nmax   , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(v)          , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(type)       , & sdata->atom.type .dev_data, sizeof(int*));
}
// One-time setup: seed the device constant symbols with the current
// array pointers and sizes.
void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
{
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
}
// Two-pass computation of the six kinetic-energy tensor components, with the
// x/y/z velocity contributions gated by xflag/yflag/zflag. Results land in
// t[0..5].
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
{
  // This compute is most likely not called every timestep, so the cached
  // device constants may be stale: refresh them unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    // Pass 1: per-block partials; 6 shared-mem accumulators per thread.
    Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");

    // Pass 2: one block per tensor component folds its partials into t[].
    int oldgrid = grid.x * grid.y;
    grid.x = 6;
    grid.y = 1;  // BUGFIX: grid.y was left at layout.y, launching layout.y
                 // identical copies of each reduction block (redundant
                 // duplicate writes to t[]); cf. Cuda_ComputeTempCuda_Scalar,
                 // which resets both grid dimensions.
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
  }
}
// Two-pass computation of the scalar kinetic energy sum(m*v^2), with the
// x/y/z velocity contributions gated by xflag/yflag/zflag. Result lands
// in t[0].
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
{
  // This compute is most likely not called every timestep, so the cached
  // device constants may be stale: refresh them unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
  MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n", sdata->atom.nlocal);)

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");

    // Pass 1: per-block partial sums of m*v^2 over the selected components.
    Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");

    // Pass 2: a single block folds the partials into t[0].
    int oldgrid = grid.x * grid.y;
    grid.x = 1;
    grid.y = 1;  // BUGFIX: grid.y was left at layout.y, launching layout.y
                 // identical copies of the reduction block (redundant
                 // duplicate writes to t[0]); cf. Cuda_ComputeTempCuda_Scalar,
                 // which resets both grid dimensions.
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
  }
}
// Host wrapper: store the velocity components excluded by xflag/yflag/zflag
// into vbiasall and zero them in the device velocity array, so a following
// temperature computation ignores them.
void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  // This compute is most likely not called every timestep, so the cached
  // device constants may be stale: refresh them unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  if(sdata->atom.nlocal <= 0) return;  // no local atoms: nothing to launch

  const int3 layout = getgrid(sdata->atom.nlocal);
  dim3 block_threads(layout.z, 1, 1);
  dim3 launch_grid(layout.x, layout.y, 1);

  Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< launch_grid, block_threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
}
// Add the velocity bias previously saved by RemoveBiasAll back onto the
// device velocity array for the components excluded by xflag/yflag/zflag.
void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  // This compute is most likely not called every timestep, so the cached
  // device constants may be stale: refresh them unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
    cudaThreadSynchronize();
    // BUGFIX: error label previously said "RemoveBiasAll" (copy-paste error),
    // which misattributed failures of this kernel in diagnostics.
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RestoreBiasAll: restore_bias Kernel execution failed");
  }
}

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);
extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);

View File

@ -1,161 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_CFLOAT sharedmem[];
// One thread per local atom: accumulate m*v^2 for atoms in the group, with
// each velocity component multiplied by its 0/1 flag so excluded components
// drop out. Block-reduces into shared memory and writes one partial sum per
// block to _buffer. Velocity components have stride _nmax.
__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xflag, int yflag, int zflag)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  sharedmem[threadIdx.x] = 0;  // inactive / non-group threads contribute 0

  if(i < _nlocal) {
    if(_rmass_flag) {
      // Per-atom masses.
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * _rmass[i];
    } else {
      // Per-type masses.
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * (_mass[_type[i]]);
    }
  }

  reduceBlock(sharedmem);  // block-wide sum into sharedmem[0]
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // One partial sum per block; folded later by the Reduce kernel.
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
  }
}
// One thread per local atom: accumulate the six kinetic-energy tensor
// components with each component gated by the corresponding 0/1 flags
// (cross terms use the product of both flags). Requires 6*blockDim.x
// ENERGY_CFLOATs of dynamic shared memory; writes 6 partial sums per block
// to _buffer, each component in its own gridDim.x*gridDim.y-sized slice.
__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xflag, int yflag, int zflag)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // Zero all six accumulators so inactive threads contribute nothing.
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 4 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 5 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      V_CFLOAT massone;

      if(_rmass_flag) massone = _rmass[i];     // per-atom mass
      else massone = _mass[_type[i]];          // per-type mass

      // Velocity components have stride _nmax: x at i, y at i+_nmax, z at i+2*_nmax.
      sharedmem[threadIdx.x] = massone * _v[i] * _v[i] * xflag;
      sharedmem[threadIdx.x + blockDim.x] = massone * _v[i + _nmax] * _v[i + _nmax] * yflag;
      sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag;
      sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax] * xflag * yflag;
      sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax] * xflag * zflag;
      sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax] * yflag * zflag;
    }

  // Reduce each of the six component slices separately
  // (reduceBlock starts with __syncthreads, so no extra barrier is needed).
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  reduceBlock(&sharedmem[4 * blockDim.x]);
  reduceBlock(&sharedmem[5 * blockDim.x]);
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // Component c of block b goes to buffer[b + c*numblocks].
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = sharedmem[4 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = sharedmem[5 * blockDim.x];
  }
}
// Fold n per-block partial sums from _buffer into t[blockIdx.x]. Each block
// reduces one contiguous slice of n values (one launched block per output
// component: 1 for the scalar path, 6 for the tensor path).
__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  ENERGY_CFLOAT myforig = 0.0;  // running total, only meaningful on thread 0
  ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];   // this block's slice of partial sums

  // Consume the n inputs in chunks of blockDim.x.
  while(i < n) {
    sharedmem[threadIdx.x] = 0;  // pad the last, possibly short, chunk with 0

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);      // chunk sum ends up in sharedmem[0]
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    t[blockIdx.x] = myforig;
}
// For each group atom, save the velocity components that are EXCLUDED from
// the temperature (flag == 0) into vbiasall and zero them in _v, so later
// KE computations ignore them. Components have stride _nmax
// (x at i, y at i+_nmax, z at i+2*_nmax); vbiasall uses the same layout.
__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      if(!xflag) {
        vbiasall[i] = _v[i];
        _v[i] = V_F(0.0);
      }

      if(!yflag) {
        vbiasall[i + _nmax] = _v[i + _nmax];
        _v[i + _nmax] = V_F(0.0);
      }

      if(!zflag) {
        vbiasall[i + 2 * _nmax] = _v[i + 2 * _nmax];
        _v[i + 2 * _nmax] = V_F(0.0);
      }
    }
}
// Inverse of RemoveBiasAll: add the saved bias from vbiasall back onto _v
// for the components that were excluded (flag == 0). Uses += (not =) so any
// velocity change made while the bias was removed is preserved.
__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      if(!xflag) {
        _v[i] += vbiasall[i];
      }

      if(!yflag) {
        _v[i + _nmax] += vbiasall[i + _nmax];
      }

      if(!zflag) {
        _v[i + 2 * _nmax] += vbiasall[i + 2 * _nmax];
      }
    }
}

View File

@ -1,919 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CRM_CUDA_UTILS
#define CRM_CUDA_UTILS
//split n threads into 2 dimensional grid + threads, return values are grid.x grid.y and threads.x
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// Split n work items into a 2D grid of 1D thread blocks; returns grid.x,
// grid.y and the block size in the x/y/z fields of an int3.
//  - shared_per_thread: bytes of shared memory needed per thread; caps the
//    block size so one block fits in ~16 kB of shared memory.
//  - threadsmax: upper bound on the block size.
//  - p2: force the block size to the largest power of two <= threadsmax.
inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
  int3 res;
  const int sharedsize = 16000;  // usable shared memory per block, in bytes

  if(shared_per_thread > 0)
    threadsmax = MIN(sharedsize / shared_per_thread, threadsmax);

  // Choose a block size that keeps the block count reasonably high
  // (roughly >= 60 blocks) while respecting threadsmax.
  if((n < 60 * 32) || (threadsmax < 64))        res.z = 32;
  else if((n < 60 * 64) || (threadsmax < 128))  res.z = 64;
  else if((n < 60 * 128) || (threadsmax < 256)) res.z = 128;
  else if((n < 60 * 256) || (threadsmax < 512)) res.z = 256;
  else                                          res.z = 512;

  if(p2) {
    // Power-of-two override: largest 2^k (starting from 16) <= threadsmax.
    res.z = 16;

    while(res.z * 2 <= threadsmax) res.z *= 2;
  }

  const int blocks = (n + res.z - 1) / res.z;

  if(blocks > 10000) {
    // Large launches: use a roughly square 2D grid to stay within
    // per-dimension grid limits.
    res.x = res.y = int(sqrt(blocks));
  } else {
    res.x = blocks;
    res.y = 1;
  }

  // sqrt truncation can undershoot; widen x until all n items are covered.
  while(res.x * res.y * res.z < n) res.x++;

  if(res.x == 0) res.x = 1;  // guard the n == 0 case

  return res;
}
//return value: 1 if f<0 (sign bit set, including -0.0f); else: 0
//take care if working with values as "blockId.x-n" for f: it might be interpreted as a unsigned int
static inline __device__ int negativCUDA(float f)
{
  // Reinterpret the float's bits and extract the IEEE-754 sign bit.
  return ((unsigned int) __float_as_int(f)) >> 31;
}
//return value: -1 if f<0; else +1 (note: +1 for f == 0.0f and for NaN)
static inline __device__ float fsignCUDA(float f)
{
  if(f < 0.0f)
    return -1.0f;

  return 1.0f;
}
//functions to copy data between global and shared memory (indeed you can copy data between two arbitrary memory regims on device - as long as you have read respectively write rights)
//blockDim.y and blockDim.z are assumed to be 1
// All threads of a block cooperate: thread t copies elements t, t+blockDim.x,
// t+2*blockDim.x, ...; the trailing if handles the final partial chunk.
// Each overload ends with __syncthreads() so the copy is visible block-wide.

// int overload.
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;  // full chunks end here

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {  // remaining n - i (< blockDim.x) elements
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

// float overload (same algorithm).
static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

// double overload (same algorithm).
static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}
// Block-cooperative copy from global to shared memory; mirror image of
// copySharedToGlob (same chunked strategy, blockDim.y == blockDim.z == 1,
// ends with a block-wide barrier).

// int overload.
static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;  // full chunks end here

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {  // remaining partial chunk
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

// float overload (same algorithm).
static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

// double overload (same algorithm, loop bound written inline).
static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}
//copy data between two memory areas on device, 3d BlockDims are allowed
// Unlike the copy*To* helpers above, these flatten a (possibly 3D) thread
// block into a linear offset and stride by the full block volume.
// Each overload ends with __syncthreads().

// double overload.
static __device__ inline void copyData(double* source, double* target, const int &n)
{
  int i;
  // Linearized thread index within the (x, y, z) block.
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {  // final partial chunk
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// float overload (same algorithm).
static __device__ inline void copyData(float* source, float* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// int overload (same algorithm).
static __device__ inline void copyData(int* source, int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// unsigned int overload (same algorithm).
static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}
//functions in order to sum over values of one block. P2 means blockdim MUST be a power of 2 otherwise the behaviour is not well defined
//in the end in data[0]=sum_i=0^blockDim.x data[i]
//for reduceBlockP2 and reduceBlock blockDim.y=1 and blockDim.z=1
// Classic tree reduction: each round halves the number of active threads and
// folds the upper half of the array onto the lower half. The leading
// __syncthreads() makes prior writes to data[] visible.

// int overload.
static __device__ inline void reduceBlockP2(int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

// unsigned int overload (same algorithm).
static __device__ inline void reduceBlockP2(unsigned int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

// float overload (same algorithm).
static __device__ inline void reduceBlockP2(float* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

// double overload (same algorithm).
static __device__ inline void reduceBlockP2(double* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}
// Sum all blockDim.x entries of data[] into data[0], for ARBITRARY (not
// necessarily power-of-two) blockDim.x: first fold the tail above the largest
// power of two p2 <= blockDim.x onto the front, then tree-reduce the first p2
// entries. blockDim.y and blockDim.z must be 1.

// float overload.
static __device__ inline void reduceBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  // Largest power of two strictly covering the reduction
  // (p2 == blockDim.x is kept when blockDim.x is itself a power of two).
  while(p2 * 2 < blockDim.x) p2 *= 2;

  // Fold the blockDim.x - p2 tail elements onto the front.
  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  // Standard tree reduction over the first p2 entries.
  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

// int overload (same algorithm).
static __device__ inline void reduceBlock(int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

// unsigned int overload (same algorithm).
static __device__ inline void reduceBlock(unsigned int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

// double overload (same algorithm).
static __device__ inline void reduceBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}
// Block-cooperative fill of data[0..n-1] with a constant; same chunked
// access pattern as the copy helpers, but WITHOUT a trailing __syncthreads()
// — callers must synchronize before reading the filled data.

// int variant.
static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;  // partial tail chunk
}

// float variant (same algorithm).
static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}
// Reduce n values (n may exceed blockDim.x) of data[] toward data[0].
// NOTE(review): 'j' is NOT reset to 0 before or inside the for loop, so the
// later folding rounds start from wherever the first pass left off; the
// "* 2" in the first loop's bound also disagrees with the analogous
// min/maxOfData helpers below. The original author flagged these functions
// as "not sure if working" — verify before relying on them.

// float overload.
static __device__ inline void reduce(float* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  // Largest power of two with p2*2 < n.
  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  // Fold the tail above p2 onto the front, multiple elements per thread.
  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  // Tree reduction over the first p2 entries (see NOTE(review) about j).
  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}

// double overload (same algorithm, same caveats).
static __device__ inline void reduce(double* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}
// Block-wide min/max over blockDim.x entries, result in data[0]. Same
// fold-tail-then-tree structure as reduceBlock, with MIN/MAX instead of +.
// blockDim.y and blockDim.z must be 1.

// float min.
static __device__ inline void minOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;  // largest power of two with p2*2 < blockDim.x

  // Fold the tail above p2 onto the front.
  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  // Tree reduction over the first p2 entries.
  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

// float max (same algorithm).
static __device__ inline void maxOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

// double min (same algorithm).
static __device__ inline void minOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

// double max (same algorithm).
static __device__ inline void maxOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}
// Min/max over n values (n may exceed blockDim.x), result intended in
// data[0].
// NOTE(review): as in reduce() above, 'j' carries over from the first folding
// pass into the tree-reduction rounds instead of restarting at 0 each round;
// the original author flagged these as "not sure if working" — verify before
// relying on them.

// double min.
static __device__ inline void minOfData(double* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;  // largest power of two with p2*2 < n

  int j = 0;

  // Fold the tail above p2 onto the front, multiple elements per thread.
  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  // Tree reduction rounds (see NOTE(review) about j).
  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// double max (same algorithm, same caveats).
static __device__ inline void maxOfData(double* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// float min (same algorithm, same caveats).
static __device__ inline void minOfData(float* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// float max (same algorithm, same caveats).
static __device__ inline void maxOfData(float* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}
#if X_PRECISION == 2
// Double-precision texture fetches: CUDA textures cannot store doubles, so
// positions are bound as int2/int4 texels and reassembled here with
// __hiloint2double.

// Fetch one double from an int2 texture (one texel per double).
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

// Fetch one X_CFLOAT4 (4 doubles) from an int4 texture: element i occupies
// two consecutive int4 texels.
static __device__ inline X_CFLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  X_CFLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif
// Bind the packed x_type device array to its texture reference so device code
// can read it through fetchXType(). No-op unless CUDA_USE_TEXTURE is defined.
inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  // Point-sampled, unnormalized access; coordinates wrap.
  _x_type_tex.normalized = false;
  _x_type_tex.filterMode = cudaFilterModePoint;
  _x_type_tex.addressMode[0] = cudaAddressModeWrap;
  const textureReference* texref = &MY_AP(x_type_tex);
#if X_PRECISION == 1
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, texref, sdata->atom.x_type.dev_data, &desc, sdata->atom.nmax * sizeof(X_CFLOAT4));
#else
  // Doubles are stored as int texels; each X_CFLOAT4 occupies two int4 texels.
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, texref, sdata->atom.x_type.dev_data, &desc, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
// Fetch atom i's packed x_type record (position plus, presumably, the atom
// type encoded in the fourth component -- confirm against the packing kernel).
// Reads through the texture when CUDA_USE_TEXTURE is defined; at
// X_PRECISION == 2 the doubles are reassembled from int texels.
static __device__ inline X_CFLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
return tex1Dfetch(_x_type_tex, i);
#else
return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
// texture path disabled: plain global-memory read
return _x_type[i];
#endif
}
#if V_PRECISION == 2
// Double fetch helpers for V-precision textures: each double is stored as an
// int2 texel and reassembled with __hiloint2double.
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
  int2 bits = tex1Dfetch(t, i);
  return __hiloint2double(bits.y, bits.x);
}
// One V_CFLOAT4 (four doubles) spans two consecutive int4 texels.
static __device__ inline V_CFLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
  int4 lo = tex1Dfetch(t, 2 * i);
  int4 hi = tex1Dfetch(t, 2 * i + 1);
  V_CFLOAT4 result;
  result.x = __hiloint2double(lo.y, lo.x);
  result.y = __hiloint2double(lo.w, lo.z);
  result.z = __hiloint2double(hi.y, hi.x);
  result.w = __hiloint2double(hi.w, hi.z);
  return result;
}
#endif
// Bind the packed v_radius device array to its texture reference so device
// code can read it through fetchVRadius(). No-op unless CUDA_USE_TEXTURE.
inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  // Point-sampled, unnormalized access; coordinates wrap.
  _v_radius_tex.normalized = false;
  _v_radius_tex.filterMode = cudaFilterModePoint;
  _v_radius_tex.addressMode[0] = cudaAddressModeWrap;
  const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);
#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
  // BUGFIX: the bind size was computed with sizeof(X_CFLOAT4); v_radius holds
  // V_CFLOAT4, which differs whenever X and V precisions are compiled differently.
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_CFLOAT4));
#else
  // Double precision: each V_CFLOAT4 occupies two int4 texels.
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
// Fetch atom i's packed v_radius record (velocity plus, presumably, the
// particle radius in the fourth component -- confirm against the packing code).
// Reads through the texture when CUDA_USE_TEXTURE is defined; at
// V_PRECISION == 2 the doubles are reassembled from int4 texels.
static __device__ inline V_CFLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
return tex1Dfetch(_v_radius_tex, i);
#else
return tex1Dfetch_double_v(_v_radius_tex, i);
#endif
#else
// texture path disabled: plain global-memory read
return _v_radius[i];
#endif
}
// Bind the packed omega_rmass device array to its texture reference so device
// code can read it through fetchOmegaRmass(). No-op unless CUDA_USE_TEXTURE.
inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  // Point-sampled, unnormalized access; coordinates wrap.
  _omega_rmass_tex.normalized = false;
  _omega_rmass_tex.filterMode = cudaFilterModePoint;
  _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap;
  const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);
#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
  // BUGFIX: the bind size was computed with sizeof(X_CFLOAT4); omega_rmass
  // holds V_CFLOAT4, which differs whenever X and V precisions are compiled
  // differently.
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_CFLOAT4));
#else
  // Double precision: each V_CFLOAT4 occupies two int4 texels.
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
// Fetch atom i's packed omega_rmass record (angular velocity plus,
// presumably, the per-atom mass in the fourth component -- confirm against
// the packing code). Reads through the texture when CUDA_USE_TEXTURE is
// defined; at V_PRECISION == 2 the doubles are reassembled from int4 texels.
static __device__ inline V_CFLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
return tex1Dfetch(_omega_rmass_tex, i);
#else
return tex1Dfetch_double_v(_omega_rmass_tex, i);
#endif
#else
// texture path disabled: plain global-memory read
return _omega_rmass[i];
#endif
}
#if F_PRECISION == 2
// Double fetch helpers for F-precision textures: each double is stored as an
// int2 texel and reassembled with __hiloint2double.
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
  int2 bits = tex1Dfetch(t, i);
  return __hiloint2double(bits.y, bits.x);
}
// One F_CFLOAT4 (four doubles) spans two consecutive int4 texels.
static __device__ inline F_CFLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
  int4 lo = tex1Dfetch(t, 2 * i);
  int4 hi = tex1Dfetch(t, 2 * i + 1);
  F_CFLOAT4 result;
  result.x = __hiloint2double(lo.y, lo.x);
  result.y = __hiloint2double(lo.w, lo.z);
  result.z = __hiloint2double(hi.y, hi.x);
  result.w = __hiloint2double(hi.w, hi.z);
  return result;
}
#endif
// Bind the per-atom charge array to its texture reference so device code can
// read it through fetchQ(). No-op unless CUDA_USE_TEXTURE is defined.
inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  // Point-sampled, unnormalized access; coordinates wrap.
  _q_tex.normalized = false;
  _q_tex.filterMode = cudaFilterModePoint;
  _q_tex.addressMode[0] = cudaAddressModeWrap;
  const textureReference* texref = &MY_AP(q_tex);
#if F_PRECISION == 1
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
  cudaBindTexture(0, texref, sdata->atom.q.dev_data, &desc, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
  // Doubles are stored as int2 texels and reassembled on fetch.
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<int2>();
  cudaBindTexture(0, texref, sdata->atom.q.dev_data, &desc, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}
// Fetch the charge of atom i. Reads through the texture when
// CUDA_USE_TEXTURE is defined; at F_PRECISION == 2 the double is
// reassembled from an int2 texel by tex1Dfetch_double_f.
static __device__ inline F_CFLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
return tex1Dfetch(_q_tex, i);
#else
return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
// texture path disabled: plain global-memory read
return _q[i];
#endif
}
#endif
/*
inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
{
#ifdef CUDA_USE_TEXTURE
_coeff_tex.normalized = false; // access with normalized texture coordinates
_coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff_texture_ptr;
cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_CFLOAT4));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
#endif
#endif
}
static __device__ inline X_CFLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
return tex1Dfetch(_x_type_tex,i);
#else
return tex1Dfetch_double(_x_type_tex,i);
#endif
#else
return _x_type[i];
#endif
}
*/
#define SBBITS 30
// Extract the 2-bit special-bond flag stored in the top bits (SBBITS and up)
// of an encoded neighbor index.
static inline __device__ int sbmask(int j)
{
  return (j >> SBBITS) & 3;
}
// Apply the minimum-image convention to the displacement vector delta,
// folding each component by at most one box length along each periodic
// direction. Handles both orthogonal and triclinic boxes; for triclinic
// boxes z is folded first because its wrap shifts y and x by the tilt
// factors _h[3] (yz) and _h[4] (xz), then y (tilt _h[5], xy), then x.
static inline __device__ void minimum_image(X_CFLOAT4 &delta)
{
  if(_triclinic == 0) {
    // Orthogonal box: each component folds independently.
    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }
  } else {
    // BUGFIX: the z-fold was gated on _periodicity[1]; it must test the z
    // direction (_periodicity[2]), matching the orthogonal branch above.
    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
    }

    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }
  }
}
// Store in ci the periodic image of x2 that lies closest to x1: form the
// displacement x2 - x1, fold it with the minimum-image convention, then
// translate back by x1. Only the .x/.y/.z components of ci are written;
// ci.w is left untouched.
static inline __device__ void closest_image(X_CFLOAT4 &x1, X_CFLOAT4 &x2, X_CFLOAT4 &ci)
{
ci.x = x2.x - x1.x;
ci.y = x2.y - x1.y;
ci.z = x2.z - x1.z;
minimum_image(ci);
ci.x += x1.x;
ci.y += x1.y;
ci.z += x1.z;
}

View File

@ -1,22 +0,0 @@
#include "cuda_precision.h"
#include "cuda_shared.h"
#include "cuda_cu.h"
// Record the compile-time configuration of the CUDA object files in sdata so
// the host side can verify it against its own build. Each prec_* field is
// sizeof(type)/4, i.e. 1 for single precision and 2 for double precision.
void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata)
{
sdata->compile_settings.prec_glob = sizeof(CUDA_CFLOAT) / 4;
sdata->compile_settings.prec_x = sizeof(X_CFLOAT) / 4;
sdata->compile_settings.prec_v = sizeof(V_CFLOAT) / 4;
sdata->compile_settings.prec_f = sizeof(F_CFLOAT) / 4;
sdata->compile_settings.prec_pppm = sizeof(PPPM_CFLOAT) / 4;
sdata->compile_settings.prec_fft = sizeof(FFT_CFLOAT) / 4;
// whether FFTs were compiled to run on the GPU via cuFFT
#ifdef FFT_CUFFT
sdata->compile_settings.cufft = 1;
#else
sdata->compile_settings.cufft = 0;
#endif
sdata->compile_settings.arch = CUDA_ARCH;
}

View File

@ -1,344 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_
//#include "cutil.h"
#include "cuda_precision.h"
#include "cuda_wrapper_cu.h"
#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
//this cannot be arbitrarily large, since constant space is limited.
//in principle one could alter potentials to use global memory for parameters; some do that already, since the first examples I encountered had a high number (20+) of atom types
//Christian
#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
#define CUDA_MAX_NSPECIAL 25
// define some easy-to-use debug and emulation macros
#ifdef _DEBUG
#define MYDBG(a) a
#else
#define MYDBG(a)
#endif
#if __DEVICE_EMULATION__
#define MYEMU(a) a
#else
#define MYEMU(a)
#endif
#define MYEMUDBG(a) MYEMU(MYDBG(a))
// Add Prefix (needed as workaround, same constant's names in different files causes conflict)
#define MY_ADD_PREFIX(prefix, var) prefix##_##var
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
#define MY_VAR_TO_STR(var) #var
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
//#define &MY_AP(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
//#define &MY_AP(var) &(MY_AP(var))
#define CUDA_USE_TEXTURE
#define CUDA_USE_CFLOAT4
//constants used by many classes
//domain
#define _boxhi MY_AP(boxhi)
#define _boxlo MY_AP(boxlo)
#define _subhi MY_AP(subhi)
#define _sublo MY_AP(sublo)
#define _box_size MY_AP(box_size)
#define _prd MY_AP(prd)
#define _periodicity MY_AP(periodicity)
#define _triclinic MY_AP(triclinic)
#define _boxhi_lamda MY_AP(boxhi_lamda)
#define _boxlo_lamda MY_AP(boxlo_lamda)
#define _prd_lamda MY_AP(prd_lamda)
#define _h MY_AP(h)
#define _h_inv MY_AP(h_inv)
#define _h_rate MY_AP(h_rate)
__device__ __constant__ X_CFLOAT _boxhi[3];
__device__ __constant__ X_CFLOAT _boxlo[3];
__device__ __constant__ X_CFLOAT _subhi[3];
__device__ __constant__ X_CFLOAT _sublo[3];
__device__ __constant__ X_CFLOAT _box_size[3];
__device__ __constant__ X_CFLOAT _prd[3];
__device__ __constant__ int _periodicity[3];
__device__ __constant__ int _triclinic;
__device__ __constant__ X_CFLOAT _boxhi_lamda[3];
__device__ __constant__ X_CFLOAT _boxlo_lamda[3];
__device__ __constant__ X_CFLOAT _prd_lamda[3];
__device__ __constant__ X_CFLOAT _h[6];
__device__ __constant__ X_CFLOAT _h_inv[6];
__device__ __constant__ V_CFLOAT _h_rate[6];
//atom properties
#define _x MY_AP(x)
#define _v MY_AP(v)
#define _f MY_AP(f)
#define _tag MY_AP(tag)
#define _type MY_AP(type)
#define _mask MY_AP(mask)
#define _image MY_AP(image)
#define _q MY_AP(q)
#define _mass MY_AP(mass)
#define _rmass MY_AP(rmass)
#define _rmass_flag MY_AP(rmass_flag)
#define _eatom MY_AP(eatom)
#define _vatom MY_AP(vatom)
#define _x_type MY_AP(x_type)
#define _radius MY_AP(radius)
#define _density MY_AP(density)
#define _omega MY_AP(omega)
#define _torque MY_AP(torque)
#define _special MY_AP(special)
#define _maxspecial MY_AP(maxspecial)
#define _nspecial MY_AP(nspecial)
#define _special_flag MY_AP(special_flag)
#define _molecule MY_AP(molecule)
#define _v_radius MY_AP(v_radius)
#define _omega_rmass MY_AP(omega_rmass)
#define _freeze_group_bit MY_AP(freeze_group_bit)
#define _map_array MY_AP(map_array)
__device__ __constant__ X_CFLOAT* _x; //holds pointer to positions
__device__ __constant__ V_CFLOAT* _v;
__device__ __constant__ F_CFLOAT* _f;
__device__ __constant__ int* _tag;
__device__ __constant__ int* _type;
__device__ __constant__ int* _mask;
__device__ __constant__ int* _image;
__device__ __constant__ V_CFLOAT* _mass;
__device__ __constant__ F_CFLOAT* _q;
__device__ __constant__ V_CFLOAT* _rmass;
__device__ __constant__ int _rmass_flag;
__device__ __constant__ ENERGY_CFLOAT* _eatom;
__device__ __constant__ ENERGY_CFLOAT* _vatom;
__device__ __constant__ X_CFLOAT4* _x_type; //holds pointer to positions
__device__ __constant__ X_CFLOAT* _radius;
__device__ __constant__ F_CFLOAT* _density;
__device__ __constant__ V_CFLOAT* _omega;
__device__ __constant__ F_CFLOAT* _torque;
__device__ __constant__ int* _special;
__device__ __constant__ int _maxspecial;
__device__ __constant__ int* _nspecial;
__device__ __constant__ int _special_flag[4];
__device__ __constant__ int* _molecule;
__device__ __constant__ V_CFLOAT4* _v_radius; //holds pointer to positions
__device__ __constant__ V_CFLOAT4* _omega_rmass; //holds pointer to positions
__device__ __constant__ int _freeze_group_bit;
__device__ __constant__ int* _map_array;
#ifdef CUDA_USE_TEXTURE
#define _x_tex MY_AP(x_tex)
#if X_PRECISION == 1
texture<float> _x_tex;
#else
texture<int2, 1> _x_tex;
#endif
#define _type_tex MY_AP(type_tex)
texture<int> _type_tex;
#define _x_type_tex MY_AP(x_type_tex)
#if X_PRECISION == 1
texture<float4, 1> _x_type_tex;
#else
texture<int4, 1> _x_type_tex;
#endif
#define _v_radius_tex MY_AP(v_radius_tex)
#if V_PRECISION == 1
texture<float4, 1> _v_radius_tex;
#else
texture<int4, 1> _v_radius_tex;
#endif
#define _omega_rmass_tex MY_AP(omega_rmass_tex)
#if V_PRECISION == 1
texture<float4, 1> _omega_rmass_tex;
#else
texture<int4, 1> _omega_rmass_tex;
#endif
#define _q_tex MY_AP(q_tex)
#if F_PRECISION == 1
texture<float> _q_tex;
#else
texture<int2, 1> _q_tex;
#endif
#endif
//neighbor
#ifdef IncludeCommonNeigh
#define _inum MY_AP(inum)
#define _inum_border MY_AP(inum_border)
#define _ilist MY_AP(ilist)
#define _ilist_border MY_AP(ilist_border)
#define _numneigh MY_AP(numneigh)
#define _numneigh_border MY_AP(numneigh_border)
#define _numneigh_inner MY_AP(numneigh_inner)
#define _firstneigh MY_AP(firstneigh)
#define _neighbors MY_AP(neighbors)
#define _neighbors_border MY_AP(neighbors_border)
#define _neighbors_inner MY_AP(neighbors_inner)
#define _reneigh_flag MY_AP(reneigh_flag)
#define _triggerneighsq MY_AP(triggerneighsq)
#define _xhold MY_AP(xhold)
#define _maxhold MY_AP(maxhold)
#define _dist_check MY_AP(dist_check)
#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
#define _maxneighbors MY_AP(maxneighbors)
#define _overlap_comm MY_AP(overlap_comm)
__device__ __constant__ int _inum;
__device__ __constant__ int* _inum_border;
__device__ __constant__ int* _ilist;
__device__ __constant__ int* _ilist_border;
__device__ __constant__ int* _numneigh;
__device__ __constant__ int* _numneigh_border;
__device__ __constant__ int* _numneigh_inner;
__device__ __constant__ int** _firstneigh;
__device__ __constant__ int* _neighbors;
__device__ __constant__ int* _neighbors_border;
__device__ __constant__ int* _neighbors_inner;
__device__ __constant__ int* _reneigh_flag;
__device__ __constant__ X_CFLOAT _triggerneighsq;
__device__ __constant__ X_CFLOAT* _xhold; //holds pointer to positions
__device__ __constant__ int _maxhold;
__device__ __constant__ int _dist_check;
__device__ __constant__ int _neighbor_maxlocal;
__device__ __constant__ int _maxneighbors;
__device__ __constant__ int _overlap_comm;
#endif
//system properties
#define _nall MY_AP(nall)
#define _nghost MY_AP(nghost)
#define _nlocal MY_AP(nlocal)
#define _nmax MY_AP(nmax)
#define _cuda_ntypes MY_AP(cuda_ntypes)
#define _dtf MY_AP(dtf)
#define _dtv MY_AP(dtv)
#define _factor MY_AP(factor)
#define _virial MY_AP(virial)
#define _eng_vdwl MY_AP(eng_vdwl)
#define _eng_coul MY_AP(eng_coul)
#define _molecular MY_AP(molecular)
__device__ __constant__ unsigned _nall;
__device__ __constant__ unsigned _nghost;
__device__ __constant__ unsigned _nlocal;
__device__ __constant__ unsigned _nmax;
__device__ __constant__ unsigned _cuda_ntypes;
__device__ __constant__ V_CFLOAT _dtf;
__device__ __constant__ X_CFLOAT _dtv;
__device__ __constant__ V_CFLOAT _factor;
__device__ __constant__ ENERGY_CFLOAT* _virial;
__device__ __constant__ ENERGY_CFLOAT* _eng_vdwl;
__device__ __constant__ ENERGY_CFLOAT* _eng_coul;
__device__ __constant__ int _molecular;
//other general constants
#define _buffer MY_AP(buffer)
#define _flag MY_AP(flag)
#define _debugdata MY_AP(debugdata)
__device__ __constant__ void* _buffer;
__device__ __constant__ int* _flag;
__device__ __constant__ int* _debugdata;
// pointers to data fields on GPU are hold in constant space
// -> reduces register usage and number of parameters for kernelcalls
// will be variables of file scope in cuda files
// maybe used to output cudaError_t
#define MY_OUTPUT_RESULT(result) \
switch(result) \
{ \
case cudaSuccess: printf(" => cudaSuccess\n"); break; \
case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
default: printf(" => unknown\n"); break; \
}
#ifdef _DEBUG
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#else
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#endif
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
#define X_MASK 1
#define V_MASK 2
#define F_MASK 4
#define TAG_MASK 8
#define TYPE_MASK 16
#define MASK_MASK 32
#define IMAGE_MASK 64
#define Q_MASK 128
#define MOLECULE_MASK 256
#define RMASS_MASK 512
#define RADIUS_MASK 1024
#define DENSITY_MASK 2048
#define OMEGA_MASK 4096
#define TORQUE_MASK 8192
#endif // #ifndef _CUDA_COMMON_H_

View File

@ -1 +0,0 @@
extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata);

View File

@ -1,220 +0,0 @@
enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
#include "cuda_data_cu.h"
#include "cuda_wrapper_cu.h"
#include "cuda_data_kernel.cu"
#include <cstdio>
// Upload host double data into the device staging buffer, then launch a
// kernel that converts/reorders it into the float array dev_data (layout
// selected by mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for
// lower-rank data.
void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  // Pick a block size that scales with the element count.
  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  grid.x = ((size - 1) + threads.x) / threads.x; // ceil(size / threads.x)

  if(grid.x > 32000)
    grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * threads.x < size) grid.y++;

  // BUGFIX: removed leftover debug scaffolding (a printf per upload, a stack
  // VLA of 'size' floats that could overflow the stack for large arrays, and
  // a device->host verification download with an O(size) comparison loop).
  // This also makes the function consistent with the other
  // CudaData_Upload_* variants.
  size *= sizeof(double);
  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_DoubleFloat <<< grid, threads>>>((double*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Upload host double data into the device staging buffer, then launch a
// kernel that copies/reorders it into the double array dev_data (layout
// selected by mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for
// lower-rank data.
void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Flatten the (up to) three dimensions into one element count.
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  dim3 block(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block size grows with the element count.
  if(nelem <= 128 * 30)      block.x = 32;
  else if(nelem <= 256 * 30) block.x = 64;
  else if(nelem <= 512 * 30) block.x = 128;
  else                       block.x = 256;

  grid.x = ((nelem - 1) + block.x) / block.x; // ceil(nelem / block.x)

  if(grid.x > 32000) grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * block.x < nelem) grid.y++;

  int nbytes = nelem * sizeof(double);
  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_DoubleDouble <<< grid, block>>>((double*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Upload host float data into the device staging buffer, then launch a
// kernel that converts/reorders it into the double array dev_data (layout
// selected by mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for
// lower-rank data.
void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Flatten the (up to) three dimensions into one element count.
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  dim3 block(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block size grows with the element count.
  if(nelem <= 128 * 30)      block.x = 32;
  else if(nelem <= 256 * 30) block.x = 64;
  else if(nelem <= 512 * 30) block.x = 128;
  else                       block.x = 256;

  grid.x = ((nelem - 1) + block.x) / block.x; // ceil(nelem / block.x)

  if(grid.x > 32000) grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * block.x < nelem) grid.y++;

  int nbytes = nelem * sizeof(float);
  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_FloatDouble <<< grid, block>>>((float*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Upload host float data into the device staging buffer, then launch a
// kernel that copies/reorders it into the float array dev_data (layout
// selected by mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for
// lower-rank data.
void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Flatten the (up to) three dimensions into one element count.
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  dim3 block(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block size grows with the element count.
  if(nelem <= 128 * 30)      block.x = 32;
  else if(nelem <= 256 * 30) block.x = 64;
  else if(nelem <= 512 * 30) block.x = 128;
  else                       block.x = 256;

  grid.x = ((nelem - 1) + block.x) / block.x; // ceil(nelem / block.x)

  if(grid.x > 32000) grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * block.x < nelem) grid.y++;

  int nbytes = nelem * sizeof(float);
  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_FloatFloat <<< grid, block>>>((float*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Upload host int data into the device staging buffer, then launch a kernel
// that copies/reorders it into the int array dev_data (layout selected by
// mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for lower-rank
// data.
void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Flatten the (up to) three dimensions into one element count.
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  dim3 block(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block size grows with the element count.
  if(nelem <= 128 * 30)      block.x = 32;
  else if(nelem <= 256 * 30) block.x = 64;
  else if(nelem <= 512 * 30) block.x = 128;
  else                       block.x = 256;

  grid.x = ((nelem - 1) + block.x) / block.x; // ceil(nelem / block.x)

  if(grid.x > 32000) grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * block.x < nelem) grid.y++;

  int nbytes = nelem * sizeof(int);
  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_IntInt <<< grid, block>>>((int*)buffer, (int*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Intentionally empty stub: a download-with-layout-conversion counterpart to
// the CudaData_Upload_* functions was declared but never implemented.
void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer)
{
}

View File

@ -1,13 +0,0 @@
#ifndef CUDA_DATA_CU_H_
#define CUDA_DATA_CU_H_
extern "C" void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer);
#endif /*CUDA_DATA_CU_H_*/

View File

@ -1,195 +0,0 @@
// One thread per element: convert the staged double buffer into the float
// device array, optionally reordering the layout (yx = 2-D transpose,
// xzy = swap of the last two indices of a 3-D array; the other modes are
// element-wise copies).
__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: every case previously fell through into the following ones, so
  // e.g. mode xx also executed the yx transpose write (and divided by ny,
  // which may be 0). Each case now ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// One thread per element: copy the staged double buffer into the double
// device array, optionally reordering the layout (yx = 2-D transpose,
// xzy = swap of the last two indices of a 3-D array).
__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: cases previously fell through into each other; each case now
  // ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// One thread per element: convert the staged float buffer into the double
// device array, optionally reordering the layout (yx = 2-D transpose,
// xzy = swap of the last two indices of a 3-D array).
__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: cases previously fell through into each other; each case now
  // ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// One thread per element: copy the staged float buffer into the float
// device array, optionally reordering the layout (yx = 2-D transpose,
// xzy = swap of the last two indices of a 3-D array).
__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: cases previously fell through into each other; each case now
  // ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// One thread per element: copy the staged int buffer into the int device
// array, optionally reordering the layout (yx = 2-D transpose, xzy = swap
// of the last two indices of a 3-D array).
__global__ void CudaData_Upload_Kernel_IntInt(int* buffer, int* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: cases previously fell through into each other; each case now
  // ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}

File diff suppressed because it is too large Load Diff

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag);

File diff suppressed because it is too large Load Diff

View File

@ -1,126 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Dynamic shared memory used for the block-level reductions below.
extern __shared__ ENERGY_CFLOAT sharedmem[];
// Block-wide reduction of per-thread energy/virial accumulators held in
// shared memory. Layout (each slot is blockDim.x wide):
//   [evdwl] [ecoul, only if coulflag] [6 virial components, only if vflag].
// Thread 0 writes each reduced value, scaled by 0.5 (presumably to undo
// pair double counting -- confirm), into _buffer as per-block partials,
// one gridDim.x*gridDim.y slab per quantity, for a later cross-block pass.
static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, int coulflag = 0)
{
__syncthreads();
ENERGY_CFLOAT* shared = sharedmem;
if(eflag) {
reduceBlock(shared);
// advance to the next blockDim.x-wide slot
shared += blockDim.x;
if(coulflag) {
reduceBlock(shared);
shared += blockDim.x;
}
}
if(vflag) {
// reduce all six virial slots in place
reduceBlock(shared + 0 * blockDim.x);
reduceBlock(shared + 1 * blockDim.x);
reduceBlock(shared + 2 * blockDim.x);
reduceBlock(shared + 3 * blockDim.x);
reduceBlock(shared + 4 * blockDim.x);
reduceBlock(shared + 5 * blockDim.x);
}
if(threadIdx.x == 0) {
// thread 0 publishes this block's partial sums
shared = sharedmem;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
shared += blockDim.x;
// skip to the next quantity's slab in _buffer
buffer += gridDim.x * gridDim.y;
if(coulflag) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
shared += blockDim.x;
buffer += gridDim.x * gridDim.y;
}
}
if(vflag) {
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[0 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[1 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[2 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[3 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[4 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[5 * blockDim.x];
}
}
__syncthreads();
}
// Second reduction pass: sums the n per-block partials of one quantity per
// launched block and accumulates the totals into the global scalars.
// gridDim.x encodes which quantities are present in _buffer:
//   1: evdwl; 2: evdwl+ecoul; 6: virial only; 7: evdwl+virial;
//   8: evdwl+ecoul+virial.
__global__ void MY_AP(PairVirialCompute_reduce)(int n)
{
sharedmem[threadIdx.x] = ENERGY_F(0.0);
ENERGY_CFLOAT sum = ENERGY_F(0.0);
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
// this block's slab of n partials
buf = &buf[blockIdx.x * n];
//if(blockIdx.x==2) buf=&buf[n];
// reduce the slab blockDim.x elements at a time; tail is padded with zeros
for(int i = 0; i < n; i += blockDim.x) {
sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0);
__syncthreads();
reduceBlock(sharedmem);
if(threadIdx.x == 0) sum += sharedmem[0];
}
// thread 0 routes the total to the right destination based on gridDim.x
if(threadIdx.x == 0) {
if(gridDim.x == 1) { //evdwl
_eng_vdwl[0] += sum;
}
if(gridDim.x == 2) { //evdwl + ecoul only
if(blockIdx.x == 0)
_eng_vdwl[0] += sum;
else
_eng_coul[0] += sum;
}
if(gridDim.x == 6) { //virial
_virial[blockIdx.x] += sum;
}
if(gridDim.x == 7) { //evdwl+virial
if(blockIdx.x == 0)
_eng_vdwl[0] += sum;
else _virial[blockIdx.x - 1] += sum;
}
if(gridDim.x == 8) { //evdwl+ecoul+virial
if(blockIdx.x == 0)
_eng_vdwl[0] += sum;
else if(blockIdx.x == 1)
_eng_coul[0] += sum;
else
_virial[blockIdx.x - 2] += sum;
}
}
}

View File

@ -1,278 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_PRECISION_H_
#define CUDA_PRECISION_H_
/* Type definitions for mixed-precision computation in the CUDA part of
 * LAMMPS-CUDA.  The default behaviour is set by the global CUDA_PRECISION
 * macro (1 = single, 2 = double; can be overridden at compile time), and each
 * physical quantity (FFT, PPPM, force, energy, position, velocity) may choose
 * its own precision independently.  For each quantity XX:
 *   XX_CFLOAT : the floating-point type used for that quantity
 *   XX_F(x)   : literal-suffix helper -- XX_F(1.0) expands to 1.0f in single
 *               precision and to 1.0 in double precision
 */
// Optional virtual-particle-grid (binning) support: CUDA_IF_BINNING(code)
// expands to its argument only when CUDA_USE_BINNING is defined.
#ifdef CUDA_USE_BINNING
#define CUDA_IF_BINNING(a) a
#else
#define CUDA_IF_BINNING(a)
#endif
//GLOBAL default precision; 2 (double) if not specified by the build
#ifdef CUDA_PRECISION
#if CUDA_PRECISION == 1
#define CUDA_CFLOAT float
#define CUDA_F(x) x##f
#endif
#if CUDA_PRECISION == 2
#define CUDA_CFLOAT double
#define CUDA_F(x) x
#endif
#endif
#ifndef CUDA_PRECISION
#define CUDA_CFLOAT double
#define CUDA_F(x) x
#define CUDA_PRECISION 2
#endif
//--------------------------------
//-----------FFT-----------------
//--------------------------------
// FFT precision; falls back to the global setting when FFT_PRECISION_CU is
// not defined by the build.
#ifdef FFT_PRECISION_CU
#if FFT_PRECISION_CU == 1
#define FFT_CFLOAT float
#define FFT_F(x) x##f
#endif
#if FFT_PRECISION_CU == 2
#define FFT_CFLOAT double
#define FFT_F(x) x
#endif
#endif
#ifndef FFT_PRECISION_CU
#define FFT_CFLOAT CUDA_CFLOAT
#define FFT_F(x) CUDA_F(x)
#define FFT_PRECISION_CU CUDA_PRECISION
#endif
//--------------------------------
//-----------PPPM-----------------
//--------------------------------
// PPPM (long-range electrostatics) precision; defaults to the global setting.
#ifndef PPPM_PRECISION
#define PPPM_PRECISION CUDA_PRECISION
#endif
#ifdef PPPM_PRECISION
#if PPPM_PRECISION == 1
#define PPPM_CFLOAT float
// NOTE(review): float3 is a CUDA vector *type*, not a macro, so this #ifdef
// is normally false and the struct fallback below is what gets used — confirm.
#ifdef float3
#define PPPM_CFLOAT3 float3
#else
struct PPPM_CFLOAT3 {
  PPPM_CFLOAT x;
  PPPM_CFLOAT y;
  PPPM_CFLOAT z;
};
#endif
#define PPPM_F(x) x##f
#endif
#if PPPM_PRECISION == 2
#define PPPM_CFLOAT double
struct PPPM_CFLOAT3 {
  PPPM_CFLOAT x;
  PPPM_CFLOAT y;
  PPPM_CFLOAT z;
};
#define PPPM_F(x) x
#endif
#endif
//--------------------------------
//-----------FORCE-----------------
//--------------------------------
// Force computation precision; defaults to the global setting.
#ifdef F_PRECISION
#if F_PRECISION == 1
#define F_CFLOAT float
#define F_F(x) x##f
#endif
#if F_PRECISION == 2
#define F_CFLOAT double
#define F_F(x) x
#endif
#endif
#ifndef F_PRECISION
#define F_CFLOAT CUDA_CFLOAT
#define F_F(x) CUDA_F(x)
#define F_PRECISION CUDA_PRECISION
#endif
// Pick the math intrinsics matching the force precision.
#if F_PRECISION == 1
#define _SQRT_ sqrtf
#define _RSQRT_ rsqrtf
#define _EXP_ expf
#else
#define _SQRT_ sqrt
#define _RSQRT_ rsqrt
#define _EXP_ exp
#endif
// Vector helper types; in single precision reuse CUDA's native float2/3/4.
#if F_PRECISION == 2
struct F_CFLOAT2 {
  F_CFLOAT x;
  F_CFLOAT y;
};
struct F_CFLOAT3 {
  F_CFLOAT x;
  F_CFLOAT y;
  F_CFLOAT z;
};
struct F_CFLOAT4 {
  F_CFLOAT x;
  F_CFLOAT y;
  F_CFLOAT z;
  F_CFLOAT w;
};
#else
#define F_CFLOAT2 float2
#define F_CFLOAT3 float3
#define F_CFLOAT4 float4
#endif
//--------------------------------
//-----------ENERGY-----------------
//--------------------------------
// Energy/virial accumulation precision.  ENERGY_PRECISION may be set by the
// build (1 = float, 2 = double); otherwise it inherits the global
// CUDA_PRECISION setting, exactly like the FFT/FORCE/X/V sections above.
// (A redundant leading "#ifndef ENERGY_PRECISION" fallback that duplicated
// the one below — a benign identical redefinition — has been removed for
// consistency with the other sections.)
#ifdef ENERGY_PRECISION
#if ENERGY_PRECISION == 1
#define ENERGY_CFLOAT float
#define ENERGY_F(x) x##f
#endif
#if ENERGY_PRECISION == 2
#define ENERGY_CFLOAT double
#define ENERGY_F(x) x
#endif
#endif
#ifndef ENERGY_PRECISION
#define ENERGY_CFLOAT CUDA_CFLOAT
#define ENERGY_F(x) CUDA_F(x)
#define ENERGY_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------POSITIONS------------
//--------------------------------
// Position precision; defaults to the global setting.
#ifdef X_PRECISION
#if X_PRECISION == 1
#define X_CFLOAT float
#define X_F(x) x##f
#endif
#if X_PRECISION == 2
#define X_CFLOAT double
#define X_F(x) x
#endif
#endif
#ifndef X_PRECISION
#define X_CFLOAT CUDA_CFLOAT
#define X_F(x) CUDA_F(x)
#define X_PRECISION CUDA_PRECISION
#endif
// Position vector helpers; reuse native float2/3/4 in single precision.
#if X_PRECISION == 2
struct X_CFLOAT2 {
  X_CFLOAT x;
  X_CFLOAT y;
};
struct X_CFLOAT3 {
  X_CFLOAT x;
  X_CFLOAT y;
  X_CFLOAT z;
};
struct X_CFLOAT4 {
  X_CFLOAT x;
  X_CFLOAT y;
  X_CFLOAT z;
  X_CFLOAT w;
};
#else
#define X_CFLOAT2 float2
#define X_CFLOAT3 float3
#define X_CFLOAT4 float4
#endif
//--------------------------------
//-----------velocities-----------
//--------------------------------
// Velocity precision; defaults to the global setting.
#ifdef V_PRECISION
#if V_PRECISION == 1
#define V_CFLOAT float
#define V_F(x) x##f
#endif
#if V_PRECISION == 2
#define V_CFLOAT double
#define V_F(x) x
#endif
#endif
#ifndef V_PRECISION
#define V_CFLOAT CUDA_CFLOAT
#define V_F(x) CUDA_F(x)
#define V_PRECISION CUDA_PRECISION
#endif
#if V_PRECISION == 2
struct V_CFLOAT4 {
  V_CFLOAT x;
  V_CFLOAT y;
  V_CFLOAT z;
  V_CFLOAT w;
};
#else
#define V_CFLOAT4 float4
#endif
// Timing support: with NO_PREC_TIMING, my_gettime() is a no-op stub.
// NOTE(review): the stub leaves my_times fields uninitialised, so timing
// deltas are meaningless (but harmless) in that configuration — confirm.
#ifdef NO_PREC_TIMING
struct my_times {
  unsigned int tv_sec;
  unsigned int tv_nsec;
};
#define my_gettime(a,b)
#else
#define my_times timespec
#define my_gettime(a,b) clock_gettime(a,b)
#endif
#endif /*CUDA_PRECISION_H_*/

View File

@ -1,370 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"
#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int)
// Generic handle for an array living in device memory.
struct dev_array {
  void* dev_data; // pointer to memory address on cuda device
  unsigned dim[3]; // array dimensions
};
struct cuda_shared_atom { // relevant data from atom class
  dev_array dx; // cumulated distance for binning settings
  dev_array x; // position
  dev_array v; // velocity
  dev_array f; // force
  dev_array tag; // global atom IDs (the "global ID number" comment below likely belongs here — verify)
  dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1)
  dev_array mask; // group membership bitmask
  dev_array image; // periodic image flags
  dev_array q; // charges
  dev_array mass; // per-type masses
  dev_array rmass; // per-atom masses
  dev_array radius; // per-atom radius
  dev_array density;
  dev_array omega;
  dev_array torque;
  dev_array molecule;
  dev_array special;
  int maxspecial;
  dev_array nspecial;
  int* special_flag;
  int molecular;
  dev_array eatom; // per-atom energy
  dev_array vatom; // per-atom virial
  int need_eatom;
  int need_vatom;
  dev_array x_type; // position + type in X_CFLOAT4 struct
  dev_array v_radius; // velocity + radius in V_CFLOAT4 struct currently only used for granular atom_style
  dev_array omega_rmass; // omega + rmass packed in V_CFLOAT4 (original comment was a copy/paste of v_radius) — granular atom_style only
  double* mass_host; // remember per-type host pointer to masses
  //int natoms; // total # of atoms in system, could be 0
  int nghost; // and ghost atoms on this proc
  int nlocal; // # of owned
  int nall; // total # of atoms in this proc
  int nmax; // max # of owned+ghost in arrays on this proc
  int ntypes;
  int q_flag; // do we have charges?
  int rmass_flag; // do we have per-atom masses?
  int firstgroup;
  int nfirst;
  int update_nlocal;
  int update_nmax;
  int update_neigh;
  dev_array xhold; // position at last neighboring
  X_CFLOAT triggerneighsq; // maximum square movement before reneighboring
  int reneigh_flag; // is reneighboring necessary
  int maxhold; // size of xhold
  int dist_check; //perform distance check for reneighboring
  dev_array binned_id; //id of each binned atom (not tag!!)
  dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]]
  float bin_extraspace;
  int bin_dim[3];
  int bin_nmax;
  dev_array map_array;
};
struct cuda_shared_pair { // relevant data from pair class
  char cudable_force; // check for (cudable_force!=0)
  X_CFLOAT cut_global;
  X_CFLOAT cut_inner_global;
  X_CFLOAT cut_coul_global;
  double** cut; // type-type cutoff
  double** cutsq; // type-type cutoff
  double** cut_inner; // type-type cutoff for coul
  double** cut_coul; // type-type cutoff for coul
  double** coeff1; // type-type pair parameters
  double** coeff2;
  double** coeff3;
  double** coeff4;
  double** coeff5;
  double** coeff6;
  double** coeff7;
  double** coeff8;
  double** coeff9;
  double** coeff10;
  double** offset;
  double* special_lj; // special-bond scale factors (LJ)
  double* special_coul; // special-bond scale factors (coulomb)
  dev_array virial; // ENERGY_CFLOAT
  dev_array eng_vdwl; // ENERGY_CFLOAT
  dev_array eng_coul; // ENERGY_CFLOAT
  X_CFLOAT cut_coulsq_global;
  F_CFLOAT g_ewald, kappa;
  int freeze_group_bit;
  // device-side (gm = GPU memory) copies of the coeff tables above
  dev_array coeff1_gm;
  dev_array coeff2_gm;
  dev_array coeff3_gm;
  dev_array coeff4_gm;
  dev_array coeff5_gm;
  dev_array coeff6_gm;
  dev_array coeff7_gm;
  dev_array coeff8_gm;
  dev_array coeff9_gm;
  dev_array coeff10_gm;
  int lastgridsize;
  int n_energy_virial;
  int collect_forces_later;
  int use_block_per_atom;
  int override_block_per_atom;
  bool neighall;
};
struct cuda_shared_domain { // relevant data from domain class
  X_CFLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
  X_CFLOAT subhi[3];
  X_CFLOAT boxlo[3];
  X_CFLOAT boxhi[3];
  X_CFLOAT prd[3];
  int periodicity[3]; // xyz periodicity as array
  int triclinic;
  // triclinic tilt factors and lamda (fractional) coordinates
  X_CFLOAT xy;
  X_CFLOAT xz;
  X_CFLOAT yz;
  X_CFLOAT boxlo_lamda[3];
  X_CFLOAT boxhi_lamda[3];
  X_CFLOAT prd_lamda[3];
  X_CFLOAT h[6];
  X_CFLOAT h_inv[6];
  V_CFLOAT h_rate[6];
  int update;
};
// Shared state for the PPPM (long-range electrostatics) GPU implementation.
struct cuda_shared_pppm {
  char cudable_force;
#ifdef FFT_CUFFT
  // work arrays only needed when FFTs run on the GPU via cuFFT
  FFT_CFLOAT* work1;
  FFT_CFLOAT* work2;
  FFT_CFLOAT* work3;
  PPPM_CFLOAT* greensfn;
  PPPM_CFLOAT* fkx;
  PPPM_CFLOAT* fky;
  PPPM_CFLOAT* fkz;
  PPPM_CFLOAT* vg;
#endif
  int* part2grid;
  PPPM_CFLOAT* density_brick;
  int* density_brick_int;
  PPPM_CFLOAT density_intScale;
  PPPM_CFLOAT* vdx_brick;
  PPPM_CFLOAT* vdy_brick;
  PPPM_CFLOAT* vdz_brick;
  PPPM_CFLOAT* density_fft;
  ENERGY_CFLOAT* energy;
  ENERGY_CFLOAT* virial;
  // grid extents owned by this proc (in = owned, out = owned + ghost)
  int nxlo_in;
  int nxhi_in;
  int nxlo_out;
  int nxhi_out;
  int nylo_in;
  int nyhi_in;
  int nylo_out;
  int nyhi_out;
  int nzlo_in;
  int nzhi_in;
  int nzlo_out;
  int nzhi_out;
  int nx_pppm;
  int ny_pppm;
  int nz_pppm;
  PPPM_CFLOAT qqrd2e;
  int order;
  // float3 sublo;
  PPPM_CFLOAT* rho_coeff;
  int nmax;
  int nlocal;
  PPPM_CFLOAT* debugdata;
  PPPM_CFLOAT delxinv;
  PPPM_CFLOAT delyinv;
  PPPM_CFLOAT delzinv;
  int nlower;
  int nupper;
  PPPM_CFLOAT shiftone;
  PPPM_CFLOAT3* fH;
};
// Shared state for inter-proc communication buffers and swap plans.
struct cuda_shared_comm {
  int maxswap;
  int maxlistlength;
  dev_array pbc;
  dev_array slablo;
  dev_array slabhi;
  dev_array multilo;
  dev_array multihi;
  dev_array sendlist;
  int grow_flag;
  int comm_phase;
  int nsend;
  int* nsend_swap;
  int* send_size;
  int* recv_size;
  double** buf_send;
  void** buf_send_dev;
  double** buf_recv;
  void** buf_recv_dev;
  void* buffer;
  int buffer_size;
  double overlap_split_ratio;
};
struct cuda_shared_neighlist { // member of CudaNeighList, has no instance in cuda_shared_data
  int maxlocal;
  int inum; // # of I atoms neighbors are stored for local indices of I atoms
  int inum_border2;
  dev_array inum_border; // # of atoms which interact with border atoms
  dev_array ilist;
  dev_array ilist_border;
  dev_array numneigh;
  dev_array numneigh_inner;
  dev_array numneigh_border;
  dev_array firstneigh;
  dev_array neighbors;
  dev_array neighbors_border;
  dev_array neighbors_inner;
  int maxpage;
  dev_array page_pointers;
  dev_array* pages;
  int maxneighbors;
  int neigh_lists_per_page;
  double** cutneighsq; // per type-pair neighbor cutoff squared (host)
  CUDA_CFLOAT* cu_cutneighsq; // device copy of cutneighsq
  int* binned_id;
  int* bin_dim;
  int bin_nmax;
  float bin_extraspace;
  double maxcut;
  // neighbor exclusion data (by type, group, molecule)
  dev_array ex_type;
  int nex_type;
  dev_array ex1_bit;
  dev_array ex2_bit;
  int nex_group;
  dev_array ex_mol_bit;
  int nex_mol;
};
struct cuda_compile_settings { // this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files
  int prec_glob;
  int prec_x;
  int prec_v;
  int prec_f;
  int prec_pppm;
  int prec_fft;
  int cufft;
  int arch;
};
// Wall-clock timing accumulators (seconds) for profiling the CUDA code paths.
struct cuda_timings_struct {
  //Debug:
  double test1;
  double test2;
  //transfers
  double transfer_upload_tmp_constr;
  double transfer_download_tmp_deconstr;
  //communication
  double comm_forward_total;
  double comm_forward_mpi_upper;
  double comm_forward_mpi_lower;
  double comm_forward_kernel_pack;
  double comm_forward_kernel_unpack;
  double comm_forward_kernel_self;
  double comm_forward_upload;
  double comm_forward_download;
  double comm_exchange_total;
  double comm_exchange_mpi;
  double comm_exchange_kernel_pack;
  double comm_exchange_kernel_unpack;
  double comm_exchange_kernel_fill;
  double comm_exchange_cpu_pack;
  double comm_exchange_upload;
  double comm_exchange_download;
  double comm_border_total;
  double comm_border_mpi;
  double comm_border_kernel_pack;
  double comm_border_kernel_unpack;
  double comm_border_kernel_self;
  double comm_border_kernel_buildlist;
  double comm_border_upload;
  double comm_border_download;
  //pair forces
  double pair_xtype_conversion;
  double pair_kernel;
  double pair_virial;
  double pair_force_collection;
  //neighbor
  double neigh_bin;
  double neigh_build;
  double neigh_special;
  //PPPM
  double pppm_particle_map;
  double pppm_make_rho;
  double pppm_brick2fft;
  double pppm_poisson;
  double pppm_fillbrick;
  double pppm_fieldforce;
  double pppm_compute;
};
struct cuda_shared_data { // holds space for all relevant data from the different classes
  void* buffer; //holds temporary GPU data [data used in subroutines, which does not have to be consistent outside of that routine]
  int buffersize; //maxsize of buffer
  int buffer_new; //should be 1 if the pointer to buffer has changed
  void* flag;
  void* debugdata; //array for easily collecting debugdata from device class cuda contains the corresponding cu_debugdata and host array
  cuda_shared_atom atom;
  cuda_shared_pair pair;
  cuda_shared_domain domain;
  cuda_shared_pppm pppm;
  cuda_shared_comm comm;
  cuda_compile_settings compile_settings;
  cuda_timings_struct cuda_timings;
  int exchange_dim;
  int me; //mpi rank
  unsigned int datamask;
  int overlap_comm;
};
#endif // #ifndef _CUDA_SHARED_H_

View File

@ -1,337 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#include "cuda_shared.h"
#include "cuda_common.h"
#include "cuda_wrapper_cu.h"
#include "cuda_wrapper_kernel.cu"
// File-scope bookkeeping for the CUDA wrapper layer.
// NOTE(review): total_gpu_mem is a signed int counting bytes; it can overflow
// for allocations totalling >2GB — confirm whether that matters here.
static int CudaWrapper_total_gpu_mem = 0;
static double CudaWrapper_total_upload_time = 0;   // accumulated H->D copy time (s)
static double CudaWrapper_total_download_time = 0; // accumulated D->H copy time (s)
static double CudaWrapper_cpubuffer_upload_time = 0;
static double CudaWrapper_cpubuffer_download_time = 0;
static cudaStream_t* streams;                      // grown by CudaWrapper_AddStreams
static int nstreams = 0;                           // current number of streams
// Select and initialise a CUDA device for MPI rank 'me'.
//   ppn        : processes per node (used to map ranks to devices)
//   devicelist : optional explicit rank->device mapping; NULL = use devices
//                sorted by multiprocessor count (fastest first)
// Devices in "shared" compute mode are assigned explicitly per rank; otherwise
// the set of valid devices is handed to the runtime via cudaSetValidDevices.
// Exits the process on any unusable configuration.
// Fix vs. original: corrected the garbled error message
// "more GPUs per node when there are" -> "than there are".
void CudaWrapper_Init(int argc, char** argv, int me, int ppn, int* devicelist)
{
  MYDBG(printf("# CUDA: debug mode on\n");)
#if __DEVICE_EMULATION__
  printf("# CUDA: emulation mode on\n");
#else
  // modified from cutil.h
  // statics make repeated calls idempotent (re-init only in shared mode)
  static int deviceCount = 0;
  static bool sharedmode = false;
  if(deviceCount && !sharedmode) return;
  if(deviceCount && sharedmode) cudaThreadExit();
  CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));
  if(deviceCount == 0) {
    fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
    exit(EXIT_FAILURE);
  }
  MYDBG(printf("# CUDA There are %i devices supporting CUDA in this system.\n", deviceCount);)
  cudaDeviceProp deviceProp[deviceCount];
  for(int i = 0; i < deviceCount; i++)
    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&(deviceProp[i]), i));
  // sort device indices by multiprocessor count, descending (bubble sort)
  int dev_list[deviceCount];
  for(int i = 0; i < deviceCount; i++) dev_list[i] = i;
  for(int i = 0; i < deviceCount; i++) {
    for(int j = 0; j < deviceCount - 1 - i; j++)
      if(deviceProp[dev_list[j]].multiProcessorCount < deviceProp[dev_list[j + 1]].multiProcessorCount) {
        int k = dev_list[j];
        dev_list[j] = dev_list[j + 1];
        dev_list[j + 1] = k;
      }
  }
  // computeMode == 0 (cudaComputeModeDefault) means the device is shareable;
  // enable host-mapped memory on every device while probing
  for(int i = 0; i < deviceCount; i++) {
    if((deviceProp[dev_list[i]].computeMode == 0)) sharedmode = true;
    cudaSetDevice(i);
    cudaSetDeviceFlags(cudaDeviceMapHost);
  }
  if(sharedmode) {
    if(ppn && (me % ppn + 1) > deviceCount) {
      printf("Asking for more GPUs per node than there are. Reduce gpu/node setting.\n");
      exit(0);
    }
    int devicea = me % ppn;  // this rank's slot on its node
    if(devicelist) devicea = devicelist[devicea];
    else
      devicea = dev_list[devicea];
    if(devicea >= deviceCount) {
      printf("Asking for non existent GPU %i. Found only %i GPUs.\n", devicea, deviceCount);
      exit(0);
    }
    MYDBG(
      printf(" # CUDA myid: %i take device: %i\n", me, devicea);
    )
    CUDA_SAFE_CALL(cudaSetDevice(devicea));
  } else {
    // exclusive mode: let the runtime pick from the sorted candidate list
    CUDA_SAFE_CALL(cudaSetValidDevices(dev_list, deviceCount));
  }
  cudaThreadSynchronize();
  // sanity-check the chosen device's compute capability vs. build arch
  int dev;
  CUDA_SAFE_CALL(cudaGetDevice(&dev));
  if(deviceProp[dev].major < 1) {
    fprintf(stderr, "CUDA error: device does not support CUDA.\n");
    exit(EXIT_FAILURE);
  } else if((deviceProp[dev].major == 1) && (deviceProp[dev].minor != 3)) {
    fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n", dev, deviceProp[dev].name, deviceProp[dev].major, deviceProp[dev].minor);
    exit(EXIT_FAILURE);
  }
  if((deviceProp[dev].major == 2) && (CUDA_ARCH < 20)) {
    fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n", deviceProp[dev].major, deviceProp[dev].minor);
  }
  if((deviceProp[dev].major == 1) && (CUDA_ARCH >= 20)) {
    fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n", CUDA_ARCH);
    exit(EXIT_FAILURE);
  }
  fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);
  MYDBG(fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);)
  MYDBG
  (
    printf("name = %s\n", deviceProp[dev].name);
    printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem);
    printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock);
    printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock);
    printf("warpSize = %i\n", deviceProp[dev].warpSize);
    printf("memPitch = %i\n", deviceProp[dev].memPitch);
    printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock);
    printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]);
    printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]);
    printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem);
    printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor);
    printf("clockRate = %i\n", deviceProp[dev].clockRate);
    printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment);
    printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap);
    printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount);
    printf("computeMode = %i\n", deviceProp[dev].computeMode);
  )
#endif
}
// Allocate nbytes of device memory and track the total; returns device pointer.
void* CudaWrapper_AllocCudaData(unsigned nbytes)
{
  void* dev_data;
  CUDA_SAFE_CALL(cudaMalloc((void**)&dev_data, nbytes));
  MYDBG(printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data);)
  CudaWrapper_total_gpu_mem += nbytes;
  return dev_data;
}
// Synchronous host->device copy; accumulates wall time into the upload counter.
void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
  cudaThreadSynchronize();
  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);
  CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice));
  my_gettime(CLOCK_REALTIME, &time2);
  CudaWrapper_total_upload_time +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
}
// Asynchronous host->device copy on the given stream (not timed; caller must
// sync the stream before reusing host_data).
// Fix vs. original: the debug message wrongly said "downloading ... from device"
// (copy/paste from the download path).
void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
{
  MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
  cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice, streams[stream]);
}
// Synchronous device->host copy; accumulates wall time into the download counter.
void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
  cudaThreadSynchronize();
  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);
  CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost));
  my_gettime(CLOCK_REALTIME, &time2);
  CudaWrapper_total_download_time +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
}
// Asynchronous device->host copy on the given stream (not timed).
void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
{
  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
  cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost, streams[stream]);
}
// Free device memory; nbytes is only used to keep the allocation counter honest.
void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: freeing memory at dev%p with %i bytes (last adress: %p)\n", dev_data, nbytes, (char*)dev_data + nbytes);)
  CUDA_SAFE_CALL(cudaFree(dev_data));
  CudaWrapper_total_gpu_mem -= nbytes;
}
// Fill nbytes of device memory with the given byte value.
void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes)
{
  MYDBG(printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data);)
  CUDA_SAFE_CALL(cudaMemset(dev_data, value, nbytes));
}
// Device-to-device copy of nbytes.
void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes)
{
  MYDBG(printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source, dev_dest);)
  CUDA_SAFE_CALL(cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice));
}
// Allocate page-locked (pinned) host memory; optionally mapped into the
// device address space and/or write-combined for faster H->D transfers.
void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped, bool writeCombined)
{
  void* host_data;
  int flags = 0;
  if(mapped) flags = flags | cudaHostAllocMapped;
  if(writeCombined) flags = flags | cudaHostAllocWriteCombined;
  CUDA_SAFE_CALL(cudaHostAlloc((void**)&host_data, nbytes, flags));
  // CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) );
  MYDBG(printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data);)
  return host_data;
}
// Free pinned host memory allocated by CudaWrapper_AllocPinnedHostData.
// NULL is tolerated (no-op).
void CudaWrapper_FreePinnedHostData(void* host_data)
{
  MYDBG(printf("# CUDA: freeing pinned host memory at %p \n", host_data);)
  if(host_data)
    CUDA_SAFE_CALL(cudaFreeHost(host_data));
}
// Print the last CUDA error (clears the error state) prefixed with 'comment'.
// Note: prints unconditionally, even when the last call succeeded.
void cuda_check_error(char* comment)
{
  printf("ERROR-CUDA %s %s\n", comment, cudaGetErrorString(cudaGetLastError()));
}
// Return the number of bytes currently in use on the device, as reported by
// the driver (total - free), not by our own allocation counter.
int CudaWrapper_CheckMemUsage()
{
  size_t free, total;
  cudaMemGetInfo(&free, &total);
  return total - free; //possible with cuda 3.0 ???
  //return CudaWrapper_total_gpu_mem;
}
// Accessors for the file-scope transfer-time counters.  Each returns the
// accumulated time in seconds; passing reset=true clears the counter first,
// so the call then returns 0.
double CudaWrapper_CheckUploadTime(bool reset)
{
  if(reset) {
    CudaWrapper_total_upload_time = 0.0;
  }
  return CudaWrapper_total_upload_time;
}

double CudaWrapper_CheckDownloadTime(bool reset)
{
  if(reset) {
    CudaWrapper_total_download_time = 0.0;
  }
  return CudaWrapper_total_download_time;
}

double CudaWrapper_CheckCPUBufUploadTime(bool reset)
{
  if(reset) {
    CudaWrapper_cpubuffer_upload_time = 0.0;
  }
  return CudaWrapper_cpubuffer_upload_time;
}

double CudaWrapper_CheckCPUBufDownloadTime(bool reset)
{
  if(reset) {
    CudaWrapper_cpubuffer_download_time = 0.0;
  }
  return CudaWrapper_cpubuffer_download_time;
}

// Externally-measured CPU-buffer pack/unpack times are folded in here.
void CudaWrapper_AddCPUBufUploadTime(double dt)
{
  CudaWrapper_cpubuffer_upload_time += dt;
}

void CudaWrapper_AddCPUBufDownloadTime(double dt)
{
  CudaWrapper_cpubuffer_download_time += dt;
}
// Block the host until all previously issued device work has completed.
void CudaWrapper_Sync()
{
  cudaThreadSynchronize();
}
// Block the host until all work queued on the given stream has completed.
void CudaWrapper_SyncStream(int stream)
{
  cudaStreamSynchronize(streams[stream]);
}
// Grow the stream pool by n new streams, preserving the existing ones.
// (streams is a file-scope static, zero-initialised, so the first call
// starts from an empty pool.)
void CudaWrapper_AddStreams(int n)
{
  cudaStream_t* new_streams = new cudaStream_t[nstreams + n];
  for(int i = 0; i < nstreams; i++) new_streams[i] = streams[i];
  for(int i = nstreams; i < nstreams + n; i++) cudaStreamCreate(&new_streams[i]);
  if(nstreams > 0)
    delete [] streams;
  streams = new_streams;
  nstreams += n;
}
// Expose the raw stream array (as void* to keep CUDA types out of callers).
void* CudaWrapper_returnStreams()
{
  return (void*) streams;
}
// Number of streams currently in the pool.
int CudaWrapper_returnNStreams()
{
  return nstreams;
}

View File

@ -1,52 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_DATA_WRAPPER_H_
#define _CUDA_DATA_WRAPPER_H_
// C-linkage interface to the CUDA wrapper layer (device setup, device/host
// memory management, transfers, timing counters, and stream management).
// See cuda_wrapper.cu for the definitions and per-function documentation.
// Fix vs. original: parameter name 'writeCombind' corrected to 'writeCombined'
// to match the definition (declaration-only, no ABI impact).
extern "C" void  CudaWrapper_Init(int argc, char** argv, int me = 0, int ppn = 2, int* devicelist = NULL);
extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes);
extern "C" void  CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void  CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void  CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void  CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void  CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes = 0);
extern "C" void  CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes);
extern "C" void  CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes);
extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped = false, bool writeCombined = false);
extern "C" void  CudaWrapper_FreePinnedHostData(void* dev_data);
extern "C" void  cuda_check_error(char* comment);
extern "C" int   CudaWrapper_CheckMemUsage();
// Timing queries: reset=true clears the counter (the call then returns 0).
extern "C" double CudaWrapper_CheckUploadTime(bool reset = false);
extern "C" double CudaWrapper_CheckDownloadTime(bool reset = false);
extern "C" double CudaWrapper_CheckCPUBufUploadTime(bool reset = false);
extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset = false);
extern "C" void  CudaWrapper_AddCPUBufUploadTime(double dt);
extern "C" void  CudaWrapper_AddCPUBufDownloadTime(double dt);
// Stream management.
extern "C" void  CudaWrapper_Sync();
extern "C" void  CudaWrapper_SyncStream(int n);
extern "C" void  CudaWrapper_AddStreams(int n);
extern "C" void* CudaWrapper_returnStreams();
extern "C" int   CudaWrapper_returnNStreams();
#endif // _CUDA_DATA_WRAPPER_H_

View File

@ -1,24 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// empty file to obey common make rule

View File

@ -1,202 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX domain
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "domain_cu.h"
#include "domain_kernel.cu"
void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata, int size)
{
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_Domain Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*));
}
void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(boxhi) , sdata->domain.boxhi , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , 3 * sizeof(int));
cudaMemcpyToSymbol(MY_AP(triclinic) , & sdata->domain.triclinic , sizeof(int));
cudaMemcpyToSymbol(MY_AP(boxlo_lamda) , sdata->domain.boxlo_lamda , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(boxhi_lamda) , sdata->domain.boxhi_lamda , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(prd_lamda) , sdata->domain.prd_lamda , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , 6 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(h_inv) , sdata->domain.h_inv , 6 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(h_rate) , sdata->domain.h_rate , 6 * sizeof(V_CFLOAT));
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*));
cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata , sizeof(int*));
}
// One-time initialization of this module's device symbols: pushes both
// the per-atom pointers/counts and the box geometry to the device.
void Cuda_Domain_Init(cuda_shared_data* sdata)
{
  Cuda_Domain_UpdateNmax(sdata);
  Cuda_Domain_UpdateDomain(sdata);
}
void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent)
{
Cuda_Domain_UpdateNmax(sdata);
//if(sdata->domain.update)
Cuda_Domain_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int box_change = 0;
if(extent) box_change = 1;
int sharedmem = 0;
if(box_change) sharedmem = 6 * sizeof(X_CFLOAT);
int3 layout = getgrid(sdata->atom.nlocal, sharedmem);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
sharedmem *= threads.x;
if((box_change) && (sdata->buffer_new or (6 * sizeof(X_CFLOAT)*grid.x * grid.y > sdata->buffersize)))
Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_CFLOAT));
Domain_PBC_Kernel <<< grid, threads, sharedmem>>>(deform_remap, deform_groupbit, box_change);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed");
if(box_change) {
X_CFLOAT buf2[6 * layout.x * layout.y];
X_CFLOAT* buf = buf2;
int flag;
cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
//printf("Flag: %i\n",flag);
X_CFLOAT min, max;
min = 1.0 * BIG;
max = -1.0 * BIG;
for(int i = 0; i < layout.x * layout.y; i++) {
if(buf[i] < min) min = buf[i];
if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y];
}
extent[0] = min;
extent[1] = max;
buf += 2 * layout.x * layout.y;
min = 1.0 * BIG;
max = -1.0 * BIG;
for(int i = 0; i < layout.x * layout.y; i++) {
if(buf[i] < min) min = buf[i];
if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y];
}
extent[2] = min;
extent[3] = max;
buf += 2 * layout.x * layout.y;
min = 1.0 * BIG;
max = -1.0 * BIG;
for(int i = 0; i < layout.x * layout.y; i++) {
if(buf[i] < min) min = buf[i];
if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y];
}
extent[4] = min;
extent[5] = max;
//printf("Extent: %lf %lf %lf %lf %lf %lf\n",extent[0],extent[1],extent[2],extent[3],extent[4],extent[5]);
/* int n=grid.x*grid.y;
if(n<128) threads.x=32;
else if(n<256) threads.x=64;
else threads.x=128;
sharedmem=n*sizeof(X_CFLOAT);
grid.x=6;
grid.y=1;
Domain_reduceBoxExtent<<<grid, threads,sharedmem>>>(extent,n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_reduceBoxExtent: Kernel execution failed");*/
}
}
// Convert the first n atoms from lamda (fractional) coordinates to box
// coordinates on the device (x = h * lamda + boxlo).
void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n)
{
  // refresh device symbols before launching
  Cuda_Domain_UpdateNmax(sdata);
  //if(sdata->domain.update)
  Cuda_Domain_UpdateDomain(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  int3 layout = getgrid(n);
  dim3 block_dim(layout.z, 1, 1);
  dim3 grid_dim(layout.x, layout.y, 1);

  Domain_lamda2x_Kernel <<< grid_dim, block_dim, 0>>>(n);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed");
}
// Convert the first n atoms from box coordinates to lamda (fractional)
// coordinates on the device (lamda = h_inv * (x - boxlo)).
void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n)
{
  // refresh device symbols before launching
  Cuda_Domain_UpdateNmax(sdata);
  //if(sdata->domain.update)
  Cuda_Domain_UpdateDomain(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  int3 layout = getgrid(n);
  dim3 block_dim(layout.z, 1, 1);
  dim3 grid_dim(layout.x, layout.y, 1);

  Domain_x2lamda_Kernel <<< grid_dim, block_dim, 0>>>(n);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed");
}

View File

@ -1,29 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host-side entry points for GPU domain operations.
// Pushes atom-array pointers and box geometry to device symbols.
extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata);
// Wraps atoms through periodic boundaries; when extent != NULL the box
// extent is also computed and returned as {xlo,xhi,ylo,yhi,zlo,zhi}.
extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent = NULL);
// Fractional (lamda) <-> box coordinate conversion for the first n atoms.
extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n);
extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n);

View File

@ -1,293 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ X_CFLOAT sharedmem[];
#define BIG 1e10
/* Wrap each local atom back into the periodic box (one thread per atom)
 * and update its packed image flags (three 10-bit counters: x in bits
 * 0-9, y in bits 10-19, z in bits 20-29).  When deform_remap is set,
 * atoms in deform_groupbit also get their velocity corrected by the box
 * deformation rate _h_rate.  When box_change is set, each block writes
 * six extent partials (min/max per dimension) into _buffer, laid out as
 * six consecutive stripes of gridDim.x*gridDim.y values, for the host
 * to reduce.
 *
 * BUGFIX: the non-periodic-y fallback used to assign lo[1]/hi[1] to
 * minx/maxx instead of miny/maxy, clobbering the x extent and leaving
 * the y extent at +/-BIG whenever y was periodic. */
__global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box_change)
{
  int idim, otherdims;
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  X_CFLOAT lo[3];
  X_CFLOAT hi[3];
  X_CFLOAT* period;

  // orthogonal boxes wrap in box coordinates, triclinic in lamda coords
  if(_triclinic == 0) {
    lo[0] = _boxlo[0];
    lo[1] = _boxlo[1];
    lo[2] = _boxlo[2];

    hi[0] = _boxhi[0];
    hi[1] = _boxhi[1];
    hi[2] = _boxhi[2];
    period = _prd;
  } else {
    lo[0] = _boxlo_lamda[0];
    lo[1] = _boxlo_lamda[1];
    lo[2] = _boxlo_lamda[2];

    hi[0] = _boxhi_lamda[0];
    hi[1] = _boxhi_lamda[1];
    hi[2] = _boxhi_lamda[2];
    period = _prd_lamda;
  }

  // Pre-fill this block's six extent slots with the box center so blocks
  // whose threads all lie beyond _nlocal still contribute harmless values.
  // NOTE(review): every thread of a block writes the same value to the
  // same slot here -- a benign race.
  X_CFLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]);
  X_CFLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]);
  X_CFLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]);

  X_CFLOAT* buf = (X_CFLOAT*) _buffer;
  buf += blockIdx.x * gridDim.y + blockIdx.y;
  buf[0] = tmpx;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpx;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpy;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpy;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpz;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpz;

  if(i < _nlocal) {

    // ---- x dimension ----
    if(_periodicity[0]) {
      if(_x[i] < lo[0]) {
        _x[i] += period[0];

        if(deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0];

        // decrement the 10-bit x image counter
        idim = _image[i] & 1023;
        otherdims = _image[i] ^ idim;
        idim--;
        idim &= 1023;
        _image[i] = otherdims | idim;
      }

      if(_x[i] >= hi[0]) {
        _x[i] -= period[0];
        _x[i] = MAX(_x[i], lo[0]);   // guard against round-off undershoot

        if(deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0];

        idim = _image[i] & 1023;
        otherdims = _image[i] ^ idim;
        idim++;
        idim &= 1023;
        _image[i] = otherdims | idim;
      }
    }

    // ---- y dimension (couples vx via the xy tilt rate _h_rate[5]) ----
    if(_periodicity[1]) {
      if(_x[i + _nmax] < lo[1]) {
        _x[i + _nmax] += period[1];

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] += _h_rate[5];
          _v[i + _nmax] += _h_rate[1];
        }

        idim = (_image[i] >> 10) & 1023;
        otherdims = _image[i] ^ (idim << 10);
        idim--;
        idim &= 1023;
        _image[i] = otherdims | (idim << 10);
      }

      if(_x[i + _nmax] >= hi[1]) {
        _x[i + _nmax] -= period[1];
        _x[i + _nmax] = MAX(_x[i + _nmax], lo[1]);

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] -= _h_rate[5];
          _v[i + _nmax] -= _h_rate[1];
        }

        idim = (_image[i] >> 10) & 1023;
        otherdims = _image[i] ^ (idim << 10);
        idim++;
        idim &= 1023;
        _image[i] = otherdims | (idim << 10);
      }
    }

    // ---- z dimension (couples vx/vy via tilt rates _h_rate[4]/[3]) ----
    if(_periodicity[2]) {
      if(_x[i + 2 * _nmax] < lo[2]) {
        _x[i + 2 * _nmax] += period[2];

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] += _h_rate[4];
          _v[i + _nmax] += _h_rate[3];
          _v[i + 2 * _nmax] += _h_rate[2];
        }

        idim = _image[i] >> 20;
        otherdims = _image[i] ^ (idim << 20);
        idim--;
        idim &= 1023;
        _image[i] = otherdims | (idim << 20);
      }

      if(_x[i + 2 * _nmax] >= hi[2]) {
        _x[i + 2 * _nmax] -= period[2];
        _x[i + 2 * _nmax] = MAX(_x[i + 2 * _nmax], lo[2]);

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] -= _h_rate[4];
          _v[i + _nmax] -= _h_rate[3];
          _v[i + 2 * _nmax] -= _h_rate[2];
        }

        idim = _image[i] >> 20;
        otherdims = _image[i] ^ (idim << 20);
        idim++;
        idim &= 1023;
        _image[i] = otherdims | (idim << 20);
      }
    }

    // capture the wrapped coordinates for the extent reduction below
    if(box_change) {
      tmpx = _x[i];
      tmpy = _x[i + _nmax];
      tmpz = _x[i + 2 * _nmax];
    }
  }

  __syncthreads();

  if(box_change) {
    X_CFLOAT minx = BIG;
    X_CFLOAT maxx = -BIG;
    X_CFLOAT miny = BIG;
    X_CFLOAT maxy = -BIG;
    X_CFLOAT minz = BIG;
    X_CFLOAT maxz = -BIG;

    // For a periodic dimension the extent is simply the box; only
    // non-periodic dimensions need the block-level min/max reduction.
    // (Threads with i >= _nlocal contribute the box center, which lies
    // inside any valid extent.)
    if(not _periodicity[0]) {
      sharedmem[threadIdx.x] = tmpx;
      minOfBlock(sharedmem);
      minx = sharedmem[0];
      __syncthreads();
      sharedmem[threadIdx.x] = tmpx;
      maxOfBlock(sharedmem);
      maxx = sharedmem[0];
      __syncthreads();
    } else {
      minx = lo[0];
      maxx = hi[0];
    }

    if(not _periodicity[1]) {
      sharedmem[threadIdx.x] = tmpy;
      minOfBlock(sharedmem);
      miny = sharedmem[0];
      __syncthreads();
      sharedmem[threadIdx.x] = tmpy;
      maxOfBlock(sharedmem);
      maxy = sharedmem[0];
      __syncthreads();
    } else {
      miny = lo[1];   // BUGFIX: was "minx = lo[1]"
      maxy = hi[1];   // BUGFIX: was "maxx = hi[1]"
    }

    if(not _periodicity[2]) {
      sharedmem[threadIdx.x] = tmpz;
      minOfBlock(sharedmem);
      minz = sharedmem[0];
      __syncthreads();
      sharedmem[threadIdx.x] = tmpz;
      maxOfBlock(sharedmem);
      maxz = sharedmem[0];
      __syncthreads();
    } else {
      minz = lo[2];
      maxz = hi[2];
    }

    // thread 0 publishes this block's six partials
    if(threadIdx.x == 0) {
      buf = (X_CFLOAT*) _buffer;
      buf += blockIdx.x * gridDim.y + blockIdx.y;
      buf[0] = minx;
      buf += gridDim.x * gridDim.y;
      buf[0] = maxx;
      buf += gridDim.x * gridDim.y;
      buf[0] = miny;
      buf += gridDim.x * gridDim.y;
      buf[0] = maxy;
      buf += gridDim.x * gridDim.y;
      buf[0] = minz;
      buf += gridDim.x * gridDim.y;
      buf[0] = maxz;
    }
  }
}
// Final device-side reduction of the per-block extent partials left in
// _buffer by Domain_PBC_Kernel.  Intended launch: gridDim.x == 6 (one
// block per extent component); even blocks reduce a minimum, odd blocks
// a maximum, matching the [min,max,min,max,min,max] stripe layout.
// NOTE(review): only referenced from commented-out code in
// Cuda_Domain_PBC -- the host loop currently performs this reduction.
__global__ void Domain_reduceBoxExtent(double* extent, int n)
{
  X_CFLOAT* buf = (X_CFLOAT*) _buffer;
  buf += blockIdx.x * n;            // this block's stripe of n partials
  copyGlobToShared(buf, sharedmem, n);

  if(blockIdx.x % 2 == 0)
    minOfData(sharedmem, n);        // components 0,2,4: minima
  else
    maxOfData(sharedmem, n);        // components 1,3,5: maxima

  extent[blockIdx.x] = sharedmem[0];
}
// Device kernel: convert atom i from lamda (fractional) to box
// coordinates, x = h * lamda + boxlo.  _h holds the 6 independent
// components of the upper-triangular box matrix; _h[5] couples y into x,
// _h[4] and _h[3] couple z into x and y respectively.
__global__ void Domain_lamda2x_Kernel(int n)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= n) return;

  // read y/z before the in-place overwrite of the x component
  const X_CFLOAT la_y = _x[i + _nmax];
  const X_CFLOAT la_z = _x[i + 2 * _nmax];

  _x[i] = _h[0] * _x[i] + _h[5] * la_y + _h[4] * la_z + _boxlo[0];
  _x[i + _nmax] = _h[1] * la_y + _h[3] * la_z + _boxlo[1];
  _x[i + 2 * _nmax] = _h[2] * la_z + _boxlo[2];
}
// Device kernel: convert atom i from box to lamda (fractional)
// coordinates, lamda = h_inv * (x - boxlo).
__global__ void Domain_x2lamda_Kernel(int n)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= n) return;

  const X_CFLOAT dx = _x[i] - _boxlo[0];
  const X_CFLOAT dy = _x[i + _nmax] - _boxlo[1];
  const X_CFLOAT dz = _x[i + 2 * _nmax] - _boxlo[2];

  _x[i] = _h_inv[0] * dx + _h_inv[5] * dy + _h_inv[4] * dz;
  _x[i + _nmax] = _h_inv[1] * dy + _h_inv[3] * dz;
  _x[i + 2 * _nmax] = _h_inv[2] * dz;
}

View File

@ -1,103 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
//#define CUDA_PRECISION 1
#include "cuda_precision.h"
#include "cuda_common.h"
// Complex value used by the CUDA FFT path; precision of FFT_CFLOAT is
// selected in cuda_precision.h.  Stored interleaved (re, im).
struct FFT_DATA {
  FFT_CFLOAT re;   // real part
  FFT_CFLOAT im;   // imaginary part
};
#include "fft3d_cuda_cu.h"
#include "fft3d_cuda_kernel.cu"
#include <stdio.h>
// Expand a real-valued double array of nfast*nmid*nslow elements into an
// interleaved complex FFT_CFLOAT array (imaginary parts zeroed).
// One block per (slow, mid) line, one thread per fast-axis element.
void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast, 1, 1);

  cudaThreadSynchronize();
  initfftdata_kernel <<< grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
// Permute (transpose) a complex 3d array on the device so the fast axis
// becomes the slowest.  Two threads per complex element (re + im).
void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast * 2, 1, 1);

  permute_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
// Permute a complex 3d array while applying the scale factor baked into
// permute_scale_kernel.  Same launch geometry as permute().
void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid;
  grid.x = nslow;
  grid.y = nmid;
  grid.z = 1;
  dim3 threads;
  threads.x = nfast * 2;
  threads.y = 1;
  threads.z = 1;
  // BUGFIX: previously launched permute_kernel, which permutes WITHOUT
  // scaling; permute_scale_kernel (otherwise unused) does both.
  permute_scale_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
  cudaThreadSynchronize();
}
// Permute only the sub-volume [ilo..ihi] x [jlo..jhi] x [klo..khi] of a
// complex 3d array.  One block per (i, j) pair of the sub-volume, two
// threads per complex element along k.
void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
{
  dim3 grid(ihi - ilo + 1, jhi - jlo + 1, 1);
  dim3 threads((khi - klo + 1) * 2, 1, 1);

  permute_part_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
  cudaThreadSynchronize();
}
// Block the host until all previously launched FFT-related kernels on the
// default stream have completed.
void FFTsyncthreads()
{
  cudaThreadSynchronize();
}

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Copy a real double array into an interleaved complex FFT_CFLOAT array.
extern "C" void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow);
// Axis permutation (transpose) of a complex 3d array on the device.
extern "C" void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
// Permutation variant that also applies a scale factor.
extern "C" void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
// Permutation restricted to the [ilo..ihi]x[jlo..jhi]x[klo..khi] sub-volume.
extern "C" void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo);
// Host-side barrier for the FFT kernels.
extern "C" void FFTsyncthreads();

View File

@ -1,46 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// One thread per real input element: write it as the real part of the
// interleaved complex output and zero the imaginary part.
__global__ void initfftdata_kernel(double* in, FFT_CFLOAT* out)
{
  const int idx = ((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x;

  out[2 * idx] = in[idx];   // real part
  out[2 * idx + 1] = 0;     // imaginary part
}
// Transpose an interleaved complex array: the fast axis (threads) becomes
// the slowest axis of the output.  threadIdx.x/2 is the complex index
// along the fast axis; the low bit selects re (0) or im (1).
__global__ void permute_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
{
  const int src = ((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x;
  const int cplx = threadIdx.x / 2;
  const int comp = threadIdx.x - 2 * cplx;   // 0 = re, 1 = im
  const int dst = 2 * ((cplx * gridDim.x + blockIdx.x) * gridDim.y + blockIdx.y) + comp;

  out[dst] = in[src];
}
// Same transpose as permute_kernel, but each value is multiplied by
// gridDim.x * gridDim.y * blockDim.x * 0.5 (the total complex element
// count, since blockDim.x counts re+im pairs).  The multiply chain is
// kept in the original order to preserve floating-point rounding.
__global__ void permute_scale_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
{
  const int src = ((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x;
  const int cplx = threadIdx.x / 2;
  const int comp = threadIdx.x - 2 * cplx;   // 0 = re, 1 = im
  const int dst = 2 * ((cplx * gridDim.x + blockIdx.x) * gridDim.y + blockIdx.y) + comp;

  out[dst] = in[src] * gridDim.x * gridDim.y * blockDim.x * 0.5;
}
// Transpose only a sub-volume of the input.  blockIdx.x/.y index the
// (i, j) position inside the sub-volume; threads cover the k range.
// NOTE(review): the "blockIdx.y - jlo" term in the destination index
// looks suspicious (blockIdx.y is already 0-based over the sub-volume);
// reproduced unchanged -- confirm against the caller.
__global__ void permute_part_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
{
  const int cplx = threadIdx.x / 2;
  const int comp = threadIdx.x - 2 * cplx;   // 0 = re, 1 = im
  const int ispan = ihi - ilo + 1;
  const int jspan = jhi - jlo + 1;

  const int dst = 2 * (cplx * ispan * jspan + blockIdx.x * jspan + blockIdx.y - jlo) + comp;
  const int src = 2 * (blockIdx.x + ilo) * nmid * nslow + 2 * (blockIdx.y + jlo) * nmid + threadIdx.x + 2 * klo;

  out[dst] = in[src];
}

View File

@ -1,93 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_add_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_addforce_cuda_cu.h"
#include "fix_addforce_cuda_kernel.cu"
// Ensure the shared device buffer can hold the per-block partial sums of
// this fix (4 F_CFLOAT values per block: energy + 3 force components) and
// publish the buffer address to the device symbol.
// Cleanup: removed dead dim3 locals (threads/grid) -- only the layout
// product is needed to size the buffer.
void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);

  if(sdata->buffersize < size) {
    // NOTE(review): buffersize is in bytes although the message says kB
    MYDBG(printf("Cuda_FixAddForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Refresh the atom counts and per-atom device array pointers used by this
// fix after a reallocation (nmax change).
void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(x), &sdata->atom.x.dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_CFLOAT*));
}
// One-time initialization: push the per-atom pointers/counts to the device.
void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixAddForceCuda_UpdateNmax(sdata);
}
// Add the constant force (axvalue, ayvalue, azvalue) to every atom in the
// group, and reduce (energy, fx, fy, fz) of the pre-update forces into
// aforiginal[0..3] via a two-pass block reduction.
void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal)
{
  // refresh device symbols / buffer only when host-side state changed
  if(sdata->atom.update_nmax)
    Cuda_FixAddForceCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixAddForceCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  // pass 1: per-block partial sums into the device buffer
  Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit, axvalue, ayvalue, azvalue);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed");

  // pass 2: one block per reduced quantity folds the partials together
  // NOTE(review): n passed to the reducer is grid.x only; this assumes
  // getgrid produced layout.y == 1 here -- confirm.
  const int oldgrid = grid.x;
  grid.x = 4;
  threads.x = 512;
  reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// One-time initialization of the fix addforce device symbols.
extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata);
// Adds a constant force to group atoms; returns the summed
// (energy, fx, fy, fz) of the original forces in aforiginal[0..3].
extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal);

View File

@ -1,90 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_CFLOAT sharedmem[];
/* Pass 1 of fix addforce: one thread per atom.  For group atoms, record
 * (energy, fx, fy, fz) of the pre-update force into shared memory and
 * then apply the constant force.  The energy term is -F.x (potential of
 * a constant force).  Each block reduces its shared arrays and writes
 * four partials into _buffer, in four stripes of gridDim.x*gridDim.y.
 * Non-group / out-of-range threads contribute zeros; all threads must
 * still execute the reduceBlock calls.
 * NOTE(review): no explicit __syncthreads() before reduceBlock --
 * presumably reduceBlock synchronizes internally; confirm against
 * crm_cuda_utils.cu. */
__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // zero all four shared-memory stripes so inactive threads are neutral
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit)
      //if (iregion >= 0 &&
      //match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported
    {
      // energy = -(added force) . (position)
      sharedmem[threadIdx.x] = -xvalue * _x[i] - yvalue * _x[i + 1 * _nmax] - zvalue * _x[i + 2 * _nmax];
      // original force components, before the update below
      sharedmem[threadIdx.x + blockDim.x] = _f[i];
      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 1 * _nmax];
      sharedmem[threadIdx.x + 3 * blockDim.x] = _f[i + 2 * _nmax];
      _f[i] += xvalue;
      _f[i + 1 * _nmax] += yvalue;
      _f[i + 2 * _nmax] += zvalue;
    }

  // block-wide sums of the four quantities (all threads participate)
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  F_CFLOAT* buffer = (F_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // publish this block's partials, one stripe per quantity
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
  }
}
/* Pass 2 of fix addforce: fold the n per-block partials of each of the
 * four quantities into one value.  Launched with gridDim.x == 4 (one
 * block per quantity); blockIdx.x selects the stripe of _buffer written
 * by the compute kernel.  Only thread 0 accumulates the running total
 * and writes the result.
 * NOTE(review): only thread 0 ever writes sharedmem[0] in the zeroing
 * step, so its read of sharedmem[0] after reduceBlock cannot be
 * clobbered by other threads starting the next chunk -- assuming
 * reduceBlock ends with a barrier; confirm against crm_cuda_utils.cu. */
__global__ void reduce_foriginal(int n, F_CFLOAT* foriginal)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  F_CFLOAT myforig = 0.0;
  F_CFLOAT* buf = (F_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];     // this block's stripe of n partials

  // sweep the partials in chunks of blockDim.x
  while(i < n) {
    sharedmem[threadIdx.x] = 0;

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    foriginal[blockIdx.x] = myforig;
}

View File

@ -1,107 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_ave_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_aveforce_cuda_cu.h"
#include "fix_aveforce_cuda_kernel.cu"
// Ensure the shared device buffer can hold the per-block partial sums of
// this fix (4 F_CFLOAT values per block: fx, fy, fz, group count) and
// publish the buffer address to the device symbol.
// Cleanup: removed dead dim3 locals (threads/grid) -- only the layout
// product is needed to size the buffer.
void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);

  if(sdata->buffersize < size) {
    // NOTE(review): buffersize is in bytes although the message says kB
    MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Refresh the atom counts and per-atom device array pointers used by this
// fix after a reallocation (nmax change).
void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(x), &sdata->atom.x.dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_CFLOAT*));
}
// One-time initialization: push the per-atom pointers/counts to the device.
void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixAveForceCuda_UpdateNmax(sdata);
}
// Reduce the original (fx, fy, fz) plus the group-member count over all
// group atoms into aforiginal[0..3] via a two-pass block reduction.
void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal)
{
  // refresh device symbols / buffer only when host-side state changed
  if(sdata->atom.update_nmax)
    Cuda_FixAveForceCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixAveForceCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  // pass 1: per-block partial sums into the device buffer
  Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed");

  // pass 2: one block per reduced quantity folds the partials together
  // NOTE(review): n passed to the reducer is grid.x only; this assumes
  // getgrid produced layout.y == 1 here -- confirm.
  const int oldgrid = grid.x;
  grid.x = 4;
  threads.x = 512;
  Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed");
}
// Overwrite the flagged force components of all group atoms with the
// given average values.
// Consistency fix: mirror the update guards of
// Cuda_FixAveForceCuda_PostForce_FOrg so the device symbols (nlocal,
// nmax, array pointers) are current even if _Set runs without a
// preceding _FOrg in the same step.  The symbol copies are idempotent,
// so this is harmless when _FOrg already refreshed them.
void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue)
{
  if(sdata->atom.update_nmax)
    Cuda_FixAveForceCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixAveForceCuda_PostForce_Set_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, axvalue, ayvalue, azvalue);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed");
}

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// One-time initialization of the fix aveforce device symbols.
extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata);
// Sums (fx, fy, fz, count) of group atoms into aforiginal[0..3].
extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal);
// Overwrites the flagged force components of group atoms with the
// supplied average values.
extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue);

View File

@ -1,96 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_CFLOAT sharedmem[];
/* Pass 1 of fix aveforce: one thread per atom.  For each group atom,
 * accumulate (fx, fy, fz, 1) -- the trailing 1 counts group members.
 * Each block reduces its shared arrays and writes four partials into
 * _buffer, in four stripes of gridDim.x*gridDim.y.  Non-group /
 * out-of-range threads contribute zeros; all threads must still execute
 * the reduceBlock calls.
 * NOTE(review): no explicit __syncthreads() before reduceBlock --
 * presumably reduceBlock synchronizes internally; confirm against
 * crm_cuda_utils.cu. */
__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // zero all four shared-memory stripes so inactive threads are neutral
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      sharedmem[threadIdx.x] = _f[i];
      sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
      sharedmem[threadIdx.x + 3 * blockDim.x] = 1;   // group-member count
    }

  // block-wide sums of the four quantities (all threads participate)
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  F_CFLOAT* buffer = (F_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // publish this block's partials, one stripe per quantity
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
  }
}
/* Pass 2 of fix aveforce: fold the n per-block partials of each of the
 * four quantities into one value.  Launched with gridDim.x == 4 (one
 * block per quantity); blockIdx.x selects the stripe of _buffer written
 * by the compute kernel.  Only thread 0 accumulates the running total
 * and writes the result.
 * NOTE(review): only thread 0 ever writes sharedmem[0] in the zeroing
 * step, so its read of sharedmem[0] after reduceBlock cannot be
 * clobbered by other threads starting the next chunk -- assuming
 * reduceBlock ends with a barrier; confirm against crm_cuda_utils.cu. */
__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_CFLOAT* foriginal)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  F_CFLOAT myforig = 0.0;
  F_CFLOAT* buf = (F_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];     // this block's stripe of n partials

  // sweep the partials in chunks of blockDim.x
  while(i < n) {
    sharedmem[threadIdx.x] = 0;

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    foriginal[blockIdx.x] = myforig;
}
// Overwrite force components of group atoms with fixed values; a
// component is only touched when its flag is non-zero.
__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nlocal) return;

  if(!(_mask[i] & groupbit)) return;

  if(xflag) _f[i] = xvalue;

  if(yflag) _f[i + 1 * _nmax] = yvalue;

  if(zflag) _f[i + 2 * _nmax] = zvalue;
}

View File

@ -1,55 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_enforce2d_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_enforce2d_cuda_cu.h"
#include "fix_enforce2d_cuda_kernel.cu"
// Publish the device pointers and counters used by the enforce2d kernel to
// this translation unit's per-prefix device symbols (MY_AP namespace).
void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
{
  // scalar counters first, then array pointers (the copies are independent)
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(v), &sdata->atom.v.dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_CFLOAT*));
}
// Host driver for fix enforce2d post_force: launches a kernel that zeroes
// the z components of velocity and force for every atom in the group.
void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit)
{
  // re-copy device symbols if the atom arrays were reallocated
  if(sdata->atom.update_nmax)
    Cuda_FixEnforce2dCuda_Init(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // 2D grid covering nlocal threads
  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixEnforce2dCuda_PostForce_Kernel <<< grid, threads>>> (groupbit);
  cudaThreadSynchronize();
  // fix: error label previously carried a duplicated "Cuda_" prefix
  CUT_CHECK_ERROR("Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit);

View File

@ -1,34 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Zero the z velocity and z force of every atom in the group, confining the
// dynamics to the xy plane.  Behavior identical to the original; restyled
// with guard clauses.
__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  // z components live in the third plane of stride _nmax
  _v[idx + 2 * _nmax] = V_F(0.0);
  _f[idx + 2 * _nmax] = F_F(0.0);
}

View File

@ -1,98 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_freeze_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_freeze_cuda_cu.h"
#include "fix_freeze_cuda_kernel.cu"
// Ensure sdata->buffer is large enough for one 3-component force partial per
// reduction block, then publish the pointer to the device symbol _buffer.
void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata)
{
// layout is computed only to size the buffer; threads/grid are unused here
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
// NOTE(review): the debug message says "kB" but buffersize is in bytes
MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;   // signal other modules that the shared buffer moved
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
// Refresh every device symbol that depends on the (re)allocated atom arrays
// used by the freeze kernels (mask, positions, forces, torques, counters).
void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_CFLOAT*));
}
// One-time setup for fix freeze: just publishes the current array pointers.
void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixFreezeCuda_UpdateNmax(sdata);
}
// Host driver for fix freeze post_force: records the sum of the original
// forces of the frozen group into foriginal[0..2] via a two-stage GPU
// reduction, and zeroes force and torque on every frozen atom.
void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal)
{
  if(sdata->atom.update_nmax)
    Cuda_FixFreezeCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixFreezeCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  // stage 1: zero forces/torques and write per-block force partials to _buffer
  Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit);
  cudaThreadSynchronize();
  // fix: message previously said "fix add_force" (copy/paste from fix
  // addforce) and doubled the "Cuda_" prefix
  CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force compute Kernel execution failed");

  // stage 2: one block per force component reduces the partials.
  // NOTE(review): only grid.x partials are reduced per component; presumably
  // getgrid returns layout.y == 1 here -- verify against getgrid.
  int oldgrid = grid.x;
  grid.x = 3;
  threads.x = 512;
  Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force reduce Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal);

View File

@ -1,87 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Dynamic shared memory for the block reductions below; this kernel is
// launched with 3 * blockDim.x elements.
extern __shared__ F_CFLOAT sharedmem[];
// fix freeze, stage 1: accumulate per-block partial sums of the forces on
// frozen atoms, then zero force and torque on those atoms.  Partials are
// written to _buffer as 3 consecutive arrays of gridDim.x*gridDim.y values.
__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
// zero the three reduction segments so non-group threads contribute nothing
sharedmem[threadIdx.x] = 0;
sharedmem[threadIdx.x + blockDim.x] = 0;
sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
if(i < _nlocal)
if(_mask[i] & groupbit) {
// stash the original force (x/y/z planes of stride _nmax) ...
sharedmem[threadIdx.x] = _f[i];
sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
// ... then freeze the atom: clear its force and torque
_f[i] = F_F(0.0);
_f[i + 1 * _nmax] = F_F(0.0);
_f[i + 2 * _nmax] = F_F(0.0);
_torque[i] = F_F(0.0);
_torque[i + 1 * _nmax] = F_F(0.0);
_torque[i + 2 * _nmax] = F_F(0.0);
}
// reduce each component segment into its element 0 (reduceBlock comes from
// crm_cuda_utils.cu; presumably it synchronizes internally)
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2 * blockDim.x]);
F_CFLOAT* buffer = (F_CFLOAT*)_buffer;
if(threadIdx.x == 0) {
// thread 0 publishes the block's three component partials
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
}
}
// fix freeze, stage 2: final reduction of the per-block force partials.
// Launched with one block per force component; block b sums the n partials
// at _buffer[b*n .. b*n+n-1] and writes the total into foriginal[b].
__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
// running total; only thread 0's copy is ever meaningful
F_CFLOAT myforig = 0.0;
F_CFLOAT* buf = (F_CFLOAT*)_buffer;
// jump to the partials belonging to this block's component
buf = &buf[blockIdx.x * n];
// sweep the n partials in chunks of blockDim.x
while(i < n) {
sharedmem[threadIdx.x] = 0;
if(i + threadIdx.x < n)
sharedmem[threadIdx.x] = buf[i + threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i += blockDim.x;
if(threadIdx.x == 0)
myforig += sharedmem[0];   // accumulate this chunk's block sum
}
if(threadIdx.x == 0)
foriginal[blockIdx.x] = myforig;
}

View File

@ -1,92 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_gravity_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_gravity_cuda_cu.h"
#include "fix_gravity_cuda_kernel.cu"
// Ensure sdata->buffer can hold 3 F_CFLOATs per thread-slot and publish the
// pointer to the device symbol _buffer.  NOTE(review): the gravity kernel
// performs no reduction; the sizing mirrors the other fixes in this package.
void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
{
// layout is computed only to size the buffer; threads/grid are unused here
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
// NOTE(review): the debug message says "kB" but buffersize is in bytes
MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;   // signal other modules that the shared buffer moved
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
// Refresh every device symbol the gravity kernel reads: mask, positions,
// forces, per-type masses and (if present) per-atom masses.
void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
// rmass_flag selects per-atom (rmass) vs per-type (mass) masses in the kernel
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
}
// One-time setup for fix gravity: just publishes the current array pointers.
void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixGravityCuda_UpdateNmax(sdata);
}
// Host driver for fix gravity post_force: adds mass * (xacc, yacc, zacc) to
// the force of every atom in the group.
void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
{
  if(sdata->atom.update_nmax)
    Cuda_FixGravityCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixGravityCuda_UpdateBuffer(sdata);

  // 2D grid covering nlocal threads
  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixGravityCuda_PostForce_Kernel <<< grid, threads>>> (groupbit, xacc, yacc, zacc);
  cudaThreadSynchronize();
  // fix: message previously said "fix add_force" (copy/paste from fix
  // addforce) and doubled the "Cuda_" prefix
  CUT_CHECK_ERROR("Cuda_FixGravityCuda_PostForce: fix gravity post_force Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc);

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Add the gravitational force m * (xacc, yacc, zacc) to every atom in the
// group.  Behavior identical to the original; restyled with guard clauses.
__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  // per-atom mass if rmass is present, otherwise the per-type mass
  const F_CFLOAT m = _rmass_flag ? _rmass[idx] : _mass[_type[idx]];

  _f[idx] += m * xacc;
  _f[idx + _nmax] += m * yacc;
  _f[idx + 2 * _nmax] += m * zacc;
}

View File

@ -1,255 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_nh_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nh_cuda_cu.h"
#include "fix_nh_cuda_kernel.cu"
// Refresh every device symbol the NH kernels read after the atom arrays have
// been (re)allocated: forces, velocities, positions, masses, masks, and the
// neighbor-trigger bookkeeping (xhold / maxhold / reneigh_flag).
void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata
// note: the reneighbor flag is aliased onto sdata->buffer
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata
}
// Ensure the shared buffer can hold at least 10 ints, then publish it both
// as _buffer and as _reneigh_flag (the flag is stored at buffer[0]).
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size = (unsigned)10 * sizeof(int);
if(sdata->buffersize < size) {
// NOTE(review): the debug message says "kB" but buffersize is in bytes
MYDBG(printf("Cuda_FixNHCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;   // signal other modules that the shared buffer moved
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}
// One-time setup for fix nh: publish the integration timesteps (dtv for
// positions, dtf for velocities), the reneighbor-trigger parameters and the
// mass-selection flag, then refresh the array pointers.
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
{
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
Cuda_FixNHCuda_UpdateNmax(sdata);
}
// Host driver for the barostat velocity scaling (nh_v_press): scales the
// velocities of the first mynlocal atoms in the group by factor_h[0..2],
// with the extra off-diagonal couplings factor_h[3..5] for triclinic cells.
void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
// time the symbol updates into cuda_timings.test1
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
// diagonal scale factors; factor2 holds the triclinic couplings and is
// left uninitialized otherwise (the kernel reads it only if p_triclinic)
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_CFLOAT3 factor2;
if(p_triclinic) {
factor2.x = factor_h[3], factor2.y = factor_h[4];
factor2.z = factor_h[5];
}
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nh_v_press_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
// Host driver for the fused barostat-scaling + velocity half-step kernel
// (no temperature bias): v = scale(v) + dtf/m * f for the group.
void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
// diagonal scale factors; factor2 holds the triclinic couplings and is
// left uninitialized otherwise (the kernel reads it only if p_triclinic)
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_CFLOAT3 factor2;
if(p_triclinic) {
factor2.x = factor_h[3], factor2.y = factor_h[4];
factor2.z = factor_h[5];
}
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press pre Kernel execution failed");
FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
// Host driver for the thermostat velocity scaling (nh_v_temp): multiplies
// all velocity components of group atoms by factor_eta.
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
// time the symbol updates into cuda_timings.test1
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nh_v_temp_Kernel <<< grid, threads>>> (groupbit, factor_eta);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed");
}
// Host driver for the velocity half-step (nve_v): v += dtf/m * f for the
// first mynlocal atoms in the group.
void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
// time the symbol updates into cuda_timings.test1
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nve_v_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed");
}
// Host driver for the position full step (nve_x): x += dtv * v for group
// atoms, and read back whether any atom moved far enough to require
// reneighboring (flag accumulated by the kernel in buffer[0]).
void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
// time the symbol updates into cuda_timings.test1
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
// clear the device-side reneighbor flag (first int of the shared buffer)
cudaMemset(sdata->buffer, 0, sizeof(int));
FixNHCuda_nve_x_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
// copy the flag back and accumulate it into the host-side counter
int reneigh_flag;
cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
sdata->atom.reneigh_flag += reneigh_flag;
CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed");
}
// Host driver for the fused velocity half-step + barostat-scaling kernel
// (no temperature bias): v = scale(v + dtf/m * f) for the group.
void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
// diagonal scale factors; factor2 holds the triclinic couplings and is
// left uninitialized otherwise (the kernel reads it only if p_triclinic)
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_CFLOAT3 factor2;
if(p_triclinic) {
factor2.x = factor_h[3], factor2.y = factor_h[4];
factor2.z = factor_h[5];
}
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias: Kernel execution failed");
}

View File

@ -1,32 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp

View File

@ -1,205 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Warp-collective check whether any group atom moved farther (squared) than
// _triggerneighsq since the positions were recorded in _xhold; if so, set
// _reneigh_flag[0] to request reneighboring.  Uses the warp vote __all, so
// it must be executed by every thread of the warp -- this is why the caller
// invokes it outside its own i < _nlocal / mask guard.
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
{
if(_dist_check) {
X_CFLOAT d = X_F(0.0);
if(i < _nlocal) {
// squared displacement from the position held at the last reneighboring
// (xhold components are planes of stride _maxhold)
X_CFLOAT tmp = xtmp - _xhold[i];
d = tmp * tmp;
tmp = ytmp - _xhold[i + _maxhold];
d += tmp * tmp;
tmp = ztmp - _xhold[i + 2 * _maxhold];
d += tmp * tmp;
// atoms outside the group never trigger reneighboring
d = ((_mask[i] & groupbit)) ? d : X_F(0.0);
}
// if any thread in the warp exceeded the trigger, raise the global flag
if(not __all(d <= _triggerneighsq))
_reneigh_flag[0] = 1;
}
}
// Barostat velocity scaling: v *= factor before and after the triclinic
// cross-coupling -- the factor is deliberately applied twice (symmetric
// half-step splitting; presumably mirrors FixNH::nh_v_press on the CPU).
__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
// velocity components are planes of stride _nmax behind my_v
V_CFLOAT* my_v = _v + i;
V_CFLOAT vx = my_v[0];
V_CFLOAT vy = my_v[_nmax];
V_CFLOAT vz = my_v[2 * _nmax];
// first half of the symmetric scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
if(p_triclinic) {
// off-diagonal couplings for triclinic cells
vx += vy * factor2.z + vz * factor2.y;
vy += vz * factor2.x;
}
// second half of the symmetric scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
my_v[0] = vx;
my_v[_nmax] = vy;
my_v[2 * _nmax] = vz;
}
}
// Thermostat velocity scaling: multiply every velocity component of the
// group atoms by factor_eta.  Behavior identical to the original; restyled
// with guard clauses.
__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_CFLOAT factor_eta)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  // velocity components are planes of stride _nmax behind vel
  V_CFLOAT* vel = _v + idx;
  vel[0] *= factor_eta;
  vel[_nmax] *= factor_eta;
  vel[2 * _nmax] *= factor_eta;
}
// Fused kernel: barostat scaling of v (symmetric, applied twice around the
// triclinic coupling) followed by the velocity half-step v += dtf/m * f.
__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
// force/velocity components are planes of stride _nmax behind these pointers
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
// dtf scaled by 1/m (per-atom mass if rmass is present, else per-type)
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
V_CFLOAT vx = my_v[0];
V_CFLOAT vy = my_v[_nmax];
V_CFLOAT vz = my_v[2 * _nmax];
// first half of the symmetric barostat scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
if(p_triclinic) {
// off-diagonal couplings for triclinic cells
vx += vy * factor2.z + vz * factor2.y;
vy += vz * factor2.x;
}
// second half of the symmetric scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
// velocity half-step using the current force
my_v[0] = vx + dtfm * my_f[0];
my_v[_nmax] = vy + dtfm * my_f[_nmax];
my_v[2 * _nmax] = vz + dtfm * my_f[_nmax * 2];
}
}
// Velocity half-step: v += dtf/m * f for every atom in the group.  Behavior
// identical to the original; restyled with guard clauses and indexed access
// instead of pointer bumping.
__global__ void FixNHCuda_nve_v_Kernel(int groupbit)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  // force/velocity components are planes of stride _nmax behind these pointers
  F_CFLOAT* fp = _f + idx;
  V_CFLOAT* vp = _v + idx;

  // dtf scaled by 1/m (per-atom mass if rmass is present, else per-type)
  V_CFLOAT dtfm = _dtf;
  if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[idx];
  else dtfm *= V_F(1.0) / _mass[_type[idx]];

  vp[0] = vp[0] + dtfm * fp[0];
  vp[_nmax] = vp[_nmax] + dtfm * fp[_nmax];
  vp[2 * _nmax] = vp[2 * _nmax] + dtfm * fp[2 * _nmax];
}
// Position full step: x += dtv * v for every atom in the group, followed by
// a warp-collective displacement check that may request reneighboring.
__global__ void FixNHCuda_nve_x_Kernel(int groupbit)
{
  // Fix: initialize the temporaries.  They were previously left
  // indeterminate for threads outside the group / beyond nlocal, and
  // check_distance reads them unconditionally when i < _nlocal (the result
  // is masked to zero, but reading uninitialized locals is undefined
  // behavior).  Zeroing them changes no observable result.
  X_CFLOAT xtmp = X_F(0.0), ytmp = X_F(0.0), ztmp = X_F(0.0);
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal && _mask[i] & groupbit) {
    // position/velocity components are planes of stride _nmax
    V_CFLOAT* my_v = _v + i;
    X_CFLOAT* my_x = _x + i;
    xtmp = *my_x += _dtv * *my_v;   // keep the new coordinates for the check
    my_v += _nmax;
    my_x += _nmax;
    ytmp = *my_x += _dtv * *my_v;
    my_v += _nmax;
    my_x += _nmax;
    ztmp = *my_x += _dtv * *my_v;
  }

  // must be executed by every thread of the warp (warp vote inside)
  check_distance(xtmp, ytmp, ztmp, i, groupbit);
}
// Fused kernel: velocity half-step v += dtf/m * f, then the symmetric
// barostat scaling (factor applied before and after the triclinic coupling).
__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
// force/velocity components are planes of stride _nmax behind these pointers
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
// dtf scaled by 1/m (per-atom mass if rmass is present, else per-type)
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
// velocity half-step first ...
V_CFLOAT vx = my_v[0] + dtfm * my_f[0];
V_CFLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax];
V_CFLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax];
// ... then the first half of the symmetric barostat scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
if(p_triclinic) {
// off-diagonal couplings for triclinic cells
vx += vy * factor2.z + vz * factor2.y;
vy += vz * factor2.x;
}
// second half of the symmetric scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
my_v[0] = vx;
my_v[_nmax] = vy;
my_v[2 * _nmax] = vz;
}
}

View File

@ -1,134 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_nve_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nve_cuda_cu.h"
#include "fix_nve_cuda_kernel.cu"
// Refresh this compilation unit's __constant__ copies of the atom-array
// device pointers and sizes. Must be called whenever the per-atom arrays
// are reallocated on the device (i.e. when nmax changes), because the old
// pointers then dangle.
void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata
}
// Ensure the shared scratch buffer is large enough for this fix (10 ints)
// and publish its address to the device __constant__ symbols.
// NOTE(review): the MYDBG messages label the size "kB" but buffersize is in
// bytes — debug output only, but misleading.
void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size = (unsigned)10 * sizeof(int);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixNVECuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// signal other users of the shared buffer that it was reallocated
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}
// One-time setup for fix nve/cuda: copy the timestep constants (dtv = dt,
// dtf = 0.5*dt*force->ftm2v on the host side) and run-invariant flags to the
// device, then refresh the per-array pointers via UpdateNmax.
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
{
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
Cuda_FixNVECuda_UpdateNmax(sdata);
}
// Launch the initial-integration kernel (velocity + position half-step) and
// read back the reneighbor trigger flag accumulated by check_distance.
void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixNVECuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNVECuda_UpdateBuffer(sdata);
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
// first int of the buffer doubles as the reneighbor flag; clear it first
cudaMemset(sdata->buffer, 0, sizeof(int));
FixNVECuda_InitialIntegrate_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
int reneigh_flag;
cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
sdata->atom.reneigh_flag += reneigh_flag;
CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed");
}
// Launch the final-integration kernel (second velocity half-step).
// The CUDA_USE_BINNING path launches one block per bin; the default path
// uses a flat per-atom decomposition from getgrid().
void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixNVECuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNVECuda_UpdateBuffer(sdata);
#ifdef CUDA_USE_BINNING
dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
dim3 threads(sdata->domain.bin_nmax, 1, 1);
FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed");
#else
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed");
#endif
}

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
// Host-side C interface to the fix nve/cuda GPU implementation
// (defined in fix_nve_cuda.cu; called from the C++ fix class).
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);

View File

@ -1,166 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Device helper: set the reneighbor flag if any atom in the warp has moved
// more than the neighbor-skin trigger distance since the last neighbor build.
// Must be called by ALL threads of the block (uses a warp vote); callers pass
// possibly-uninitialized coordinates for out-of-group threads, which is safe
// because d is forced to 0 for them below.
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
{
if(_dist_check) {
// squared displacement from the held position (stride _maxhold per component)
X_CFLOAT tmp = xtmp - _xhold[i];
X_CFLOAT d = tmp * tmp;
tmp = ytmp - _xhold[i + _maxhold];
d += tmp * tmp;
tmp = ztmp - _xhold[i + 2 * _maxhold];
d += tmp * tmp;
// neutralize threads outside nlocal/group so they cannot trigger a rebuild
d = ((i < _nlocal) && (_mask[i] & groupbit)) ? d : X_F(0.0);
// warp vote: if any lane exceeds the trigger distance, request reneighboring
if(not __all(d <= _triggerneighsq))
_reneigh_flag[0] = 1;
}
}
// NVE initial integration: v += dtf/m * f (half-step), then x += dtv * v,
// followed by a block-wide displacement check that may request reneighboring.
// Arrays are component-major with stride _nmax (binning path: blockDim.x).
// Fix: the CUDA_USE_BINNING branch was missing the semicolon after
// "V_CFLOAT dtfm = _dtf", which would not compile when binning is enabled.
__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
{
// xtmp/ytmp/ztmp stay uninitialized for non-group threads; check_distance
// zeroes their contribution, so this is safe (see check_distance).
X_CFLOAT xtmp, ytmp, ztmp;
#ifdef CUDA_USE_BINNING
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(threadIdx.x < _bin_count_local[bin]) {
const int i = 3 * blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit) {
F_CFLOAT* my_f = _binned_f + i;
V_CFLOAT* my_v = _binned_v + i;
X_CFLOAT* my_x = _binned_x + i;
V_CFLOAT dtfm = _dtf;  // fixed: semicolon was missing
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
V_CFLOAT v_mem;
v_mem = *my_v += dtfm * (*my_f);
xtmp = *my_x += _dtv * v_mem;
my_f += blockDim.x;
my_v += blockDim.x;
my_x += blockDim.x;
v_mem = *my_v += dtfm * (*my_f);
ytmp = *my_x += _dtv * v_mem;
my_f += blockDim.x;
my_v += blockDim.x;
my_x += blockDim.x;
v_mem = *my_v += dtfm * (*my_f);
ztmp = *my_x += _dtv * v_mem;
}
}
#else
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
X_CFLOAT* my_x = _x + i;
// dtfm = dt * 0.5 / m (per-atom or per-type mass)
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
V_CFLOAT v_mem;
// x component: velocity half-step, then position update
v_mem = *my_v += dtfm * (*my_f);
xtmp = *my_x += _dtv * v_mem;
my_f += _nmax;
my_v += _nmax;
my_x += _nmax;
// y component
v_mem = *my_v += dtfm * (*my_f);
ytmp = *my_x += _dtv * v_mem;
my_f += _nmax;
my_v += _nmax;
my_x += _nmax;
// z component
v_mem = *my_v += dtfm * (*my_f);
ztmp = *my_x += _dtv * v_mem;
}
#endif
// must be executed by all threads (warp vote inside)
check_distance(xtmp, ytmp, ztmp, i, groupbit);
}
// NVE final integration: second velocity half-step v += dtf/m * f.
// Fix: the CUDA_USE_BINNING branch was missing the semicolon after
// "V_CFLOAT dtfm = _dtf", which would not compile when binning is enabled.
__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
{
#ifdef CUDA_USE_BINNING
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(threadIdx.x < _bin_count_local[bin]) {
const int i = 3 * blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit) {
F_CFLOAT* my_f = _binned_f + i;
V_CFLOAT* my_v = _binned_v + i;
V_CFLOAT dtfm = _dtf;  // fixed: semicolon was missing
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
// binned arrays use stride blockDim.x per component
*my_v += dtfm * (*my_f);
my_f += blockDim.x;
my_v += blockDim.x;
*my_v += dtfm * (*my_f);
my_f += blockDim.x;
my_v += blockDim.x;
*my_v += dtfm * (*my_f);
}
}
#else
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
// dtfm = dt * 0.5 / m (per-atom or per-type mass)
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
// x, y, z components, stride _nmax each
*my_v += dtfm * (*my_f);
my_f += _nmax;
my_v += _nmax;
*my_v += dtfm * (*my_f);
my_f += _nmax;
my_v += _nmax;
*my_v += dtfm * (*my_f);
}
#endif
}

View File

@ -1,96 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_set_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_set_force_cuda_cu.h"
#include "fix_set_force_cuda_kernel.cu"
// Grow the shared scratch buffer so it can hold the per-block partial sums
// (3 F_CFLOATs per launched thread) produced by the post-force reduction,
// and publish the (possibly new) buffer pointer to the device.
void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
// mirror the launch geometry used in PostForce to size the buffer
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// signal other users of the shared buffer that it was reallocated
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
// Refresh this unit's __constant__ copies of atom-array device pointers and
// counters; call whenever device atom storage is reallocated (nmax changes).
void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
}
// One-time setup for fix setforce/cuda: just seed the device pointers.
void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixSetForceCuda_UpdateNmax(sdata);
}
// post_force for fix setforce/cuda: overwrite force components selected by
// flagx/flagy/flagz with the given values, while accumulating the ORIGINAL
// total force into foriginal[0..2] via a two-stage reduction
// (per-block partial sums in the scratch buffer, then a 3-block reduce).
// Fix: the two CUT_CHECK_ERROR labels were copy-pasted from the addforce fix
// ("Cuda_Cuda_FixSetForceCuda...: fix add_force ...") and misreported the
// failing routine; they now name this fix correctly.
void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixSetForceCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixSetForceCuda_UpdateBuffer(sdata);
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
// stage 1: set forces and write per-block partial sums of the old forces
Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixSetForceCuda_PostForce: fix setforce post_force compute Kernel execution failed");
// stage 2: one block per force component reduces the partial sums
int oldgrid = grid.x;
grid.x = 3;
threads.x = 512;
Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixSetForceCuda_PostForce: fix setforce post_force reduce Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
// Host-side C interface to the fix setforce/cuda GPU implementation
// (defined in fix_set_force_cuda.cu; called from the C++ fix class).
extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz);

View File

@ -1,86 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_CFLOAT sharedmem[];
// Stage-1 kernel: for each group atom, stash the current force components in
// shared memory (for the foriginal sum) and overwrite the components selected
// by flagx/flagy/flagz. Each block then reduces its three shared-memory
// segments and writes one partial sum per component into the global buffer,
// laid out as [all x-sums][all y-sums][all z-sums].
__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, int flagx, int flagy, int flagz)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
// zero all three segments so out-of-range/non-group threads contribute 0
sharedmem[threadIdx.x] = 0;
sharedmem[threadIdx.x + blockDim.x] = 0;
sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
if(i < _nlocal)
if(_mask[i] & groupbit) {
// record the pre-modification force (component-major, stride _nmax)
sharedmem[threadIdx.x] = _f[i];
sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
// then clamp the selected components to the fixed values
if(flagx) _f[i] = xvalue;
if(flagy) _f[i + 1 * _nmax] = yvalue;
if(flagz) _f[i + 2 * _nmax] = zvalue;
}
// block-wide reduction of each component segment (result lands in slot 0)
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2 * blockDim.x]);
F_CFLOAT* buffer = (F_CFLOAT*)_buffer;
if(threadIdx.x == 0) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
}
}
// Stage-2 kernel: launched with one block per force component (gridDim.x==3).
// Each block strides over its n partial sums in the buffer in chunks of
// blockDim.x, reduces each chunk in shared memory, and accumulates the chunk
// totals into foriginal[blockIdx.x] via thread 0.
__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
F_CFLOAT myforig = 0.0;
// each block reads its own component's segment of partial sums
F_CFLOAT* buf = (F_CFLOAT*)_buffer;
buf = &buf[blockIdx.x * n];
while(i < n) {
// reload shared memory for this chunk (zero-padded past n)
sharedmem[threadIdx.x] = 0;
if(i + threadIdx.x < n)
sharedmem[threadIdx.x] = buf[i + threadIdx.x];
__syncthreads();
// NOTE(review): correctness of reusing sharedmem next iteration relies on
// reduceBlock synchronizing before returning — confirm in crm_cuda_utils.cu
reduceBlock(sharedmem);
i += blockDim.x;
if(threadIdx.x == 0)
myforig += sharedmem[0];
}
if(threadIdx.x == 0)
foriginal[blockIdx.x] = myforig;
}

View File

@ -1,297 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_shake_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_shake_cuda_cu.h"
#include "cuda_pair_virial_kernel_nc.cu"
#define _shake_atom MY_AP(shake_atom)
#define _shake_type MY_AP(shake_type)
#define _shake_flag MY_AP(shake_flag)
#define _xshake MY_AP(xshake)
#define _dtfsq MY_AP(dtfsq)
#define _bond_distance MY_AP(bond_distance)
#define _angle_distance MY_AP(angle_distance)
#define _max_iter MY_AP(max_iter)
#define _tolerance MY_AP(tolerance)
__device__ __constant__ int* _shake_atom;
__device__ __constant__ int* _shake_type;
__device__ __constant__ int* _shake_flag;
__device__ __constant__ X_CFLOAT3* _xshake;
__device__ __constant__ F_CFLOAT _dtfsq;
__device__ __constant__ X_CFLOAT* _bond_distance;
__device__ __constant__ X_CFLOAT* _angle_distance;
__device__ __constant__ int _max_iter;
__device__ __constant__ X_CFLOAT _tolerance;
#include "fix_shake_cuda_kernel.cu"
// Refresh this unit's __constant__ copies of the atom-array device pointers
// and counters used by fix shake/cuda; call whenever device atom storage is
// reallocated (nmax changes).
void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(debugdata), & sdata->debugdata , sizeof(int*));
}
// Copy the current simulation-box description (periodicity, box lengths,
// triclinic flag and h-matrix) to the device; call when the box changes.
void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity , sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_CFLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(triclinic) , &sdata->domain.triclinic , sizeof(int));
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_CFLOAT) * 6);
}
// Grow the shared scratch buffer to at least 'size' bytes (callers size it
// for comm staging or virial partial sums) and publish its device address.
// NOTE(review): the MYDBG messages say "kB" but buffersize is in bytes.
void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
{
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixShakeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// signal other users of the shared buffer that it was reallocated
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
// One-time setup for fix shake/cuda: publish the fix's device arrays
// (shake topology, xshake scratch, constraint distances, virial), the
// timestep constants and the SHAKE iteration parameters to __constant__
// symbols, then seed the generic atom/domain pointers.
// Fix: MY_AP(flag) was copied twice (identical calls); the duplicate at the
// end of the function has been removed — behavior is unchanged.
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
void* bond_distance, void* angle_distance, void* virial,
int max_iter, X_CFLOAT tolerance)
{
Cuda_FixShakeCuda_UpdateNmax(sdata);
Cuda_FixShakeCuda_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_AP(shake_atom) , & shake_atom , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(shake_type) , & shake_type , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(shake_flag) , & shake_flag , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(xshake) , & xshake , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(bond_distance) , & bond_distance , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(angle_distance) , & angle_distance , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(virial) , & virial , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*));
cudaMemcpyToSymbol(MY_AP(max_iter) , &max_iter , sizeof(int));
cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_CFLOAT));
// per-type masses only exist when rmass is not used
if(sdata->atom.mass_host)
cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
}
// Compute the unconstrained (pre-SHAKE) predicted positions into xshake
// for all local atoms.
void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixShakeCuda_UpdateBuffer(sdata, 10 * sizeof(double));
int3 layout = getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixShakeCuda_UnconstrainedUpdate_Kernel <<< grid, threads>>> ();
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed");
}
// Apply the SHAKE constraints for the 'nlist' clusters listed in 'list'
// (device pointer). When vflag is set, per-block virial partial sums are
// staged in the shared buffer and reduced by a second kernel launch.
void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist)
{
// re-sync device constants if atom storage, box or counts changed
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->domain.update)
Cuda_FixShakeCuda_UpdateDomain(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
// 64-thread blocks; 6 ENERGY_CFLOATs of shared memory per thread (virial)
int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_CFLOAT), 64);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->buffer_new)
Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_CFLOAT));
BindXTypeTexture(sdata);
FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_CFLOAT)>>> (vflag, vflag_atom, list, nlist);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed");
if(vflag) {
// reduce the n per-block partial sums into the 6 virial components
int n = grid.x * grid.y;
grid.x = 6;
grid.y = 1;
threads.x = 256;
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed");
}
}
// Pack xshake coordinates of the n atoms in swap 'iswap' of the send list
// into buf_send (host memory), applying the periodic-image shift given by
// pbc/pbc_flag. Returns the number of packed values (3 per atom).
int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
// periodic-image shift to add to the packed coordinates (zero if no wrap)
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
dx = pbc[0] * sdata->domain.prd[0];
dy = pbc[1] * sdata->domain.prd[1];
dz = pbc[2] * sdata->domain.prd[2];
} else {
// triclinic box: include tilt-factor contributions
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
dz = pbc[2] * sdata->domain.prd[2];
}
}
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
cudaMemset(sdata->flag, 0, sizeof(int));
FixShakeCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz);
cudaThreadSynchronize();
// stage packed data through the shared buffer to the host send buffer
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
// aflag is a kernel-side error indicator; nonzero means a bad list entry
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed");
}
return 3 * n;
}
// Self-communication variant of PackComm: the receiving atoms live on this
// same process, so xshake data is copied device-to-device directly into the
// slots starting at atom index 'first' instead of being staged to the host.
// Returns the number of packed values (3 per atom).
// Fixes: removed a dead debug counter ("static int count") left over from
// development, and corrected the error label, which was copy-pasted from
// comm_cuda ("Cuda_CommCuda_PackComm_Self").
int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
// periodic-image shift to add to the packed coordinates (zero if no wrap)
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
dx = pbc[0] * sdata->domain.prd[0];
dy = pbc[1] * sdata->domain.prd[1];
dz = pbc[2] * sdata->domain.prd[2];
} else {
// triclinic box: include tilt-factor contributions
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
dz = pbc[2] * sdata->domain.prd[2];
}
}
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
FixShakeCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm_Self: Kernel execution failed");
}
return 3 * n;
}
// Unpack n atoms' worth of xshake coordinates received in buf_recv (host
// memory) into the device slots starting at atom index 'first'.
void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
// stage received data through the shared device buffer, then scatter
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
FixShakeCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed");
}
}

View File

@ -1,34 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
// Host-side C interface to the fix shake/cuda GPU implementation
// (defined in fix_shake_cuda.cu; called from the C++ fix class).
extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
void* bond_distance, void* angle_distance, void* virial,
int max_iter, X_CFLOAT tolerance);
extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata);
extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist);
extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv);

File diff suppressed because it is too large Load Diff

View File

@ -1,66 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_berendsen_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_berendsen_cuda_cu.h"
#include "fix_temp_berendsen_cuda_kernel.cu"
// Refresh this unit's __constant__ copies of atom-array device pointers and
// counters; call whenever device atom storage is reallocated (nmax changes).
// Fix: the velocity pointer was copied with sizeof(X_CFLOAT*) although v is
// a V_CFLOAT* (same size on all supported platforms, so no behavior change,
// but now consistent with the other USER-CUDA fixes).
void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
}
// One-time setup for fix temp/berendsen/cuda: just seed the device pointers.
void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
}
/* End-of-step host driver for the Berendsen thermostat: rescales the
 * velocities of all local atoms in the fix group by afactor on the GPU.
 * groupbit selects the fix group; afactor is narrowed to the device
 * velocity precision (V_CFLOAT) before launch. */
void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
{
  V_CFLOAT factor = afactor;

  /* Refresh device pointers / counts only when the host marked them stale. */
  if(sdata->atom.update_nmax)
    Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixTempBerendsenCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
  cudaThreadSynchronize();
  /* Bug fix: the original error label was copy-pasted from an add_force
   * PostForce routine, which made failures here hard to attribute. */
  CUT_CHECK_ERROR("Cuda_FixTempBerendsenCuda_EndOfStep: end-of-step kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor);

View File

@ -1,37 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
/* Rescale the velocities of all owned atoms in the fix group by `factor`
 * (Berendsen thermostat end-of-step). Velocities are stored in
 * struct-of-arrays layout: x at [i], y at [i + _nmax], z at [i + 2*_nmax]. */
__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;            /* thread maps past the local atom count */

  if(!(_mask[idx] & groupbit)) return;  /* atom not in this fix's group */

  _v[idx]             *= factor;
  _v[idx + _nmax]     *= factor;
  _v[idx + 2 * _nmax] *= factor;
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_rescale_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_rescale_cuda_cu.h"
#include "fix_temp_rescale_cuda_kernel.cu"
/* Upload the device-array pointers and atom counts read by the temp/rescale
 * kernel into this translation unit's constant-memory symbols. Re-run after
 * any reallocation of the atom arrays. */
void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal         , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax           , sizeof(int));
  /* Consistency fix: v is the velocity array (V_CFLOAT); the original used
   * sizeof(X_CFLOAT*), harmless only because pointer sizes are uniform. */
  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
}
/* One-time setup for the temp/rescale fix; identical to a pointer refresh,
 * so it forwards to the nmax-update routine. */
void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
}
/* End-of-step host driver for fix temp/rescale: multiplies the velocities of
 * all local atoms in the fix group by afactor on the GPU. */
void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
{
  V_CFLOAT factor = afactor;

  //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
  /* Pointers are refreshed unconditionally: this fix runs on a stride of
   * timesteps, so the update_nmax flag could be cleared before we see it
   * (see the retained original comment above). */
  Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int) );

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixTempRescaleCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
  cudaThreadSynchronize();
  /* Bug fix: error label was copy-pasted from an add_force PostForce routine. */
  CUT_CHECK_ERROR("Cuda_FixTempRescaleCuda_EndOfStep: end-of-step kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor);

View File

@ -1,37 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
/* Scale all three velocity components of each owned atom in the fix group by
 * `factor`. Velocity storage is struct-of-arrays with stride _nmax. */
__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;            /* out-of-range thread */

  if(!(_mask[idx] & groupbit)) return;  /* atom outside the fix group */

  _v[idx]             *= factor;
  _v[idx + _nmax]     *= factor;
  _v[idx + 2 * _nmax] *= factor;
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_rescale_limit_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_rescale_limit_cuda_cu.h"
#include "fix_temp_rescale_limit_cuda_kernel.cu"
/* Upload device pointers and atom counts used by the temp/rescale/limit
 * kernel into constant-memory symbols. Re-run after atom-array reallocation. */
void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal         , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax           , sizeof(int));
  /* Consistency fix: v stores velocities (V_CFLOAT); original said
   * sizeof(X_CFLOAT*), relying on uniform pointer sizes. */
  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
}
/* One-time setup for the temp/rescale/limit fix — just a pointer upload,
 * delegated to the shared nmax-update routine. */
void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
}
/* End-of-step host driver for fix temp/rescale with a velocity limit:
 * scales group-member velocities by afactor, then clamps each component to
 * [-limit, limit] on the GPU. */
void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit)
{
  V_CFLOAT factor = afactor;

  //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
  /* Unconditional refresh: the fix runs on a timestep stride, so the
   * update flags may be consumed between invocations (original note above). */
  Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int) );

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor, limit);
  cudaThreadSynchronize();
  /* Bug fix: error label was copy-pasted from an add_force PostForce routine. */
  CUT_CHECK_ERROR("Cuda_FixTempRescaleLimitCuda_EndOfStep: end-of-step kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit);

View File

@ -1,44 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
/* Scale each velocity component of group-member atoms by `factor`, then clamp
 * the result to the interval [-limit, +limit] component-wise. */
__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor, V_CFLOAT limit)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;            /* out-of-range thread */

  if(!(_mask[idx] & groupbit)) return;  /* atom outside the fix group */

  V_CFLOAT vel_x = factor * _v[idx];
  V_CFLOAT vel_y = factor * _v[idx + _nmax];
  V_CFLOAT vel_z = factor * _v[idx + 2 * _nmax];

  /* Clamp: positive components are capped at +limit, negative at -limit. */
  _v[idx]             = vel_x > 0 ? min(vel_x, limit) : max(vel_x, -limit);
  _v[idx + _nmax]     = vel_y > 0 ? min(vel_y, limit) : max(vel_y, -limit);
  _v[idx + 2 * _nmax] = vel_z > 0 ? min(vel_z, limit) : max(vel_z, -limit);
}

View File

@ -1,67 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_viscous_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_viscous_cuda_cu.h"
#include "fix_viscous_cuda_kernel.cu"
/* Upload the device pointers and atom counts the viscous-drag kernel reads
 * into constant-memory symbols. Re-run after atom-array reallocation. */
void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal         , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax           , sizeof(int));
  /* BUG FIX: the original uploaded the POSITION array (sdata->atom.x) into
   * the velocity symbol _v. The kernel computes f -= gamma * v, so it would
   * have applied the drag force to coordinates instead of velocities.
   * Upload the velocity array. */
  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f)      , & sdata->atom.f    .dev_data, sizeof(F_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(type)   , & sdata->atom.type .dev_data, sizeof(int*));
}
/* One-time setup for the viscous-drag fix; forwards to the pointer-refresh
 * routine, which contains all state this fix needs on the device. */
void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixViscousCuda_UpdateNmax(sdata);
}
/* Post-force host driver for the viscous-drag fix: launches a kernel that
 * subtracts gamma[type] * v from the force of each group-member atom.
 * `gamma` is an opaque device pointer to per-type drag coefficients
 * (cast to F_CFLOAT* at the launch site). */
void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma)
{
  /* Re-upload device pointers / counts only if the host marked them stale. */
  if(sdata->atom.update_nmax)
    Cuda_FixViscousCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  const int3 layout = getgrid(sdata->atom.nlocal, 0);
  dim3 block_dim(layout.z, 1, 1);
  dim3 grid_dim(layout.x, layout.y, 1);

  Cuda_FixViscousCuda_PostForce_Kernel <<< grid_dim, block_dim, 0>>> (groupbit, (F_CFLOAT*) gamma);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma);

View File

@ -1,35 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
/* Apply a per-type viscous drag force f -= gamma[type] * v to every owned
 * atom in the fix group. Arrays use struct-of-arrays layout with stride
 * _nmax per component. */
__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_CFLOAT* gamma)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;            /* out-of-range thread */

  if(!(_mask[idx] & groupbit)) return;  /* atom outside the fix group */

  const F_CFLOAT drag = gamma[_type[idx]];
  _f[idx]             -= drag * _v[idx];
  _f[idx + 1 * _nmax] -= drag * _v[idx + 1 * _nmax];
  _f[idx + 2 * _nmax] -= drag * _v[idx + 2 * _nmax];
}

View File

@ -1,364 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#include <time.h>
#define MY_PREFIX neighbor
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "cuda_wrapper_cu.h"
#define _cutneighsq MY_AP(cutneighsq)
#define _ex_type MY_AP(ex_type)
#define _nex_type MY_AP(nex_type)
#define _ex1_bit MY_AP(ex1_bit)
#define _ex2_bit MY_AP(ex2_bit)
#define _nex_group MY_AP(nex_group)
#define _ex_mol_bit MY_AP(ex_mol_bit)
#define _nex_mol MY_AP(nex_mol)
__device__ __constant__ CUDA_CFLOAT* _cutneighsq;
__device__ __constant__ int* _ex_type;
__device__ __constant__ int _nex_type;
__device__ __constant__ int* _ex1_bit;
__device__ __constant__ int* _ex2_bit;
__device__ __constant__ int _nex_group;
__device__ __constant__ int* _ex_mol_bit;
__device__ __constant__ int _nex_mol;
#include "neighbor_cu.h"
#include "neighbor_kernel.cu"
/* Grow (never shrink) the shared device scratch buffer so it can hold the
 * binning data: 20 ints of header/status plus, per bin, one counter int and
 * bin_nmax packed (x,y,z) coordinates. Publishes the (possibly new) buffer
 * pointer to the constant-memory symbol used by the kernels. */
void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  /* Bug fix: both error labels originally said "Cuda_PairLJCutCuda"
   * (copy-paste from the pair style), mis-attributing failures. */
  CUT_CHECK_ERROR("Cuda_Neighbor: before updateBuffer failed");
  int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_CFLOAT)));

  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)

    if(sdata->buffer != NULL) CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);

    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  /* signals other modules that their cached buffer pointer is stale */
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
  CUT_CHECK_ERROR("Cuda_Neighbor: updateBuffer failed");
}
/* Sort all atoms (local + ghost) into spatial bins on the GPU.
 * Returns the overflow flag read back from the device: non-zero when at
 * least one bin exceeded bin_nmax entries, in which case bin_extraspace is
 * enlarged so the caller can retry with bigger bins (retry protocol assumed
 * from the +=0.05 growth — confirm against the calling neighbor code). */
int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  if(sdata->buffer_new)
    Cuda_Neighbor_UpdateBuffer(sdata, sneighlist);

  /* Reciprocal bin width per dimension over the subdomain; the "- 4.0"
   * presumably reserves two layers of ghost bins on each side — confirm. */
  CUDA_CFLOAT rez_bin_size[3] = {
    (1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
    (1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
    (1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
  };

  /* Cleanup: the original wrapped these uploads in `if(!init)` with a
   * NON-static flag that was also (re)set to 0 inside the branch, so they
   * ran on every call anyway. The dead flag is removed; refreshing every
   * call is required regardless, since the device pointers may change
   * between neighbor builds. */
  cudaMemcpyToSymbol(MY_AP(x)     , & sdata->atom.x   .dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(nall)  , & sdata->atom.nall          , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(nmax)  , & sdata->atom.nmax          , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(sublo) ,   sdata->domain.sublo       , sizeof(X_CFLOAT) * 3);

  int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  my_times starttime, endtime;
  my_gettime(CLOCK_REALTIME, &starttime);

  /* Zero the status header, per-bin counters and the binned coordinate area. */
  cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_CFLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
  Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
  cudaThreadSynchronize();

  my_gettime(CLOCK_REALTIME, &endtime);
  sdata->cuda_timings.neigh_bin +=
    endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;

  /* buffer[0] is the kernel's overflow flag. */
  int binning_error;
  cudaMemcpy((void*) &binning_error, (void*) sdata->buffer, 1 * sizeof(int), cudaMemcpyDeviceToHost);

  if(binning_error) {
    sneighlist->bin_extraspace += 0.05;  /* grow bins for the caller's retry */
  } else {
    MYDBG(printf("CUDA: binning successful\n");)
  }

  CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
  return binning_error;
}
/* Build a full neighbor list from the previously binned atoms (one thread
 * block per bin). Also builds border/inner sublists when overlap_comm is
 * enabled, and post-processes special (bonded) neighbors for molecular
 * systems. Returns the status read back from device buffer[0]; negative
 * presumably signals a neighbor-list overflow requiring a retry — confirm
 * against the caller. */
int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
  /* globcutoff stays -1 unless every type-pair cutoff is (numerically) the
   * same, in which case the kernel can skip the per-pair table lookup. */
  CUDA_CFLOAT globcutoff = -1.0;

  short init = 0;  /* NOTE(review): not static, so this "init" block runs on EVERY call */

  if(! init) {
    init = 1;

    // !! LAMMPS indexes atom types starting with 1 !!
    unsigned cuda_ntypes = sdata->atom.ntypes + 1;
    unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
    CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
    //printf("Allocate: %i\n",nx);
    /* NOTE(review): the previous device allocation in cu_cutneighsq is
     * overwritten without CudaWrapper_FreeCudaData — device-side leak on
     * repeat calls (size needed for the free is nx of the PREVIOUS call). */
    sneighlist->cu_cutneighsq = (CUDA_CFLOAT*) CudaWrapper_AllocCudaData(nx);

    if(sneighlist->cutneighsq) {
      int cutoffsdiffer = 0;
      double cutoff0 = sneighlist->cutneighsq[1][1];

      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
          acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);

          if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++;
        }
      }

      if(not cutoffsdiffer) globcutoff = (CUDA_CFLOAT) cutoff0;
    } else {
      MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
      free(acutneighsq);  /* bug fix: host buffer leaked on this error path */
      return 0;
    }

    int size = 100;

    if(sdata->buffersize < size) {
      MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
      CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
      sdata->buffer = CudaWrapper_AllocCudaData(size);
      sdata->buffersize = size;
      sdata->buffer_new++;
      MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    }

    CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
    cudaMemcpyToSymbol(MY_AP(cutneighsq)   , &sneighlist->cu_cutneighsq , sizeof(CUDA_CFLOAT*));
    cudaMemcpyToSymbol(MY_AP(cuda_ntypes)  , & cuda_ntypes              , sizeof(unsigned));
    cudaMemcpyToSymbol(MY_AP(special_flag) ,   sdata->atom.special_flag , 4 * sizeof(int));
    cudaMemcpyToSymbol(MY_AP(molecular)    , & sdata->atom.molecular    , sizeof(int));
    /* Bug fix: the host staging buffer was never freed — since this block
     * runs on every call (non-static init), it leaked nx bytes per build. */
    free(acutneighsq);
  }

  /* Refresh every device symbol the build kernels read; pointers may have
   * changed since the last build. */
  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
  //cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) );
  cudaMemcpyToSymbol(MY_AP(ilist)        , & sneighlist->ilist    .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(inum)         , & sneighlist->inum              , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nlocal)       , & sdata->atom.nlocal            , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nall)         , & sdata->atom.nall              , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(numneigh)     , & sneighlist->numneigh .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type)         , & sdata->atom.type     .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mask)         , & sdata->atom.mask     .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(tag)          , & sdata->atom.tag      .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(special)      , & sdata->atom.special  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(maxspecial)   , & sdata->atom.maxspecial        , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nspecial)     , & sdata->atom.nspecial .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors      , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(debugdata)    , & sdata->debugdata              , sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm           , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(neighbors)    , & sneighlist->neighbors.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex_type)      , & sneighlist->ex_type  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex1_bit)      , & sneighlist->ex1_bit  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex2_bit)      , & sneighlist->ex2_bit  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex_mol_bit)   , & sneighlist->ex_mol_bit.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nex_type)     , & sneighlist->nex_type          , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nex_group)    , & sneighlist->nex_group         , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nex_mol)      , & sneighlist->nex_mol           , sizeof(int));

  /* Extra sublists for communication/computation overlap. */
  if(sdata->overlap_comm) {
    cudaMemcpyToSymbol(MY_AP(numneigh_border)  , & sneighlist->numneigh_border .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(numneigh_inner)   , & sneighlist->numneigh_inner  .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(neighbors_inner)  , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(ilist_border)     , & sneighlist->ilist_border    .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(inum_border)      , & sneighlist->inum_border     .dev_data, sizeof(int*));
  }

  //dim3 threads(sneighlist->bin_nmax,1,1);
  /* One block per bin (x*y bins in grid.x, z bins in grid.y), at most 128
   * threads; shared memory stages one int + 3 coords per thread. */
  dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1);
  dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1);
  //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_CFLOAT))*threads.x,sneighlist->bin_nmax);
  int buffer[20];
  buffer[0] = 1;   /* status word the kernel overwrites on overflow */
  buffer[1] = 0;
  CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int));
  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
  //cudaMemset(sdata->debugdata,0,100*sizeof(int));
  unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_CFLOAT)) * threads.x;
  MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
  //shared_size=2056;
  my_times starttime, endtime;
  my_gettime(CLOCK_REALTIME, &starttime);
  //for(int i=0;i<100;i++)
  {
    /* Three variants: overlap-comm build, or exclusion-aware / plain builds
     * selected by a compile-time template flag. */
    if(sdata->overlap_comm)
      NeighborBuildFullBin_OverlapComm_Kernel <<< grid, threads, shared_size>>>
      (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom);
    else {
      int exclude = sneighlist->nex_mol | sneighlist->nex_group | sneighlist->nex_type;

      if(exclude)
        NeighborBuildFullBin_Kernel<1> <<< grid, threads, shared_size>>>
        (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
      else
        NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>>
        (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
    }

    //NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_CFLOAT))*threads.x+sizeof(int)>>>
    //  (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);

    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
    my_gettime(CLOCK_REALTIME, &endtime);
    sdata->cuda_timings.neigh_build +=
      endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
    //dim3 threads,grid;
    CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int));

    /* Mark bonded (special) neighbors for molecular systems, but only if the
     * build itself did not report an overflow. */
    if(buffer[0] >= 0 && true && sdata->atom.molecular) {
      //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall);
      my_gettime(CLOCK_REALTIME, &starttime);
      int3 layout = getgrid(sdata->atom.nlocal, 0, 512);
      threads.x = layout.z;
      threads.y = 1;
      threads.z = 1;
      grid.x = layout.x;
      grid.y = layout.y;
      grid.z = 1;
      FindSpecial <<< grid, threads>>>(sdata->pair.use_block_per_atom);
      cudaThreadSynchronize();
      CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed");
      my_gettime(CLOCK_REALTIME, &endtime);
      sdata->cuda_timings.neigh_special +=
        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
    }
  }

  //printf("Neightime: %lf\n",sdata->cuda_timings.test1);
  CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
  //CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int));
  MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");)
  return buffer[0];
}
/* Build a full neighbor list with the brute-force O(N^2) all-pairs method
 * (no binning). Uploads the cutoff table and all device symbols the kernel
 * reads, launches one thread per local atom, and returns the status flag
 * read back from device buffer[0]. */
int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");)
  // initialize only on first call
  /* NOTE(review): `init` is NOT static (the /*static*/ marker was removed),
   * so this whole block actually re-runs on every call. */
  /*static*/ short init = 0;

  if(! init) {
    init = 1;

    // !! LAMMPS indexes atom types starting with 1 !!
    unsigned cuda_ntypes = sdata->atom.ntypes + 1;

    /* NOTE(review): this only warns and then proceeds; with too many types
     * the cutneighsq upload below would write past the compile-time array. */
    if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
      printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u "
             "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
             "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);

    /* Host staging copy of the squared neighbor cutoffs, flattened to
     * [i * cuda_ntypes + j]. */
    unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
    CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);

    if(sneighlist->cutneighsq) {
      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
          acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
          //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
        }
      }
    } else {
      /* No cutoff table available: cannot build. NOTE(review): acutneighsq
       * leaks on this path. */
      MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
      return 0;
    }

    /* Ensure the shared device buffer can hold the 20-int status header. */
    int size = 100;

    if(sdata->buffersize < size) {
      MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
      CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
      sdata->buffer = CudaWrapper_AllocCudaData(size);
      sdata->buffersize = size;
      sdata->buffer_new++;
      MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    }

    cudaMemcpyToSymbol(MY_AP(buffer)           , & sdata->buffer                   , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(cuda_ntypes)      , & cuda_ntypes                     , sizeof(unsigned));
    /* NOTE(review): this copies nx bytes of TABLE DATA into the cutneighsq
     * symbol, which is declared above as a pointer (CUDA_CFLOAT*); the
     * binned builder instead uploads a device pointer of sizeof(CUDA_CFLOAT*).
     * Suspicious — confirm which declaration this call is meant to target. */
    cudaMemcpyToSymbol(MY_AP(cutneighsq)       ,   acutneighsq                     , nx);
    cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0]   , sizeof(unsigned));
    cudaMemcpyToSymbol(MY_AP(firstneigh)       , & sneighlist->firstneigh.dev_data , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(ilist)            , & sneighlist->ilist     .dev_data , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(inum)             , & sneighlist->inum                , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nlocal)           , & sdata->atom.nlocal              , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nall)             , & sdata->atom.nall                , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nmax)             , & sdata->atom.nmax                , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(numneigh)         , & sneighlist->numneigh  .dev_data , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(type)             , & sdata->atom.type      .dev_data , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(x)                , & sdata->atom.x         .dev_data , sizeof(X_CFLOAT*));
    cudaMemcpyToSymbol(MY_AP(maxneighbors)     , & sneighlist->maxneighbors        , sizeof(int));
    free(acutneighsq);
  }

  /* One thread per local atom. */
  int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  /* buffer[0] is preset to 1 (success); the kernel overwrites it on failure. */
  int return_value = 1;
  CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int));
  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed");
  NeighborBuildFullNsq_Kernel <<< grid, threads>>> ();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
  int buffer[20];
  CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int) * 20);
  MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");)
  /* Returns the device-reported status word (the assignment is redundant
   * but harmless). */
  return return_value = buffer[0];
}

View File

@ -1,32 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef NEIGHBOR_CU_H_
#define NEIGHBOR_CU_H_
#include "cuda_shared.h"
// Host entry points for GPU neighbor-list construction:
//   Cuda_BinAtoms             -- sort local+ghost atoms into spatial bins
//   Cuda_NeighborBuildFullBin -- build a full neighbor list from binned atoms
//   Cuda_NeighborBuildFullNsq -- brute-force O(N^2) full neighbor list
// All return a status int (0 presumably indicates failure/overflow -- confirm
// against the host-side implementations).
extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
#endif /*NEIGHBOR_CU_H_*/

View File

@ -1,660 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define SBBITS 30
__global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
                               CUDA_CFLOAT rez_bin_size_x, CUDA_CFLOAT rez_bin_size_y, CUDA_CFLOAT rez_bin_size_z)
{
  // Sort all atoms (local + ghost, i < _nall) into spatial bins, one thread
  // per atom.  rez_bin_size_* are reciprocal bin edge lengths.
  //
  // Scratch layout inside _buffer:
  //   word 0..1   : status words (word 0 accumulates the overflow count)
  //   binned_x    : per-bin x,y,z coordinate planes, bin_nmax entries per bin
  //   bin_count   : number of atoms currently placed in each bin
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];

  if(i < _nall) {
    // copy atom position from global device memory to local register
    // in this 3 steps to get as much coalesced access as possible
    X_CFLOAT* my_x = _x + i;
    CUDA_CFLOAT x_i = *my_x;
    my_x += _nmax;
    CUDA_CFLOAT y_i = *my_x;
    my_x += _nmax;
    CUDA_CFLOAT z_i = *my_x;

    // calculate flat bin index; the +2 shifts into the ghost-bin border
    int bx = __float2int_rd(rez_bin_size_x * (x_i - _sublo[0])) + 2;
    int by = __float2int_rd(rez_bin_size_y * (y_i - _sublo[1])) + 2;
    int bz = __float2int_rd(rez_bin_size_z * (z_i - _sublo[2])) + 2;

    // branch-free clamp of bx/by/bz into [0, bin_dim_* - 1]
    bx -= bx * negativCUDA(1.0f * bx);
    bx -= (bx - bin_dim_x + 1) * negativCUDA(1.0f * bin_dim_x - 1.0f - 1.0f * bx);
    by -= by * negativCUDA(1.0f * by);
    by -= (by - bin_dim_y + 1) * negativCUDA(1.0f * bin_dim_y - 1.0f - 1.0f * by);
    bz -= bz * negativCUDA(1.0f * bz);
    bz -= (bz - bin_dim_z + 1) * negativCUDA(1.0f * bin_dim_z - 1.0f - 1.0f * bz);

    const unsigned j = bin_dim_z * (bin_dim_y * bx + by) + bz;

    // add new atom to bin, get bin-array position
    const unsigned k = atomicAdd(& bin_count[j], 1);

    if(k < bin_nmax) {
      binned_id [bin_nmax * j + k] = i;
      binned_x [3 * bin_nmax * j + k] = x_i;
      binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i;
      binned_x [3 * bin_nmax * j + k + 2 * bin_nmax] = z_i;
    } else {
      // normally, this should not happen: the bin is full.  Count the dropped
      // atom in status word 0 so the host can detect the overflow.
      // (Previously the atomicAdd result was stored in an unused local.)
      atomicAdd((int*) _buffer, 1);
      MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j);)
    }
  }
}
// Return 1 if the pair (i, j) is excluded from the neighbor list by any of
// the three LAMMPS exclusion mechanisms (type pair, group pair, or same
// molecule), 0 otherwise.
__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
{
  // Type-pair exclusion table.
  if(_nex_type && _ex_type[itype * _cuda_ntypes + jtype]) return 1;

  // Group-pair exclusions: the pair is excluded if it matches either
  // orientation (i in group1 / j in group2, or vice versa).
  for(int k = 0; k < _nex_group; k++) {
    if(((_mask[i] & _ex1_bit[k]) && (_mask[j] & _ex2_bit[k])) ||
       ((_mask[i] & _ex2_bit[k]) && (_mask[j] & _ex1_bit[k])))
      return 1;
  }

  // Intramolecular exclusions: only relevant when both atoms belong to the
  // same molecule.  (_nex_mol guard kept first so _molecule is only read
  // when molecular exclusions are active.)
  if(_nex_mol) {
    if(_molecule[i] == _molecule[j]) {
      for(int k = 0; k < _nex_mol; k++)
        if((_mask[i] & _ex_mol_bit[k]) && (_mask[j] & _ex_mol_bit[k])) return 1;
    }
  }

  return 0;
}
extern __shared__ CUDA_CFLOAT shared[];
// Search the first n.z entries of the special-neighbor tag list for 'tag'.
// n = (n12, n12+n13, n12+n13+n14) cumulative counts; the slot index of the
// match selects the corresponding flag component (flag.x for a 1-2 partner,
// flag.y for 1-3, flag.z for 1-4).  Returns 0 when tag is not special.
__device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
{
  // Find the index of the LAST occurrence of tag (matches the original
  // forward scan that kept overwriting on every hit); n.z means "absent".
  int k = n.z;

  for(int l = n.z - 1; l >= 0; l--) {
    if(list[l] == tag) {
      k = l;
      break;
    }
  }

  if(k < n.x) return flag.x;
  if(k < n.y) return flag.y;
  if(k < n.z) return flag.z;
  return 0;
}
// Build a full neighbor list from pre-binned atoms.  One thread block per
// bin; each thread owns one atom of the bin (processed in blockDim.x-sized
// tiles) and scans (1) its home bin and (2) the 26 surrounding bins, staging
// candidate coordinates through dynamic shared memory.
//   exclude    (template) : compile in the exclusion() tests
//   globcutoff > 0        : uniform squared cutoff; < 0: per-type-pair _cutneighsq
//   block_style != 0      : neighbors stored contiguously per atom (row-major);
//                           otherwise strided by natoms (column-major)
//   neighall              : also build lists for ghost atoms (natoms = _nall)
template <const unsigned int exclude>
__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style, bool neighall)
{
  int natoms = neighall ? _nall : _nlocal;
  //const bool domol=false;
  int bin_dim_z = gridDim.y;
  // Scratch layout in _buffer as produced by Binning_Kernel:
  // 2 status words, then binned coordinates, then per-bin counts.
  CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
  int bin_x = blockIdx.x / bin_dim_y;
  int bin_y = blockIdx.x - bin_x * bin_dim_y;
  int bin_z = blockIdx.y;
  int bin_c = bin_count[bin];

  CUDA_CFLOAT cut;

  if(globcutoff > 0)
    cut = globcutoff;

  int i = _nall;  // sentinel: "no atom assigned to this thread yet"
  CUDA_CFLOAT* my_x;
  CUDA_CFLOAT x_i, y_i, z_i;

  // Process the home bin's atoms in tiles of blockDim.x threads.
  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
    int actIdx = threadIdx.x + actOffset;
    // Dynamic shared memory: 3*blockDim.x coordinates followed by ids.
    CUDA_CFLOAT* other_x = shared;
    int* other_id = (int*) &other_x[3 * blockDim.x];

    if(actIdx < bin_c) {
      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
      x_i = *my_x;
      my_x += bin_nmax;
      y_i = *my_x;
      my_x += bin_nmax;
      z_i = *my_x;
    } else
      i = 2 * _nall;  // inactive thread for this tile (fails every i < natoms test)

    __syncthreads();
    int jnum = 0;
    int itype;

    if(i < natoms) {
      jnum = 0;
      _ilist[i] = i;
      itype = _type[i];
    }

    //__syncthreads();

    // --- pass 1: pairs within the home bin itself ---
    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
      int otherActIdx = threadIdx.x + otherActOffset;

      if(otherActIdx < bin_c) {
        if(otherActOffset == actOffset) {
          // Candidate tile is this thread's own tile: publish the registers.
          other_id[threadIdx.x] = i;
          other_x[threadIdx.x] = x_i;
          other_x[threadIdx.x + blockDim.x] = y_i;
          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
        } else {
          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
          other_x[threadIdx.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + blockDim.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
        }
      }

      __syncthreads();
      // Each thread starts its scan at its own slot and wraps around --
      // presumably to spread shared-memory reads across banks; confirm intent.
      int kk = threadIdx.x;

      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
        if(i < natoms) {
          kk++;
          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
          int j = other_id[kk];

          if(exclude && exclusion(i, j, itype, _type[j])) continue;

          if(globcutoff < 0) {
            int jtype = _type[j];
            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
          }

          CUDA_CFLOAT delx = x_i - other_x[kk];
          CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
          CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
          CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

          if(rsq <= cut && i != j) {
            if(jnum < _maxneighbors) {
              if(block_style)
                _neighbors[i * _maxneighbors + jnum] = j;
              else
                _neighbors[i + jnum * natoms] = j;
            }

            // Keep counting past capacity so overflow can be reported below.
            ++jnum;
          }
        }
      }

      __syncthreads();
    }

    // --- pass 2: pairs with the up-to-26 surrounding bins ---
    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;

          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;

          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;

          if(other_bin == bin) continue;  // home bin was handled in pass 1

          int obin_c = bin_count[other_bin];

          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
            int otherActIdx = otherActOffset + threadIdx.x;

            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
              other_x[threadIdx.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + blockDim.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
            }

            __syncthreads();

            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
              if(i < natoms) {
                int j = other_id[k];

                if(exclude && exclusion(i, j, itype, _type[j])) continue;

                if(globcutoff < 0) {
                  int jtype = _type[j];
                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
                }

                CUDA_CFLOAT delx = x_i - other_x[k];
                CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
                CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
                CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

                if(rsq <= cut && i != j) {
                  if(jnum < _maxneighbors) {
                    if(block_style)
                      _neighbors[i * _maxneighbors + jnum] = j;
                    else
                      _neighbors[i + jnum * natoms] = j;
                  }

                  ++jnum;
                }
              }
            }

            __syncthreads();
          }
        }

    // Report overflow: the host sees the negative count in the status word
    // and can reallocate with a larger _maxneighbors.
    if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;

    if(i < natoms)
      _numneigh[i] = jnum;
  }
}
// Post-process each local atom's freshly built neighbor list: neighbors that
// are 1-2 / 1-3 / 1-4 bonded "special" partners are, per _special_flag[1..3],
// either removed from the list (flag 0), kept unmarked (flag 1), or kept with
// the special level encoded into the high bits of the entry via
// j ^ (which << SBBITS).  One thread per list entry in _ilist.
__global__ void FindSpecial(int block_style)
{
  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int which;
  int tag_mask = 0;
  int3 spec_flag;
  int3 mynspecial = {0, 0, 1};

  if(ii >= _nlocal) return;

  int special_id[CUDA_MAX_NSPECIAL];
  int i = _ilist[ii];

  if(i >= _nlocal) return;

  int jnum = _numneigh[i];

  // Translate _special_flag into per-level actions: -1 = drop the neighbor,
  // 0 = keep unmarked, positive = mark with that level (1/2/3).
  if(_special_flag[1] == 0) spec_flag.x = -1;
  else if(_special_flag[1] == 1) spec_flag.x = 0;
  else spec_flag.x = 1;

  if(_special_flag[2] == 0) spec_flag.y = -1;
  else if(_special_flag[2] == 1) spec_flag.y = 0;
  else spec_flag.y = 2;

  if(_special_flag[3] == 0) spec_flag.z = -1;
  else if(_special_flag[3] == 1) spec_flag.z = 0;
  else spec_flag.z = 3;

  // Cumulative counts of 1-2 / 1-3 / 1-4 partners of atom i (nmax-strided).
  mynspecial.x = _nspecial[i];
  mynspecial.y = _nspecial[i + _nmax];
  mynspecial.z = _nspecial[i + 2 * _nmax];

  if(i < _nlocal) {
    int* list = &_special[i];

    // Gather special tags; tag_mask ORs all tag bits together and acts as a
    // cheap superset prefilter before the exact find_special() search.
    for(int k = 0; k < mynspecial.z; k++) {
      special_id[k] = list[k * _nmax];
      tag_mask = tag_mask | special_id[k];
    }
  }

  for(int k = 0; k < MIN(jnum, _maxneighbors); k++) {
    int j;

    if(block_style)
      j = _neighbors[i * _maxneighbors + k];
    else
      j = _neighbors[i + k * _nlocal];

    int tag_j = _tag[j];
    which = 0;

    // Quick reject: tag_j can only be in the special list if every set bit
    // of tag_j also appears in tag_mask.
    if((tag_mask & tag_j) == tag_j) {
      which = find_special(mynspecial, special_id, tag_j, spec_flag);

      if(which > 0) {
        // Mark: encode the special level in the entry's high bits.
        if(block_style)
          _neighbors[i * _maxneighbors + k] = j ^ (which << SBBITS);
        else
          _neighbors[i + k * _nlocal] = j ^ (which << SBBITS);
      } else if(which < 0) {
        // Drop: swap in the current last neighbor, shrink the list, and
        // re-examine slot k (the swapped-in entry has not been checked).
        if(block_style)
          _neighbors[i * _maxneighbors + k] = _neighbors[i * _maxneighbors + jnum - 1];
        else
          _neighbors[i + k * _nlocal] = _neighbors[i + (jnum - 1) * _nlocal];

        jnum--;
        k--;
      }
    }
  }

  _numneigh[i] = jnum;
}
// Variant of NeighborBuildFullBin_Kernel that, in addition to the full list,
// splits every local atom's neighbors into an "inner" list (neighbor is also
// local) and a "border" list (neighbor is a ghost, j >= _nlocal).  This lets
// force evaluation on inner pairs overlap with ghost-atom communication.
// Differences from the plain kernel: no exclusion support, local atoms only,
// and border list slots are allocated lazily with atomicAdd(_inum_border).
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style)
{
  int bin_dim_z = gridDim.y;
  // Scratch layout in _buffer as produced by Binning_Kernel:
  // 2 status words, then binned coordinates, then per-bin counts.
  CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
  int bin_x = blockIdx.x / bin_dim_y;
  int bin_y = blockIdx.x - bin_x * bin_dim_y;
  int bin_z = blockIdx.y;
  int bin_c = bin_count[bin];

  CUDA_CFLOAT cut;

  if(globcutoff > 0)
    cut = globcutoff;

  int i = _nall;  // sentinel: "no atom assigned to this thread yet"
  CUDA_CFLOAT* my_x;
  CUDA_CFLOAT x_i, y_i, z_i;

  // Process the home bin's atoms in tiles of blockDim.x threads.
  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
    int actIdx = threadIdx.x + actOffset;
    // Dynamic shared memory: 3*blockDim.x coordinates followed by ids.
    CUDA_CFLOAT* other_x = shared;
    int* other_id = (int*) &other_x[3 * blockDim.x];

    if(actIdx < bin_c) {
      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
      x_i = *my_x;
      my_x += bin_nmax;
      y_i = *my_x;
      my_x += bin_nmax;
      z_i = *my_x;
    } else
      i = 2 * _nall;  // inactive thread for this tile

    __syncthreads();
    int jnum = 0;          // total neighbors of atom i
    int jnum_border = 0;   // neighbors with j >= _nlocal (ghosts)
    int jnum_inner = 0;    // neighbors with j < _nlocal
    int i_border = -1;     // slot in the border list; allocated on first ghost
    int itype;

    if(i < _nlocal) {
      jnum = 0;
      _ilist[i] = i;
      itype = _type[i];
    }

    __syncthreads();

    // --- pass 1: pairs within the home bin itself ---
    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
      int otherActIdx = threadIdx.x + otherActOffset;

      if(otherActIdx < bin_c) {
        if(otherActOffset == actOffset) {
          // Candidate tile is this thread's own tile: publish the registers.
          other_id[threadIdx.x] = i;
          other_x[threadIdx.x] = x_i;
          other_x[threadIdx.x + blockDim.x] = y_i;
          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
        } else {
          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
          other_x[threadIdx.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + blockDim.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
        }
      }

      __syncthreads();
      // Staggered start index, wrapping around the tile -- presumably to
      // spread shared-memory reads across banks; confirm intent.
      int kk = threadIdx.x;

      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
        if(i < _nlocal) {
          kk++;
          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
          int j = other_id[kk];

          if(globcutoff < 0) {
            int jtype = _type[j];
            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
          }

          CUDA_CFLOAT delx = x_i - other_x[kk];
          CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
          CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
          CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

          if(rsq <= cut && i != j) {
            // First ghost neighbor: claim a border-list slot for atom i.
            if((j >= _nlocal) && (i_border < 0))
              i_border = atomicAdd(_inum_border, 1);

            if(jnum < _maxneighbors) {
              if(block_style) {
                _neighbors[i * _maxneighbors + jnum] = j;

                if(j >= _nlocal) {
                  _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
                } else {
                  _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
                }
              } else {
                _neighbors[i + jnum * _nlocal] = j;

                if(j >= _nlocal) {
                  _neighbors_border[i_border + jnum_border * _nlocal] = j;
                } else {
                  _neighbors_inner[i + jnum_inner * _nlocal] = j;
                }
              }
            }

            ++jnum;

            if(j >= _nlocal)
              jnum_border++;
            else
              jnum_inner++;
          }
        }
      }

      __syncthreads();
    }

    // --- pass 2: pairs with the up-to-26 surrounding bins ---
    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;

          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;

          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;

          if(other_bin == bin) continue;  // home bin was handled in pass 1

          int obin_c = bin_count[other_bin];

          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
            int otherActIdx = otherActOffset + threadIdx.x;

            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
              other_x[threadIdx.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + blockDim.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
            }

            __syncthreads();

            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
              if(i < _nlocal) {
                int j = other_id[k];

                if(globcutoff < 0) {
                  int jtype = _type[j];
                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
                }

                CUDA_CFLOAT delx = x_i - other_x[k];
                CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
                CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
                CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

                if(rsq <= cut && i != j) {
                  if((j >= _nlocal) && (i_border < 0))
                    i_border = atomicAdd(_inum_border, 1);

                  if(jnum < _maxneighbors) {
                    if(block_style) {
                      _neighbors[i * _maxneighbors + jnum] = j;

                      if(j >= _nlocal) {
                        _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
                      } else {
                        _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
                      }
                    } else {
                      _neighbors[i + jnum * _nlocal] = j;

                      if(j >= _nlocal) {
                        _neighbors_border[i_border + jnum_border * _nlocal] = j;
                      } else {
                        _neighbors_inner[i + jnum_inner * _nlocal] = j;
                      }
                    }
                  }

                  ++jnum;

                  if(j >= _nlocal)
                    jnum_border++;
                  else
                    jnum_inner++;
                }
              }
            }

            __syncthreads();
          }
        }

    // Report overflow: negative count in status word 0 for the host.
    if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;

    if(i < _nlocal) {
      _numneigh[i] = jnum;
      _numneigh_inner[i] = jnum_inner;

      if(i_border >= 0) _numneigh_border[i_border] = jnum_border;

      if(i_border >= 0) _ilist_border[i_border] = i;
    }
  }
}
// Brute-force O(N^2) full neighbor build: each local atom i (one thread per
// atom) tests every atom j in local+ghost against the per-type-pair squared
// cutoff and appends hits to its _firstneigh list.
//
// Fixes relative to the original:
//  * removed leftover hard-coded debug instrumentation that dumped the
//    neighbor list of atom 151 into the comm buffer on every build;
//  * removed two __syncthreads() calls that sat inside the divergent
//    if(i < _nlocal) branch (undefined behavior in CUDA when not all threads
//    of a block reach the barrier); no shared memory is used here, so the
//    barriers served no purpose.
__global__ void NeighborBuildFullNsq_Kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // buffer[0] is preset by the host (to 1) and cleared on list overflow.
  int* buffer = (int*) _buffer;

  if(i < _nlocal) {
    // Load x/y/z of atom i from the three nmax-strided coordinate planes.
    X_CFLOAT* my_x = _x + i;
    CUDA_CFLOAT x_i = *my_x;
    my_x += _nmax;
    CUDA_CFLOAT y_i = *my_x;
    my_x += _nmax;
    CUDA_CFLOAT z_i = *my_x;

    int jnum = 0;
    int* jlist = _firstneigh[i];
    _ilist[i] = i;
    int itype = _type[i];

    for(int j = 0; j < _nall; ++j) {
      my_x = _x + j;
      CUDA_CFLOAT x_j = *my_x;
      my_x += _nmax;
      CUDA_CFLOAT y_j = *my_x;
      my_x += _nmax;
      CUDA_CFLOAT z_j = *my_x;
      CUDA_CFLOAT delx = x_i - x_j;
      CUDA_CFLOAT dely = y_i - y_j;
      CUDA_CFLOAT delz = z_i - z_j;
      CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
      int jtype = _type[j];

      if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {
        // Stop storing past capacity, but keep counting so the overflow
        // can be flagged below.
        if(jnum < _maxneighbors)
          jlist[jnum] = j;

        ++jnum;
      }
    }

    // Signal overflow to the host by clearing the success flag.
    if(jnum > _maxneighbors) buffer[0] = 0;

    _numneigh[i] = jnum;
  }
}

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _sigma MY_AP(coeff2)
#define _a MY_AP(coeff3)
#define _c MY_AP(coeff4)
#define _d MY_AP(coeff5)
#include "pair_born_coul_long_cuda_cu.h"
#include "pair_born_coul_long_cuda_kernel_nc.cu"
#include <time.h>
// One-time device setup for pair_style born/coul/long: upload the 5 per-type
// coefficient arrays (mapped above to coeff1..coeff5: rhoinv, sigma, a, c, d).
// The 'true' presumably enables the Coulomb/charge setup required by the
// coul/long variant -- confirm against Cuda_Pair_Init_AllStyles.
void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata)
{
  Cuda_Pair_Init_AllStyles(sdata, 5, true);
}
// Host-side driver for pair_style born/coul/long: lazily upload coefficients
// on the first call, let the shared pre-kernel helper size the launch, then
// dispatch the templated pair kernel on the pair stream and run the common
// post-kernel bookkeeping.
void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static short init = 0;

  if(init == 0) {
    Cuda_PairBornCoulLongCuda_Init(sdata);
    init = 1;
  }

  dim3 grid, threads;
  int sharedperproc;
  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
  const size_t shared_bytes = sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x;

  if(sdata->pair.use_block_per_atom)
    Pair_Kernel_BpA<PAIR_BORN, COUL_LONG, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
  else
    Pair_Kernel_TpA<PAIR_BORN, COUL_LONG, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _sigma
#undef _a
#undef _c
#undef _d

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host entry point for the born/coul/long pair style; the prototype differs
// when the binned-neighbor code path (CUDA_USE_BINNING) is compiled in.
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
#endif

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Evaluate the Born-Mayer-Huggins pair interaction at squared distance rsq
// for the flattened type pair ij_type.  Returns force/r^2 scaled by
// factor_lj (so the caller multiplies by the displacement components);
// when eflag is set, the pair energy -- shifted by the tabulated offset --
// is accumulated into evdwl.  Expression structure is kept exactly as in
// the original to preserve floating-point results.
__device__ inline F_CFLOAT PairBornCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
{
  const F_CFLOAT inv_r2 = F_F(1.0) / rsq;
  const F_CFLOAT r = _RSQRT_(inv_r2);   // r = sqrt(rsq)
  const F_CFLOAT inv_r6 = inv_r2 * inv_r2 * inv_r2;
  const F_CFLOAT exp_term = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]);
  const F_CFLOAT fborn = _a[ij_type] * _rhoinv[ij_type] * r * exp_term -
                         F_F(6.0) * _c[ij_type] * inv_r6 + F_F(8.0) * _d[ij_type] * inv_r2 * inv_r6;

  if(eflag) evdwl += factor_lj * (_a[ij_type] * exp_term - _c[ij_type] * inv_r6
                                  + _d[ij_type] * inv_r2 * inv_r6 - _offset[ij_type]);

  return factor_lj * fborn * inv_r2;
}

View File

@ -1,75 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_coul_cut_cuda_cu.h"
#include <time.h>
// One-time device setup for pair_style buck/coul/cut: upload the 5 per-type
// coefficient arrays (mapped above to coeff1..coeff5: rhoinv, buck1, buck2,
// a, c).  The 'true' presumably enables the Coulomb/charge setup required by
// the coul variant -- confirm against Cuda_Pair_Init_AllStyles.
void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata)
{
  Cuda_Pair_Init_AllStyles(sdata, 5, true);
}
// Host-side driver for pair_style buck/coul/cut: lazily upload coefficients
// on the first call, let the shared pre-kernel helper size the launch, then
// dispatch the templated pair kernel on the pair stream and run the common
// post-kernel bookkeeping.
void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static short init = 0;

  if(init == 0) {
    Cuda_PairBuckCoulCutCuda_Init(sdata);
    init = 1;
  }

  dim3 grid, threads;
  int sharedperproc;
  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
  const size_t shared_bytes = sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x;

  if(sdata->pair.use_block_per_atom)
    Pair_Kernel_BpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
  else
    Pair_Kernel_TpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host entry point for the buck/coul/cut pair style; the prototype differs
// when the binned-neighbor code path (CUDA_USE_BINNING) is compiled in.
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
#endif

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_coul_long_cuda_cu.h"
#include <time.h>
// One-time device setup for pair_style buck/coul/long: upload the 5 per-type
// coefficient arrays (mapped above to coeff1..coeff5: rhoinv, buck1, buck2,
// a, c).  The 'true' presumably enables the Coulomb/charge setup required by
// the coul/long variant -- confirm against Cuda_Pair_Init_AllStyles.
void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata)
{
  Cuda_Pair_Init_AllStyles(sdata, 5, true);
}
// Host-side driver for pair_style buck/coul/long: lazily upload coefficients
// on the first call, let the shared pre-kernel helper size the launch, then
// dispatch the templated pair kernel on the pair stream and run the common
// post-kernel bookkeeping.
void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static short init = 0;

  if(init == 0) {
    Cuda_PairBuckCoulLongCuda_Init(sdata);
    init = 1;
  }

  dim3 grid, threads;
  int sharedperproc;
  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
  const size_t shared_bytes = sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x;

  if(sdata->pair.use_block_per_atom)
    Pair_Kernel_BpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
  else
    Pair_Kernel_TpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host entry point for the buck/coul/long pair style; the prototype differs
// when the binned-neighbor code path (CUDA_USE_BINNING) is compiled in.
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
#endif

View File

@ -1,77 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_cuda_cu.h"
#include "pair_buck_cuda_kernel_nc.cu"
#include <time.h>
// One-time device setup for pair_style buck: upload the 5 per-type
// coefficient arrays (mapped above to coeff1..coeff5: rhoinv, buck1, buck2,
// a, c).  Unlike the coul variants, no third argument is passed here --
// plain buck needs no Coulomb/charge setup.
void Cuda_PairBuckCuda_Init(cuda_shared_data* sdata)
{
  Cuda_Pair_Init_AllStyles(sdata, 5);
}
// Host-side driver for pair_style buck (no Coulomb): lazily upload
// coefficients on the first call, let the shared pre-kernel helper size the
// launch (note the 'false' -- no Coulomb data needed), then dispatch the
// templated pair kernel and run the common post-kernel bookkeeping.
void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static short init = 0;

  if(init == 0) {
    Cuda_PairBuckCuda_Init(sdata);
    init = 1;
  }

  dim3 grid, threads;
  int sharedperproc;
  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
  const size_t shared_bytes = sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x;

  if(sdata->pair.use_block_per_atom)
    Pair_Kernel_BpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
  else
    Pair_Kernel_TpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host entry point for the buck pair style; the prototype differs when the
// binned-neighbor code path (CUDA_USE_BINNING) is compiled in.
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
#endif

Some files were not shown because too many files have changed in this diff Show More