mirror of https://github.com/lammps/lammps.git
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15268 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
fd27214f7d
commit
65c1e16401
|
@ -1,4 +0,0 @@
|
|||
#Makefile for liblammpscuda.a
|
||||
#No need to modify anything here! The CUDA path is inserted into Makefile.common
|
||||
|
||||
include Makefile.cudalib
|
|
@ -1,123 +0,0 @@
|
|||
#Common commandline argument interpreter for compilation with lammpscuda (USER-CUDA) installed
|
||||
|
||||
# make options:
|
||||
# emu=1 switch to cuda emulation mode (otherwise: use gpu)
|
||||
# dbg=1 print a lot of debugging output during runtime
|
||||
# verbose=1 output nvcc command line during compilation
|
||||
# keep=1 do not delete temporary compilation files (.ii, .cubin, ...)
|
||||
# cufft=1 use cuda's fast fourier transformation lib "cufft" where possible (otherwise: use cpu fftw)
|
||||
# binning=1 create virtual particle grid (neighbor-lists otherwise); currently this is not supported
|
||||
# precision=1 single precision (global setting)
|
||||
# precision=2 double precision (global setting)
|
||||
|
||||
SHELL = /bin/sh
|
||||
|
||||
# System-specific settings
|
||||
|
||||
CUDA_INSTALL_PATH = /usr/local/cuda
|
||||
#CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
|
||||
# e.g. in Gentoo
|
||||
# CUDA_INSTALL_PATH = /opt/cuda
|
||||
|
||||
#//////////////////////////////////////////////////////////////////////////////////////////////
|
||||
# no need to change anything below this line
|
||||
#//////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#use CPU FFT if cufft=0 is requested.
|
||||
FALLBACK_FFT = 1
|
||||
|
||||
#default settings for compiler switches
|
||||
ifdef COMPILELIB
|
||||
include Makefile.defaults
|
||||
else
|
||||
include ../../lib/cuda/Makefile.defaults
|
||||
endif
|
||||
|
||||
#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer}
|
||||
|
||||
CUDA_FLAGS := -I${CUDA_INSTALL_PATH}/include -DUNIX
|
||||
CUDA_USRLIB_CONDITIONAL := -L${CUDA_INSTALL_PATH}/lib -L${CUDA_INSTALL_PATH}/lib64
|
||||
|
||||
# debug setting
|
||||
ifeq ($(strip $(dbg)), 1)
|
||||
CUDA_FLAGS += -D_DEBUG -g
|
||||
NVCC_FLAGS += -g -G
|
||||
else
|
||||
NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O3
|
||||
endif
|
||||
|
||||
# skip timing on Mac and Windows manually
|
||||
ifeq ($(strip $(prec_timer)), 0)
|
||||
CUDA_FLAGS += -DNO_PREC_TIMING
|
||||
endif
|
||||
|
||||
# set fft routine
|
||||
ifeq ($(strip $(cufft)), 0)
|
||||
ifneq ($(FALLBACK_FFT), 1)
|
||||
FFT_INC = -DFFT_NONE
|
||||
FFT_PATH =
|
||||
FFT_LIB =
|
||||
CUDA_FLAGS += -DFFT_NONE
|
||||
endif
|
||||
else
|
||||
CUDA_FLAGS += -DFFT_CUFFT
|
||||
CUDA_USRLIB_CONDITIONAL += -lcufft
|
||||
endif
|
||||
|
||||
# make global precision setting
|
||||
|
||||
ifeq ($(strip $(precision)), 1)
|
||||
CUDA_FLAGS += -DCUDA_PRECISION=1
|
||||
else
|
||||
ifeq ($(strip $(precision)), 3)
|
||||
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2
|
||||
else
|
||||
ifeq ($(strip $(precision)), 4)
|
||||
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2
|
||||
else
|
||||
CUDA_FLAGS += -DCUDA_PRECISION=2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# make architecture settings
|
||||
ifeq ($(strip $(arch)), 13)
|
||||
CUDA_FLAGS += -DCUDA_ARCH=13
|
||||
SMVERSIONFLAGS := -arch sm_13
|
||||
else
|
||||
ifeq ($(strip $(arch)), 20)
|
||||
CUDA_FLAGS += -DCUDA_ARCH=20
|
||||
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
|
||||
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
|
||||
SMVERSIONFLAGS := -arch sm_20
|
||||
else
|
||||
ifeq ($(strip $(arch)), 21)
|
||||
CUDA_FLAGS += -DCUDA_ARCH=20
|
||||
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
|
||||
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
|
||||
SMVERSIONFLAGS := -arch sm_21
|
||||
else
|
||||
ifeq ($(strip $(arch)), 30)
|
||||
CUDA_FLAGS += -DCUDA_ARCH=20
|
||||
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
|
||||
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
|
||||
SMVERSIONFLAGS := -arch sm_30
|
||||
else
|
||||
ifeq ($(strip $(arch)), 35)
|
||||
CUDA_FLAGS += -DCUDA_ARCH=20
|
||||
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
|
||||
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
|
||||
SMVERSIONFLAGS := -arch sm_35
|
||||
else
|
||||
CUDA_FLAGS += -DCUDA_ARCH=99
|
||||
SMVERSIONFLAGS := -arch sm_13
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \
|
||||
-I$(CUDA_INSTALL_PATH)/include
|
|
@ -1,87 +0,0 @@
|
|||
#Makefile for liblammpscuda.a
|
||||
#No need to modify anything here! The CUDA path is inserted into Makefile.common
|
||||
|
||||
.DEFAULT: lib
|
||||
|
||||
COMPILELIB := 1
|
||||
|
||||
SHELL = /bin/sh
|
||||
|
||||
CUDA_SRC_DIR = ../cuda
|
||||
CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake
|
||||
CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) )
|
||||
include $(CUDA_TEMP)
|
||||
CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu)
|
||||
CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o)
|
||||
CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO))
|
||||
CUDA_DEP = $(CUDA_OBJ:.o=.d)
|
||||
|
||||
NVCC_FLAGS :=
|
||||
|
||||
VPATH = $(CUDA_SRC_DIR)
|
||||
|
||||
#rewriting default settings if new ones are specified
|
||||
|
||||
|
||||
ifdef precision
|
||||
tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults)
|
||||
endif
|
||||
|
||||
ifdef arch
|
||||
tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults)
|
||||
endif
|
||||
|
||||
ifdef cufft
|
||||
tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults)
|
||||
endif
|
||||
|
||||
ifdef dbg
|
||||
tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults)
|
||||
endif
|
||||
|
||||
ifdef prec_timer
|
||||
tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults)
|
||||
endif
|
||||
|
||||
include Makefile.common
|
||||
|
||||
tmp := $(shell sed -i '2 d' Makefile.lammps)
|
||||
tmp := $(shell sed -i '2 d' Makefile.lammps)
|
||||
tmp := $(shell sed -i '1a CUDA_FLAGS := ${CUDA_FLAGS}' Makefile.lammps)
|
||||
tmp := $(shell sed -i '2a CUDA_USRLIB_CONDITIONAL := ${CUDA_USRLIB_CONDITIONAL}' Makefile.lammps)
|
||||
|
||||
# verbose nvcc output during compilation
|
||||
ifeq ($(verbose), 1)
|
||||
VERBOSE :=
|
||||
NVCC_FLAGS += --ptxas-options=-v
|
||||
else
|
||||
VERBOSE := @
|
||||
endif
|
||||
|
||||
# keep temporary compilation files of nvcc
|
||||
ifeq ($(keep), 1)
|
||||
NVCC_FLAGS += -keep -Xptxas="--verbose"
|
||||
endif
|
||||
|
||||
|
||||
NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
|
||||
CUDA_INCLUDES = -I./ -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA
|
||||
CUDA_USRLIB =
|
||||
|
||||
# Link target
|
||||
|
||||
lib: $(CUDA_OBJ)
|
||||
$(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a
|
||||
|
||||
clean:
|
||||
rm $(CUDA_SRC_DIR)/*.o
|
||||
rm liblammpscuda.a
|
||||
|
||||
# Library target
|
||||
|
||||
|
||||
# Cuda compilation rules
|
||||
|
||||
%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h
|
||||
$(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) $(SMVERSIONFLAGS) -o $@ -c $<
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
|
||||
#precision setting: 1 single, 2 double, 4 mixed
|
||||
precision ?= 2
|
||||
|
||||
#verbose setting: 0 no, 1 yes
|
||||
verbose ?= 1
|
||||
|
||||
#GPU architecture (compute capability): 13, 20, 21, 35
|
||||
arch ?= 21
|
||||
|
||||
#Using cufft (should not be changed)
|
||||
cufft ?= 1
|
||||
|
||||
#Using dbg mode
|
||||
dbg ?= 0
|
||||
|
||||
#On mac machines set this to 0 in order to avoid usage of linux specific precision timer
|
||||
prec_timer ?= 1
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
# Settings that the LAMMPS build will import when this package library is used
|
||||
CUDA_FLAGS := -I/usr/local/cuda/include -DUNIX -DFFT_CUFFT -DCUDA_PRECISION=2 -DCUDA_ARCH=20
|
||||
CUDA_USRLIB_CONDITIONAL := -L/usr/local/cuda/lib -L/usr/local/cuda/lib64
|
||||
|
||||
user-cuda_SYSINC = ${CUDA_FLAGS}
|
||||
user-cuda_SYSLIB = -lcufft -lcuda -lcudart
|
||||
user-cuda_SYSPATH = $(CUDA_USRLIB_CONDITIONAL)
|
|
@ -1,59 +0,0 @@
|
|||
USER-CUDA library
|
||||
Christian Trott, crtrott at sandia.gov
|
||||
|
||||
-------------------------------------------------------------------
|
||||
|
||||
This directory has source files to build a library that LAMMPS links
|
||||
against when using the USER-CUDA package.
|
||||
|
||||
This library must be built before LAMMPS is built, so LAMMPS can link
|
||||
against it. The build process also write settings into the
|
||||
Makefile.lammps file which are used when files in the src/USER-CUDA
|
||||
package are compiled.
|
||||
|
||||
Thus if you re-build this library (e.g. for a different precision),
|
||||
you MUST re-compile the src/USER-CUDA files as well. You can force
|
||||
this to happen by uninstalling, then re-installing the USER-CUDA
|
||||
package (make no-user-cuda; make yes-user-cuda) before doing
|
||||
a LAMMPS build.
|
||||
|
||||
Build this library in two steps. First type:
|
||||
|
||||
make OPTIONS
|
||||
|
||||
where OPTIONS is one or more of the following settings:
|
||||
|
||||
precision=N to set the precision level
|
||||
N = 1 for single precision (default)
|
||||
N = 2 for double precision
|
||||
N = 3 for positions in double precision
|
||||
N = 4 for positions and velocities in double precision
|
||||
arch=M to set GPU compute capability
|
||||
M = 20 for CC2.0 (GF100/110, e.g. C2050,GTX580,GTX470) (default)
|
||||
M = 21 for CC2.1 (GF104/114, e.g. GTX560, GTX460, GTX450)
|
||||
M = 13 for CC1.3 (GF200, e.g. C1060, GTX285)
|
||||
prec_timer=0/1 to use hi-precision timers
|
||||
0 = do not use them (default)
|
||||
1 = use these timers
|
||||
this is usually only useful for Mac machines
|
||||
dbg=0/1 to activate debug mode
|
||||
0 = no debug mode (default)
|
||||
1 = yes debug mode
|
||||
this is only useful for developers
|
||||
cufft=1 to determine usage of CUDA FFT library
|
||||
0 = no CUFFT support (default)
|
||||
in the future other CUDA-enabled FFT libraries might be supported
|
||||
|
||||
This will write settings to the Makefile.defaults file.
|
||||
|
||||
Then type "make" with with no arguments to build the library with the
|
||||
new settings.
|
||||
|
||||
After the second make, two files should exist in this directory:
|
||||
|
||||
liblammpscuda.a the library LAMMPS will link against
|
||||
Makefile.lammps settings the LAMMPS Makefile will import
|
||||
|
||||
Makefile.lammps is created by the make command and will have settings
|
||||
consistent with the OPTIONS you selected. It is used by the LAMMPS
|
||||
build, both for compile-time and link-time settings.
|
|
@ -1,85 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
const unsigned int ANGLE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
|
||||
|
||||
#include "atom_vec_angle_cuda_cu.h"
|
||||
|
||||
void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
return Cuda_AtomVecCuda_Init<ANGLE_DATA_MASK>(sdata);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_
|
||||
#define ATOM_VEC_ANGLE_CUDA_CU_H_
|
||||
|
||||
extern "C" void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
|
||||
extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
|
||||
extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
|
||||
extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
|
||||
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
|
||||
|
||||
#endif /*ATOM_VEC_ANGLE2_CUDA_CU_H_*/
|
|
@ -1,85 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
const unsigned int ATOMIC_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
|
||||
|
||||
#include "atom_vec_atomic_cuda_cu.h"
|
||||
|
||||
void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
return Cuda_AtomVecCuda_Init<ATOMIC_DATA_MASK>(sdata);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
|
||||
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
|
||||
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_
|
||||
#define ATOM_VEC_ATOMIC_CUDA_CU_H_
|
||||
|
||||
extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
|
||||
extern "C" int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
|
||||
extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
|
||||
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
|
||||
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
|
||||
|
||||
#endif /*ATOM_VEC_ATOMIC2_CUDA_CU_H_*/
|
|
@ -1,85 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
const unsigned int CHARGE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
|
||||
|
||||
#include "atom_vec_charge_cuda_cu.h"
|
||||
|
||||
void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
return Cuda_AtomVecCuda_Init<CHARGE_DATA_MASK>(sdata);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
|
||||
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
|
||||
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_
|
||||
#define ATOM_VEC_CHARGE_CUDA_CU_H_
|
||||
|
||||
extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
|
||||
extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
|
||||
extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
|
||||
extern "C" int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
|
||||
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
|
||||
|
||||
#endif /*ATOM_VEC_CHARGE2_CUDA_CU_H_*/
|
|
@ -1,628 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX atom_vec_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "cuda_wrapper_cu.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "atom_vec_cuda_kernel.cu"
|
||||
|
||||
int AtomVecCuda_CountDataItems(unsigned int data_mask)
|
||||
{
|
||||
int n = 0;
|
||||
|
||||
if(data_mask & X_MASK) n += 3;
|
||||
|
||||
if(data_mask & V_MASK) n += 3;
|
||||
|
||||
if(data_mask & F_MASK) n += 3;
|
||||
|
||||
if(data_mask & TAG_MASK) n++;
|
||||
|
||||
if(data_mask & TYPE_MASK) n++;
|
||||
|
||||
if(data_mask & MASK_MASK) n++;
|
||||
|
||||
if(data_mask & IMAGE_MASK) n++;
|
||||
|
||||
if(data_mask & Q_MASK) n++;
|
||||
|
||||
if(data_mask & MOLECULE_MASK) n++;
|
||||
|
||||
if(data_mask & RMASS_MASK) n++;
|
||||
|
||||
if(data_mask & RADIUS_MASK) n++;
|
||||
|
||||
if(data_mask & DENSITY_MASK) n++;
|
||||
|
||||
if(data_mask & OMEGA_MASK) n += 3;
|
||||
|
||||
if(data_mask & TORQUE_MASK) n++;
|
||||
|
||||
//if(data_mask & NSPECIAL_MASK) n+=3;
|
||||
return n;
|
||||
}
|
||||
|
||||
void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
|
||||
{
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_AtomVecCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
|
||||
sdata->buffer = CudaWrapper_AllocCudaData(size);
|
||||
sdata->buffersize = size;
|
||||
sdata->buffer_new++;
|
||||
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
|
||||
}
|
||||
|
||||
/* Refresh the kernels' __constant__ symbols (MY_AP(...)) with the current
   atom counts and the current device-array base pointers.  Must be re-run
   whenever nmax changes, because reallocation moves the per-atom arrays.
   Optional per-atom arrays are only uploaded when selected by data_mask. */
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal          , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax            , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(tag)     , & sdata->atom.tag  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type)    , & sdata->atom.type .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(image)   , & sdata->atom.image.dev_data, sizeof(int*));

  if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q)       , & sdata->atom.q    .dev_data, sizeof(F_CFLOAT*));

  if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_AP(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*));

  /* NOTE(review): radius/density/rmass/omega are float-typed arrays elsewhere
     in this file, yet their pointers are copied with sizeof(int*).  All object
     pointers have the same size, so this is harmless, but the types look
     inconsistent — confirm against the symbol declarations before changing. */
  if(data_mask & RADIUS_MASK) cudaMemcpyToSymbol(MY_AP(radius)   , & sdata->atom.radius.dev_data, sizeof(int*));

  if(data_mask & DENSITY_MASK) cudaMemcpyToSymbol(MY_AP(density)  , & sdata->atom.density.dev_data, sizeof(int*));

  if(data_mask & RMASS_MASK) cudaMemcpyToSymbol(MY_AP(rmass)    , & sdata->atom.rmass.dev_data, sizeof(int*));

  if(data_mask & OMEGA_MASK) cudaMemcpyToSymbol(MY_AP(omega)    , & sdata->atom.omega.dev_data, sizeof(int*));

  //if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_AP(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) );
  cudaMemcpyToSymbol(MY_AP(flag)     , & sdata->flag, sizeof(int*));
}
|
||||
|
||||
/* One-time / per-step initialization for the atom-vector kernels: refresh the
   array pointers if nmax changed, then upload box dimensions and sub-domain
   bounds plus the device flag pointer to the __constant__ symbols. */
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
{
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n");)

  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n");)
  // prd is an array (no '&'); sublo/subhi are uploaded as 3-component vectors
  cudaMemcpyToSymbol(MY_AP(prd)   , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(flag)  , & sdata->flag, sizeof(int*));
  // make sure all symbol uploads completed before kernels launch
  cudaThreadSynchronize();
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n");)
}
|
||||
|
||||
|
||||
/* Forward-communication pack: gather the data_mask-selected fields of the
   atoms in send list 'iswap' into a contiguous buffer, applying the periodic
   image shift (dx,dy,dz) to positions.  If overlap_comm is off, the packed
   data is downloaded into host buffer buf_send; otherwise it stays in the
   per-swap device buffer.  Returns the number of packed values (n * items). */
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  my_times time1, time2;

  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = (n * n_data_items) * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // periodic shift applied to positions of sent atoms
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // triclinic box: include tilt-factor contributions
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    // clear device error flag checked after the kernel
    cudaMemset(sdata->flag, 0, sizeof(int));

    my_gettime(CLOCK_REALTIME, &time1);

    // with overlap_comm the packed data goes straight to the per-swap device buffer
    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
    Cuda_AtomVecCuda_PackComm_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_pack +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");

    if(not sdata->overlap_comm)
      cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);

    //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_download +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

    // kernel raises this flag when a sendlist entry is out of range
    int aflag;
    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);

    if(aflag != 0) printf("aflag PackComm: %i\n", aflag);

    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
  }

  return n_data_items * n;
}
|
||||
|
||||
|
||||
/* Forward-communication "self" pack: the receiving ghost atoms live on the
   same process, so the kernel copies the selected fields directly from the
   sendlist atoms into the ghost slots starting at 'first' — no host buffer
   is involved.  Positions get the periodic shift (dx,dy,dz).
   Returns the number of values that would have been communicated.
   Fix vs. original: removed an unused 'static int count' debug counter that
   was incremented but never read. */
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");)
  my_times time1, time2;

  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = (n * n_data_items) * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // periodic shift applied to positions of copied atoms
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // triclinic box: include tilt-factor contributions
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed");

    Cuda_AtomVecCuda_PackComm_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_self +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel execution failed");
  }

  return n_data_items * n;
}
|
||||
|
||||
|
||||
/* Forward-communication unpack: scatter received values into the ghost-atom
   slots starting at 'first'.  Without overlap_comm (or for iswap < 0) the
   host buffer buf_recv is first uploaded to the device scratch buffer;
   otherwise the data is already in the per-swap device receive buffer. */
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
  my_times time1, time2;

  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = (n * n_data_items) * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);

    // upload only when the data did not already arrive in a device buffer
    if(not sdata->overlap_comm || iswap < 0)
      cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_upload +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
    Cuda_AtomVecCuda_UnpackComm_Kernel<data_mask> <<< grid, threads, 0>>>(n, first, buf);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_kernel_unpack +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

    CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed");
  }
}
|
||||
|
||||
/* Build the list of atoms leaving this sub-domain along dimension 'dim'.
   The kernel writes into sdata->buffer laid out as double values: slot 0
   holds the atom count (written as an int via atomicAdd — the first
   sizeof(int) bytes of the first double), slots 1..count hold atom indices.
   'n' limits how many list entries fit in buf_send.
   Returns the number of leaving atoms found (may exceed n - 1, in which
   case the caller must grow the buffer and retry). */
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... start dim %i \n", dim);)
  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: pre Kernel execution failed");
  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
  Cuda_AtomVecCuda_Init<data_mask>(sdata);
  int size = n * sizeof(double);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // zero the int counter stored in the first buffer slot
  cudaMemset((int*)(sdata->buffer), 0, sizeof(int));

  // per-block shared-memory scan needs one int per thread (+1 for the block offset)
  int3 layout = getgrid(sdata->atom.nlocal, sizeof(int), 256, true);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);

  Cuda_AtomVecCuda_PackExchangeList_Kernel <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (n - 1, dim);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed");

  my_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.comm_exchange_kernel_pack +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

  // fetch the count first; re-fetch the full list only if it fits in buf_send
  cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost);
  int return_value = ((int*) buf_send)[0];

  if(n > 1 + return_value)
    cudaMemcpy(buf_send, sdata->buffer, (1 + return_value)*sizeof(double), cudaMemcpyDeviceToHost);

  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed");

  my_gettime(CLOCK_REALTIME, &time1);
  sdata->cuda_timings.comm_exchange_download +=
    time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n");)
  return return_value;
}
|
||||
|
||||
/* Pack the nsend leaving atoms (indices already stored in sdata->buffer by
   PackExchangeList) into a double-typed exchange buffer and download it to
   buf_send.  Each packed record has n_data_items = selected fields + 1
   (the extra slot keeps the atom index).  The kernel simultaneously
   compacts the local arrays using 'copylist'.
   Returns the number of doubles written into buf_send. */
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... start \n");)

  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  // +1: first slot of each record carries the atom index
  int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1;
  int size = (nsend * n_data_items + 1) * sizeof(double);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  cudaMemset((int*)(sdata->buffer), 0, sizeof(int));

  int3 layout = getgrid(nsend, 0);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);

  Cuda_AtomVecCuda_PackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(nsend, (int*) copylist);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed");

  my_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.comm_exchange_kernel_pack +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

  cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);

  my_gettime(CLOCK_REALTIME, &time1);
  sdata->cuda_timings.comm_exchange_download +=
    time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... done\n");)
  return nsend * n_data_items + 1;
}
|
||||
|
||||
|
||||
/* Unpack atoms received during exchange: upload the host buffer, then let the
   kernel accept every atom whose exchange-dimension coordinate lies in this
   sub-domain, appending it after nlocal.  The device flag counts accepted
   atoms (incremented via atomicAdd in the kernel); copylist records where
   each record landed (-1 if rejected).
   Returns the number of accepted atoms. */
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
  // +1: first slot of each record carries the atom index
  int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1;

  int size = (nsend * n_data_items + 1) * sizeof(double);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  cudaMemcpyToSymbol(MY_AP(flag)    , & sdata->flag, sizeof(int*));

  // reset the accepted-atom counter
  cudaMemset((int*)(sdata->flag), 0, sizeof(int));

  if(nsend) {
    int3 layout = getgrid(nsend, 0);
    dim3 threads(layout.z, 1, 1);
    dim3 grid(layout.x, layout.y, 1);

    if(sdata->atom.nlocal > 0) {
      my_times time1, time2;
      my_gettime(CLOCK_REALTIME, &time1);

      cudaMemcpy(sdata->buffer, buf_send , size, cudaMemcpyHostToDevice);

      my_gettime(CLOCK_REALTIME, &time2);
      sdata->cuda_timings.comm_exchange_upload +=
        time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

      Cuda_AtomVecCuda_UnpackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(sdata->exchange_dim, nsend, (int*) copylist);
      cudaThreadSynchronize();

      my_gettime(CLOCK_REALTIME, &time1);
      sdata->cuda_timings.comm_exchange_kernel_unpack +=
        time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

      CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed");
    }
  }

  int naccept;
  cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);

  return naccept;
}
|
||||
|
||||
/* Border-communication pack: gather the data_mask-selected fields of the
   sendlist atoms into the device buffer (as X_CFLOAT values) and download
   them to buf_send.  For triclinic boxes border comm runs in lamda
   coordinates, so the shift is the raw pbc flags rather than box lengths.
   Returns the number of packed values. */
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);

  int size = nsend * n_data_items * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // triclinic border comm: coordinates are in lamda space, shift by flags
      dx = pbc[0];
      dy = pbc[1];
      dz = pbc[2];
    }
  }

  int3 layout = getgrid(nsend);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_times time1, time2;
    my_gettime(CLOCK_REALTIME, &time1);

    Cuda_AtomVecCuda_PackBorder_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, nsend, sdata->comm.maxlistlength, iswap, dx, dy, dz);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_border_kernel_pack +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed");

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_border_download +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
  }

  return nsend * n_data_items;
}
|
||||
|
||||
/* Border-communication "self" pack: receiver is the same process, so the
   kernel copies the selected fields directly from the sendlist atoms into
   the ghost slots starting at 'first', without any host buffer.
   Returns the number of values that would have been communicated. */
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);

  int size = n * n_data_items * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // triclinic border comm: coordinates are in lamda space, shift by flags
      dx = pbc[0];
      dy = pbc[1];
      dz = pbc[2];
    }
  }

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_times time1, time2;
    my_gettime(CLOCK_REALTIME, &time1);

    Cuda_AtomVecCuda_PackBorder_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_border_kernel_self +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed");
  }

  return n * n_data_items;
}
|
||||
|
||||
|
||||
/* Border-communication unpack: upload buf_recv to the device buffer and
   scatter the values into ghost slots starting at 'first'.  The kernel sets
   the device flag if the ghost range exceeds nmax; the flag is copied back
   into comm.grow_flag so the caller can trigger a reallocation.
   Returns comm.grow_flag (non-zero means arrays must grow). */
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);

  int size = n * n_data_items * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_times time1, time2;
    my_gettime(CLOCK_REALTIME, &time1);

    // clear the grow flag, then upload the received records
    cudaMemset((int*)(sdata->flag), 0, sizeof(int));
    cudaMemcpy(sdata->buffer, (void*)buf_recv, size, cudaMemcpyHostToDevice);

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_border_upload +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    Cuda_AtomVecCuda_UnpackBorder_Kernel<data_mask> <<< grid, threads, 0>>>(n, first);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_border_kernel_unpack +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

    cudaMemcpy(&sdata->comm.grow_flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);

    CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed");
  }

  return sdata->comm.grow_flag;
}
|
||||
|
||||
|
||||
#include "atom_vec_angle_cuda.cu"
|
||||
#include "atom_vec_atomic_cuda.cu"
|
||||
#include "atom_vec_charge_cuda.cu"
|
||||
#include "atom_vec_full_cuda.cu"
|
||||
//#include "atom_vec_granular_cuda.cu"
|
|
@ -1,512 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
#define RIMLARGER 1.000001
|
||||
#define RIMSMALLER 0.999999
|
||||
#define SMALL 1e-5
|
||||
|
||||
extern __shared__ int shared[];
|
||||
|
||||
/* Device kernel: one thread per sendlist entry packs the data_mask-selected
   fields of atom j = list[i] into 'buffer', laid out field-major
   (buffer[i + k*n] holds field k of list entry i).  Positions get the
   periodic shift (dx,dy,dz).
   Fix vs. original: the sanity check was 'j > _nmax', which failed to flag
   j == _nmax — already one past the last valid array index. */
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    // raise device flag on an out-of-range sendlist entry (valid j: 0.._nmax-1)
    if(j >= _nmax) _flag[0] = 1;

    int k = 0;

    if(data_mask & X_MASK) {
      ((X_CFLOAT*) buffer)[i + k * n] = _x[j] + dx;
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy;
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz;
      k++;
    }

    if(data_mask & V_MASK) {
      ((X_CFLOAT*) buffer)[i + k * n] = _v[j];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _v[j + _nmax];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax];
      k++;
    }

    if(data_mask & OMEGA_MASK) {
      ((X_CFLOAT*) buffer)[i + k * n] = _omega[j];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _omega[j + _nmax];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax];
      k++;
    }

    /* NOTE(review): the two k++ below run even when their mask bit is unset.
       The unpack kernel advances k the same unconditional way, so pack and
       unpack stay in sync; do not make these conditional on one side only. */
    if(data_mask & RADIUS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _radius[j];

    k++;

    if(data_mask & RMASS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _rmass[j];

    k++;
  }
}
|
||||
|
||||
/* Device kernel: forward-comm copy within the same process.  One thread per
   sendlist entry copies the selected fields of atom j = list[i] straight
   into ghost slot i + first; positions get the periodic shift (dx,dy,dz).
   Fix vs. original: removed a dead store ('int j = i;' immediately
   overwritten by 'j = list[i];'). */
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    if(data_mask & X_MASK) {
      _x[i + first] = _x[j] + dx;
      _x[i + first + _nmax] = _x[j + _nmax] + dy;
      _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
    }

    if(data_mask & V_MASK) {
      _v[i + first] = _v[j];
      _v[i + first + _nmax] = _v[j + _nmax];
      _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
    }

    if(data_mask & OMEGA_MASK) {
      _omega[i + first] = _omega[j];
      _omega[i + first + _nmax] = _omega[j + _nmax];
      _omega[i + first + 2 * _nmax] = _omega[j + 2 * _nmax];
    }

    if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j];

    if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j];
  }
}
|
||||
|
||||
|
||||
/* Device kernel: forward-comm unpack.  One thread per received record
   scatters field k from buffer[i + k*n] into ghost slot i + first.
   The field order and the unconditional k++ after RADIUS/RMASS must stay
   exactly mirrored with Cuda_AtomVecCuda_PackComm_Kernel. */
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n, int first, void* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < n) {
    int k = 0;

    if(data_mask & X_MASK) {
      _x[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
    }

    if(data_mask & V_MASK) {
      _v[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _v[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _v[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
    }

    if(data_mask & OMEGA_MASK) {
      _omega[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _omega[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _omega[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
    }

    // the two k++ below are unconditional on purpose: they mirror the pack kernel
    if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) buffer)[i + k * n];

    k++;

    if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) buffer)[i + k * n];

    k++;
  }
}
|
||||
|
||||
|
||||
/* Device kernel: find atoms leaving the sub-domain along 'dim'.  Each thread
   tests one local atom; thread 0 of each block builds a 1-based prefix count
   in shared memory and reserves a global range via atomicAdd on the int
   counter stored at the start of _buffer.  Selected atom indices are written
   as doubles starting at _buffer slot 1; entries beyond n are dropped (the
   host detects this via the returned count and retries with a bigger buffer).
   Requires (blockDim.x + 1) ints of dynamic shared memory. */
__global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n, int dim)
{
  double* buf = (double*) _buffer;
  buf = &buf[1];

  //X_CFLOAT lo=slablo[iswap];
  //X_CFLOAT hi=slabhi[iswap];

  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  bool add = false;

  if(i < _nlocal) {
    double xdim_tmp = static_cast <double>(_x[i + dim * _nmax]);

    // atom leaves if its coordinate falls outside [sublo, subhi) in this dim
    if(xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim]) {
      add = true;
    }
  }

  shared[threadIdx.x] = add ? 1 : 0;
  __syncthreads();
  int nsend = 0;

  if(threadIdx.x == 0) {
    // serial in-block scan: shared[k] becomes the 1-based rank of leaver k
    for(int k = 0; k < blockDim.x; k++) {
      if(shared[k]) {
        nsend++;
        shared[k] = nsend;
      }
    }

    // reserve this block's slice of the output list; old total -> shared[blockDim.x]
    shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
  }

  __syncthreads();

  nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

  if(add && nsend + 1 < n)
    buf[nsend] = i;
}
|
||||
|
||||
/* Device kernel: pack leaving atom records and compact the local arrays.
   Record k starts at _buffer slot 1+k; its slot 0 already holds the leaving
   atom's index i (written by PackExchangeList).  Fields are stored strided
   by nsend as doubles.  After packing, atom j = copylist[k] (a surviving
   atom from the tail) is copied into slot i, closing the hole — but only if
   i < nlocal (holes at/after the new nlocal need no fill). */
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist)
{
  double* buf = (double*) _buffer;
  int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(k >= nsend) return;

  buf = &buf[1 + k];

  int i = static_cast <int>(buf[0]);
  int j = copylist[k];

  // m starts at 1: slot 0 of the record keeps the atom index
  int m = 1;

  if(data_mask & X_MASK) {
    buf[(m++)*nsend] = static_cast <double>(_x[i]);
    buf[(m++)*nsend] = static_cast <double>(_x[i + _nmax]);
    buf[(m++)*nsend] = static_cast <double>(_x[i + 2 * _nmax]);
  }

  if(data_mask & V_MASK) {
    buf[(m++)*nsend] = _v[i];
    buf[(m++)*nsend] = _v[i + _nmax];
    buf[(m++)*nsend] = _v[i + 2 * _nmax];
  }

  if(data_mask & TAG_MASK) buf[(m++)*nsend] = _tag[i];

  if(data_mask & TYPE_MASK) buf[(m++)*nsend] = _type[i];

  if(data_mask & MASK_MASK) buf[(m++)*nsend] = _mask[i];

  if(data_mask & IMAGE_MASK) buf[(m++)*nsend] = _image[i];

  if(data_mask & Q_MASK) buf[(m++)*nsend] = _q[i];

  if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i];

  if(data_mask & RADIUS_MASK) buf[(m++)*nsend] = _radius[i];

  if(data_mask & DENSITY_MASK) buf[(m++)*nsend] = _density[i];

  if(data_mask & RMASS_MASK) buf[(m++)*nsend] = _rmass[i];

  if(data_mask & OMEGA_MASK) {
    buf[(m++)*nsend] = _omega[i];
    buf[(m++)*nsend] = _omega[i + _nmax];
    buf[(m++)*nsend] = _omega[i + 2 * _nmax];
  }

  /*  if(data_mask & NSPECIAL_MASK)
    {
      buf[(m++)*nsend] = _nspecial[i];
      buf[(m++)*nsend] = _nspecial[i+_nmax];
      buf[(m++)*nsend] = _nspecial[i+2* _nmax];
    }*/

  // compaction: fill the vacated slot i from surviving atom j
  if(i >= _nlocal) return;

  if(data_mask & X_MASK) {
    _x[i] = _x[j];
    _x[i + _nmax] = _x[j + _nmax];
    _x[i + 2 * _nmax] = _x[j + 2 * _nmax];
  }

  if(data_mask & V_MASK) {
    _v[i] = _v[j];
    _v[i + _nmax] = _v[j + _nmax];
    _v[i + 2 * _nmax] = _v[j + 2 * _nmax];
  }

  if(data_mask & TAG_MASK) _tag[i] = _tag[j];

  if(data_mask & TYPE_MASK) _type[i] = _type[j];

  if(data_mask & MASK_MASK) _mask[i] = _mask[j];

  if(data_mask & IMAGE_MASK) _image[i] = _image[j];

  if(data_mask & Q_MASK) _q[i] = _q[j];

  if(data_mask & MOLECULE_MASK) _molecule[i] = _molecule[j];

  if(data_mask & RADIUS_MASK) _radius[i] = _radius[j];

  if(data_mask & DENSITY_MASK) _density[i] = _density[j];

  if(data_mask & RMASS_MASK) _rmass[i] = _rmass[j];

  if(data_mask & OMEGA_MASK) {
    _omega[i] = _omega[j];
    _omega[i + _nmax] = _omega[j + _nmax];
    _omega[i + 2 * _nmax] = _omega[j + 2 * _nmax];
  }

  /*  if(data_mask & NSPECIAL_MASK)
    {
      _nspecial[i] = _nspecial[j];
      _nspecial[i+_nmax] = _nspecial[j+_nmax];
      _nspecial[i+2* _nmax] = _nspecial[j+2* _nmax];
    }*/
}
|
||||
|
||||
/* Device kernel: unpack received exchange records.  Thread k inspects record
   k's coordinate in dimension 'dim' (stored at field slot 1+dim); if it lies
   within this sub-domain (widened by SMALL), the atom is accepted into slot
   nlocal + atomicAdd(_flag, 1) and all selected fields are scattered there.
   copylist[k] records the destination index, or -1 if the atom was rejected.
   NOTE(review): no check that the destination index stays below _nmax —
   presumably the host guarantees enough headroom; confirm before reuse. */
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim, int nsend, int* copylist)
{
  double* buf = (double*) _buffer;
  int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(k >= nsend) return;

  buf = &buf[1 + k];
  int i = -1;
  double xdim_tmp = buf[(1 + dim) * nsend];

  if(xdim_tmp >= _sublo[dim] - SMALL && xdim_tmp < _subhi[dim] + SMALL) {
    // reserve the next free slot after nlocal
    i = atomicAdd(_flag, 1) + _nlocal;

    // m starts at 1: slot 0 of the record carries the atom index
    int m = 1;

    if(data_mask & X_MASK) {
      _x[i] = buf[(m++) * nsend];
      _x[i + _nmax] = buf[(m++) * nsend];
      _x[i + 2 * _nmax] = buf[(m++) * nsend];
    }

    if(data_mask & V_MASK) {
      _v[i] = buf[(m++) * nsend];
      _v[i + _nmax] = buf[(m++) * nsend];
      _v[i + 2 * _nmax] = buf[(m++) * nsend];
    }

    if(data_mask & TAG_MASK) _tag[i] = buf[(m++) * nsend];

    if(data_mask & TYPE_MASK) _type[i] = buf[(m++) * nsend];

    if(data_mask & MASK_MASK) _mask[i] = buf[(m++) * nsend];

    if(data_mask & IMAGE_MASK) _image[i] = buf[(m++) * nsend];

    if(data_mask & Q_MASK) _q[i] = buf[(m++) * nsend];

    if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++) * nsend];

    if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++) * nsend];

    if(data_mask & DENSITY_MASK) _density[i] = buf[(m++) * nsend];

    if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++) * nsend];

    if(data_mask & OMEGA_MASK) {
      _omega[i] = buf[(m++) * nsend];
      _omega[i + _nmax] = buf[(m++) * nsend];
      _omega[i + 2 * _nmax] = buf[(m++) * nsend];
    }

    /*  if(data_mask & NSPECIAL_MASK)
      {
        _nspecial[i] = buf[(m++)*nsend];
        _nspecial[i+_nmax] = buf[(m++)*nsend];
        _nspecial[i+2*_nmax] = buf[(m++)*nsend];
      }*/
  }

  copylist[k] = i;
}
|
||||
|
||||
/* Device kernel: border-comm pack.  One thread per sendlist entry writes the
   selected fields of atom j = list[i] into _buffer, field-major
   (_buffer[i + m*n]).  Positions get the shift (dx,dy,dz); integer fields
   (tag/type/mask) are stored as X_CFLOAT, which loses precision for values
   beyond the float mantissa — presumably acceptable here, but verify for
   large tag counts in single precision.
   Fix vs. original: radius, density, rmass and omega were read with the
   thread index i instead of the sendlist atom index j, packing the wrong
   atom's data (every other field in this kernel, and the PackComm kernel,
   use j). */
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];
    int m = 0;

    if(data_mask & X_MASK) {
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx;
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy;
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz;
    }

    if(data_mask & V_MASK) {
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax];
    }

    if(data_mask & TAG_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _tag[j];

    if(data_mask & TYPE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _type[j];

    if(data_mask & MASK_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _mask[j];

    if(data_mask & Q_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _q[j];

    if(data_mask & MOLECULE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _molecule[j];

    if(data_mask & RADIUS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _radius[j];

    if(data_mask & DENSITY_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _density[j];

    if(data_mask & RMASS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _rmass[j];

    if(data_mask & OMEGA_MASK) {
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[j];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[j + _nmax];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[j + 2 * _nmax];
    }
  }
}
|
||||
|
||||
|
||||
|
||||
template <const unsigned int data_mask>
|
||||
__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
|
||||
if(i < n) {
|
||||
int j = list[i];
|
||||
|
||||
if(data_mask & X_MASK) {
|
||||
_x[i + first] = _x[j] + dx;
|
||||
_x[i + first + _nmax] = _x[j + _nmax] + dy;
|
||||
_x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
|
||||
}
|
||||
|
||||
if(data_mask & V_MASK) {
|
||||
_v[i + first] = _v[j];
|
||||
_v[i + first + _nmax] = _v[j + _nmax];
|
||||
_v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
|
||||
}
|
||||
|
||||
if(data_mask & TAG_MASK) _tag[i + first] = _tag[j];
|
||||
|
||||
if(data_mask & TYPE_MASK) _type[i + first] = _type[j];
|
||||
|
||||
if(data_mask & MASK_MASK) _mask[i + first] = _mask[j];
|
||||
|
||||
if(data_mask & Q_MASK) _q[i + first] = _q[j];
|
||||
|
||||
if(data_mask & MOLECULE_MASK) _molecule[i + first] = _molecule[j];
|
||||
|
||||
if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j];
|
||||
|
||||
if(data_mask & DENSITY_MASK) _density[i + first] = _density[j];
|
||||
|
||||
if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j];
|
||||
|
||||
if(data_mask & OMEGA_MASK) {
|
||||
_omega[i + first] = _omega[j];
|
||||
_omega[i + first + _nmax] = _omega[j + _nmax];
|
||||
_omega[i + first + 2 * _nmax] = _omega[j + 2 * _nmax];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <const unsigned int data_mask>
|
||||
__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n, int first)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < n) {
|
||||
if(i + first < _nmax) {
|
||||
int m = 0;
|
||||
|
||||
if(data_mask & X_MASK) {
|
||||
_x[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_x[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_x[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
}
|
||||
|
||||
if(data_mask & V_MASK) {
|
||||
_v[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_v[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_v[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
}
|
||||
|
||||
if(data_mask & TAG_MASK) _tag[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
|
||||
|
||||
if(data_mask & TYPE_MASK) _type[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
|
||||
|
||||
if(data_mask & MASK_MASK) _mask[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
|
||||
|
||||
if(data_mask & Q_MASK) _q[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
|
||||
if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
|
||||
|
||||
if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
|
||||
if(data_mask & DENSITY_MASK) _density[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
|
||||
if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
|
||||
if(data_mask & OMEGA_MASK) {
|
||||
_omega[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_omega[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_omega[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
}
|
||||
} else {
|
||||
_flag[0] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
const unsigned int FULL_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
|
||||
|
||||
#include "atom_vec_full_cuda_cu.h"
|
||||
|
||||
void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
return Cuda_AtomVecCuda_Init<FULL_DATA_MASK>(sdata);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
|
||||
}
|
||||
|
||||
int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
|
||||
{
|
||||
const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
|
||||
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
#ifndef ATOM_VEC_FULL_CUDA_CU_H_
|
||||
#define ATOM_VEC_FULL_CUDA_CU_H_
|
||||
|
||||
extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
|
||||
extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
|
||||
extern "C" int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
|
||||
extern "C" int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
|
||||
extern "C" int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
|
||||
|
||||
#endif /*ATOM_VEC_FULL2_CUDA_CU_H_*/
|
|
@ -1,539 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX comm_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "comm_cuda_cu.h"
|
||||
#include "comm_cuda_kernel.cu"
|
||||
#include <ctime>
|
||||
|
||||
void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata, int n)
|
||||
{
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
|
||||
sdata->buffer = CudaWrapper_AllocCudaData(size);
|
||||
sdata->buffersize = size;
|
||||
sdata->buffer_new++;
|
||||
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
|
||||
}
|
||||
|
||||
|
||||
void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
}
|
||||
|
||||
|
||||
void Cuda_CommCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
int ntypesp = sdata->atom.ntypes + 1;
|
||||
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &ntypesp, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata, sizeof(int*));
|
||||
}
|
||||
|
||||
int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
|
||||
my_times time1, time2;
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
dx = pbc[0] * sdata->domain.prd[0];
|
||||
dy = pbc[1] * sdata->domain.prd[1];
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
} else {
|
||||
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
|
||||
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
}
|
||||
}
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
cudaMemset(sdata->flag, 0, sizeof(int));
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
|
||||
Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
|
||||
, sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_forward_kernel_pack +=
|
||||
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
|
||||
|
||||
if(not sdata->overlap_comm)
|
||||
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
sdata->cuda_timings.comm_forward_download +=
|
||||
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
|
||||
|
||||
int aflag;
|
||||
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
|
||||
|
||||
}
|
||||
|
||||
return 3 * n;
|
||||
}
|
||||
|
||||
int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
|
||||
my_times time1, time2;
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 6 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
dx = pbc[0] * sdata->domain.prd[0];
|
||||
dy = pbc[1] * sdata->domain.prd[1];
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
} else {
|
||||
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
|
||||
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
}
|
||||
}
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
cudaMemset(sdata->flag, 0, sizeof(int));
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
|
||||
Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
|
||||
, sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_forward_kernel_pack +=
|
||||
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
|
||||
|
||||
if(not sdata->overlap_comm)
|
||||
cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
sdata->cuda_timings.comm_forward_download +=
|
||||
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
|
||||
|
||||
int aflag;
|
||||
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
|
||||
|
||||
}
|
||||
|
||||
return 6 * n;
|
||||
}
|
||||
|
||||
int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
|
||||
my_times time1, time2;
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
static int count = -1;
|
||||
count++;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
dx = pbc[0] * sdata->domain.prd[0];
|
||||
dy = pbc[1] * sdata->domain.prd[1];
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
} else {
|
||||
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
|
||||
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_forward_kernel_self +=
|
||||
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
|
||||
}
|
||||
|
||||
return 3 * n;
|
||||
}
|
||||
|
||||
int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
|
||||
my_times time1, time2;
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 6 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
static int count = -1;
|
||||
count++;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
dx = pbc[0] * sdata->domain.prd[0];
|
||||
dy = pbc[1] * sdata->domain.prd[1];
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
} else {
|
||||
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
|
||||
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_forward_kernel_self +=
|
||||
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
|
||||
}
|
||||
|
||||
return 6 * n;
|
||||
}
|
||||
|
||||
void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
|
||||
{
|
||||
my_times time1, time2;
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
if(not sdata->overlap_comm || iswap < 0)
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_forward_upload +=
|
||||
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
|
||||
void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
|
||||
Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
sdata->cuda_timings.comm_forward_kernel_unpack +=
|
||||
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
|
||||
{
|
||||
my_times time1, time2;
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 6 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
if(not sdata->overlap_comm || iswap < 0)
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_forward_upload +=
|
||||
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
|
||||
void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
|
||||
Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
sdata->cuda_timings.comm_forward_kernel_unpack +=
|
||||
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
|
||||
F_CFLOAT* buf = (F_CFLOAT*)buf_send;
|
||||
F_CFLOAT* f_dev = (F_CFLOAT*)sdata->atom.f.dev_data;
|
||||
f_dev += first;
|
||||
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
buf += n;
|
||||
f_dev += sdata->atom.nmax;
|
||||
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
buf += n;
|
||||
f_dev += sdata->atom.nmax;
|
||||
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
return n * 3;
|
||||
}
|
||||
|
||||
|
||||
void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
cudaMemcpy(sdata->buffer, buf_recv, size, cudaMemcpyHostToDevice);
|
||||
Cuda_CommCuda_UnpackReverse_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
|
||||
}
|
||||
}
|
||||
|
||||
void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
Cuda_CommCuda_UnpackReverse_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, first);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap)
|
||||
{
|
||||
MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
|
||||
my_times time1, time2;
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
if(sdata->buffer_new or (80 > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, 10);
|
||||
|
||||
int n;
|
||||
|
||||
if(!bordergroup || ineed >= 2)
|
||||
n = nlast - nfirst + 1;
|
||||
else {
|
||||
n = atom_nfirst;
|
||||
|
||||
if(nlast - sdata->atom.nlocal + 1 > n) n = nlast - sdata->atom.nlocal + 1;
|
||||
}
|
||||
|
||||
int3 layout = getgrid(n, 0, 512, true);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x + 1, layout.y, 1);
|
||||
|
||||
|
||||
cudaMemset((int*)(sdata->buffer), 0, sizeof(int));
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
if(style == 1)
|
||||
Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.slablo.dev_data, (X_CFLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
|
||||
else
|
||||
Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.multilo.dev_data, (X_CFLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
|
||||
|
||||
cudaThreadSynchronize();
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_border_kernel_buildlist +=
|
||||
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed");
|
||||
int nsend;
|
||||
cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
return nsend;
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
|
||||
extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
|
||||
extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
|
||||
extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
|
||||
extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
|
||||
extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
|
||||
extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send);
|
||||
extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv);
|
||||
extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first);
|
||||
extern "C" int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap);
|
|
@ -1,394 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* Pack ghost-atom coordinates for forward communication into a send buffer.
   One thread per list entry; the periodic-boundary shift (dx,dy,dz) is added
   on the device. Buffer layout is struct-of-arrays: [0,n)=x, [n,2n)=y, [2n,3n)=z. */
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    // Valid atom indices are 0 .. _nmax-1, so j == _nmax is already out of
    // range; BUGFIX: the original test `j > _nmax` missed that boundary case.
    if(j >= _nmax) _flag[0] = 1;

    ((X_CFLOAT*) buffer)[i]         = _x[j] + dx;
    ((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
    ((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
  }
}
|
||||
|
||||
/* Pack ghost-atom coordinates AND velocities for forward communication.
   Buffer layout (struct-of-arrays): [0,3n) = x/y/z with PBC shift applied,
   [3n,6n) = vx/vy/vz copied verbatim. */
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    // Valid atom indices are 0 .. _nmax-1; BUGFIX: flag j == _nmax as well
    // (the original `j > _nmax` let that out-of-range index pass unflagged).
    if(j >= _nmax) _flag[0] = 1;

    ((X_CFLOAT*) buffer)[i]         = _x[j] + dx;
    ((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
    ((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
    ((X_CFLOAT*) buffer)[i + 3 * n] = _v[j];
    ((X_CFLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
    ((X_CFLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
  }
}
|
||||
|
||||
/* "Self" forward communication (sender == receiver): copy listed atoms'
   coordinates directly within _x to the ghost region starting at `first`,
   applying the periodic shift (dx,dy,dz). No intermediate buffer is used. */
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  const int* list = sendlist + iswap * maxlistlength;

  if(tid >= n) return;

  const int src = list[tid];
  const int dst = tid + first;

  _x[dst]             = _x[src] + dx;
  _x[dst + _nmax]     = _x[src + _nmax] + dy;
  _x[dst + 2 * _nmax] = _x[src + 2 * _nmax] + dz;
}
|
||||
|
||||
/* "Self" forward communication including velocities: copy listed atoms'
   positions (with PBC shift) and velocities (verbatim) within the _x/_v
   arrays to the ghost region starting at `first`. */
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  const int* list = sendlist + iswap * maxlistlength;

  if(tid >= n) return;

  const int src = list[tid];
  const int dst = tid + first;

  _x[dst]             = _x[src] + dx;
  _x[dst + _nmax]     = _x[src + _nmax] + dy;
  _x[dst + 2 * _nmax] = _x[src + 2 * _nmax] + dz;
  _v[dst]             = _v[src];
  _v[dst + _nmax]     = _v[src + _nmax];
  _v[dst + 2 * _nmax] = _v[src + 2 * _nmax];
}
|
||||
|
||||
/* Unpack received ghost-atom coordinates from `buffer` (struct-of-arrays
   layout: x, then y, then z) into _x starting at index `first`. */
__global__ void Cuda_CommCuda_UnpackComm_Kernel(int n, int first, void* buffer)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(tid >= n) return;

  const X_CFLOAT* buf = (const X_CFLOAT*) buffer;
  const int dst = tid + first;

  _x[dst]             = buf[tid];
  _x[dst + _nmax]     = buf[tid + n];
  _x[dst + 2 * _nmax] = buf[tid + 2 * n];
}
|
||||
|
||||
|
||||
/* Unpack received ghost-atom coordinates and velocities from `buffer`
   (struct-of-arrays: x,y,z then vx,vy,vz) into _x/_v starting at `first`. */
__global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n, int first, void* buffer)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(tid >= n) return;

  const X_CFLOAT* buf = (const X_CFLOAT*) buffer;
  const int dst = tid + first;

  _x[dst]             = buf[tid];
  _x[dst + _nmax]     = buf[tid + n];
  _x[dst + 2 * _nmax] = buf[tid + 2 * n];
  _v[dst]             = buf[tid + 3 * n];
  _v[dst + _nmax]     = buf[tid + 4 * n];
  _v[dst + 2 * _nmax] = buf[tid + 5 * n];
}
|
||||
|
||||
/* Pack ghost-atom forces (reverse communication) from _f starting at
   `first` into the global _buffer (struct-of-arrays: fx, fy, fz). */
__global__ void Cuda_CommCuda_PackReverse_Kernel(int n, int first)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(tid >= n) return;

  F_CFLOAT* buf = (F_CFLOAT*) _buffer;
  const int src = tid + first;

  buf[tid]         = _f[src];
  buf[tid + n]     = _f[src + _nmax];
  buf[tid + 2 * n] = _f[src + 2 * _nmax];
}
|
||||
|
||||
/* Unpack reverse-communicated forces: accumulate the received per-ghost
   force contributions from _buffer onto the owning atoms named by the
   iswap send list. */
__global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist, int n, int maxlistlength, int iswap)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  const int* list = sendlist + iswap * maxlistlength;

  if(tid >= n) return;

  const int j = list[tid];
  const F_CFLOAT* buf = (const F_CFLOAT*) _buffer;

  _f[j]             += buf[tid];
  _f[j + _nmax]     += buf[tid + n];
  _f[j + 2 * _nmax] += buf[tid + 2 * n];
}
|
||||
|
||||
/* "Self" reverse communication: accumulate ghost forces stored at
   [first, first+n) in _f directly onto the owning atoms named by the
   iswap send list — no intermediate buffer. */
__global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, int first)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  const int* list = sendlist + iswap * maxlistlength;

  if(tid >= n) return;

  const int j   = list[tid];
  const int src = tid + first;

  _f[j]             += _f[src];
  _f[j + _nmax]     += _f[src + _nmax];
  _f[j + 2 * _nmax] += _f[src + 2 * _nmax];
}
|
||||
|
||||
extern __shared__ int shared[];
|
||||
|
||||
/* Build the send list for swap `iswap` using a single slab [lo,hi] per swap.
   Each thread tests one atom's coordinate along `dim`; a per-block prefix
   count (serial scan by thread 0 over the extern shared array `shared`) plus
   an atomicAdd on the global counter at _buffer assigns each accepted atom a
   unique slot in `list`. The total accepted count accumulates in *(int*)_buffer,
   which the host wrapper reads back.
   Branches: without bordergroup (or ineed >= 2) a single pass over
   [nfirst,nlast); with bordergroup, one pass over the first-group atoms
   [0,atom_nfirst) and a second pass over ghosts, shifting i by _nlocal. */
__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, int atom_nfirst,
    int nfirst, int nlast, int dim, int iswap, X_CFLOAT* slablo, X_CFLOAT* slabhi, int* sendlist, int maxlistlength)
{
  int* list = sendlist + iswap * maxlistlength;
  X_CFLOAT lo = slablo[iswap];
  X_CFLOAT hi = slabhi[iswap];
  bool add = false;

  if(!bordergroup || ineed >= 2) {
    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;

    if(i < nlast)
      if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
        add = true;
      }

    // Mark acceptance, then thread 0 turns the marks into 1-based ranks.
    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    int nsend = 0;

    if(threadIdx.x == 0) {
      for(int k = 0; k < blockDim.x; k++) {
        if(shared[k]) {
          nsend++;
          shared[k] = nsend;
        }
      }

      // Reserve a contiguous range of list slots; the old global count
      // (this block's base offset) is stashed at shared[blockDim.x].
      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    // base offset + (rank - 1) = this thread's slot; only valid when add.
    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if(add && nsend < maxlistlength)
      list[nsend] = i;   // silently dropped when the list is full

  } else {

    // Pass 1: atoms of the border group, indices [0, atom_nfirst).
    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

    if(i < atom_nfirst)
      if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
        add = true;
      }

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    int nsend = 0;

    if(threadIdx.x == 0) {
      for(int k = 0; k < blockDim.x; k++) {
        if(shared[k]) {
          nsend++;
          shared[k] = nsend;
        }
      }

      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if(add && nsend < maxlistlength)
      list[nsend] = i;

    // Barrier before reusing `shared` for the second pass.
    __syncthreads();

    // Pass 2: ghost atoms — same test with i shifted past the local atoms.
    add = false;
    i += _nlocal;

    if(i < nlast)
      if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
        add = true;
      }

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    nsend = 0;

    if(threadIdx.x == 0) {
      for(int k = 0; k < blockDim.x; k++) {
        if(shared[k]) {
          nsend++;
          shared[k] = nsend;
        }
      }

      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if(add && nsend < maxlistlength)
      list[nsend] = i;

  }
}
|
||||
|
||||
|
||||
/* Build the send list for swap `iswap` with per-type cutoffs: the slab
   bounds come from multilo/multihi indexed by (iswap, atom type) instead of
   a single [lo,hi] per swap. Otherwise identical in structure to
   Cuda_CommCuda_BuildSendlist_Single: per-block serial rank scan in the
   extern shared array `shared`, block base offset reserved via atomicAdd on
   the global counter at _buffer, accepted atoms written into `list`. */
__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup, int ineed, int atom_nfirst
    , int nfirst, int nlast, int dim, int iswap, X_CFLOAT* multilo, X_CFLOAT* multihi, int* sendlist, int maxlistlength)
{
  int* list = sendlist + iswap * maxlistlength;
  // Per-type bounds for this swap.
  X_CFLOAT* mlo = &multilo[iswap * _cuda_ntypes];
  X_CFLOAT* mhi = &multihi[iswap * _cuda_ntypes];
  int itype = 0;
  bool add = false;

  if(!bordergroup || ineed >= 2) {
    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;

    if(i < nlast) {
      itype = _type[i];

      if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
        add = true;
      }
    }

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    int nsend = 0;

    // Thread 0 converts the 0/1 marks to 1-based ranks and reserves the
    // block's slot range in the global counter.
    if(threadIdx.x == 0) {
      for(int k = 0; k < blockDim.x; k++) {
        if(shared[k]) {
          nsend++;
          shared[k] = nsend;
        }
      }

      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if(add && nsend < maxlistlength)
      list[nsend] = i;   // silently dropped when the list is full

  } else {

    // Pass 1: border-group atoms [0, atom_nfirst).
    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

    if(i < atom_nfirst) {
      itype = _type[i];

      if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
        add = true;
      }
    }

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    int nsend = 0;

    if(threadIdx.x == 0) {
      for(int k = 0; k < blockDim.x; k++) {
        if(shared[k]) {
          nsend++;
          shared[k] = nsend;
        }
      }

      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if(add && nsend < maxlistlength)
      list[nsend] = i;

    // Barrier before reusing `shared` for the second pass.
    __syncthreads();

    // Pass 2: ghost atoms — same test with i shifted past the local atoms.
    add = false;
    i += _nlocal;

    if(i < nlast) {
      itype = _type[i];

      if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
        add = true;
      }
    }

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    nsend = 0;

    if(threadIdx.x == 0) {
      for(int k = 0; k < blockDim.x; k++) {
        if(shared[k]) {
          nsend++;
          shared[k] = nsend;
        }
      }

      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if(add && nsend < maxlistlength)
      list[nsend] = i;

  }
}
|
|
@ -1,126 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX compute_temp_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "compute_temp_cuda_cu.h"
|
||||
#include "compute_temp_cuda_kernel.cu"
|
||||
|
||||
/* Grow the shared device scratch buffer, if needed, to hold the per-block
   reduction output of the temperature kernels (6 ENERGY_CFLOAT values per
   64-atom chunk) and publish its address to this unit's `buffer` symbol. */
void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  const int required = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);

  if(sdata->buffersize < required) {
    MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(required);
    sdata->buffersize = required;
    sdata->buffer_new++;   // signal downstream users that the pointer changed
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
|
||||
|
||||
/* Refresh this translation unit's device constant symbols (via MY_AP) from
   the current host-side shared data: per-atom array pointers and counts.
   Must be called whenever the atom arrays may have been reallocated. */
void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)       , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mass)       , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));

  // Per-atom masses only exist when the atom style provides them.
  if(sdata->atom.rmass_flag)
    cudaMemcpyToSymbol(MY_AP(rmass)    , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));

  cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)       , & sdata->atom.nmax   , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(v)          , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(type)       , & sdata->atom.type .dev_data, sizeof(int*));
}
|
||||
|
||||
/* One-time setup: push the current device pointers/counts into this
   translation unit's constant symbols. */
void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata)
{
  Cuda_ComputeTempCuda_UpdateNmax(sdata);
}
|
||||
|
||||
|
||||
/* Host wrapper: compute the 6-component kinetic-energy tensor of the atoms
   in `groupbit` into t[0..5]. Launches the per-atom kernel (partial sums per
   block into the scratch buffer) followed by a 6-block reduction kernel. */
void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
{
  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
  Cuda_ComputeTempCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  //if(sdata->buffer_new)
  Cuda_ComputeTempCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    // Stage 1: per-block partial sums of the 6 tensor components.
    Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed");

    // Stage 2: one block per tensor component folds the partial sums into t.
    int oldgrid = grid.x * grid.y;
    grid.x = 6;
    grid.y = 1;
    threads.x = 512;
    Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed");
  }
}
|
||||
|
||||
/* Host wrapper: compute the scalar kinetic-energy sum (sum of m*v^2) for
   the atoms in `groupbit` into *t via a two-stage block reduction. */
void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
{
  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
  Cuda_ComputeTempCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  //if(sdata->buffer_new)
  Cuda_ComputeTempCuda_UpdateBuffer(sdata);
  MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n", sdata->atom.nlocal);)
  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel");
    // Stage 1: per-block partial sums into the scratch buffer.
    Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed");

    // Stage 2: a single block folds all partial sums into t[0].
    int oldgrid = grid.x * grid.y;
    grid.x = 1;
    grid.y = 1;
    threads.x = 512;
    Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed");
  }
}
|
|
@ -1,28 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);
|
||||
extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);
|
|
@ -1,118 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ ENERGY_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
/* Stage-1 scalar temperature kernel: each thread computes m*|v|^2 for one
   atom in `groupbit` (per-atom rmass if present, else per-type mass), the
   block reduces into sharedmem, and thread 0 writes the block sum to the
   global scratch buffer (one slot per block). */
__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  sharedmem[threadIdx.x] = 0;   // atoms outside the group contribute zero

  if(i < _nlocal) {
    if(_rmass_flag) {
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * _rmass[i];
    } else {
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * (_mass[_type[i]]);
    }
  }

  reduceBlock(sharedmem);
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
  }
}
|
||||
|
||||
/* Stage-1 tensor temperature kernel: per atom in `groupbit`, compute the six
   components m*vx*vx, m*vy*vy, m*vz*vz, m*vx*vy, m*vx*vz, m*vy*vz into six
   stripes of sharedmem (blockDim.x elements each), reduce each stripe, and
   have thread 0 write the six block sums to six stripes of the scratch
   buffer (gridDim.x*gridDim.y slots per component). */
__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // Zero all six shared-memory stripes for this thread.
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 4 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 5 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      V_CFLOAT massone;

      if(_rmass_flag) massone = _rmass[i];
      else massone = _mass[_type[i]];

      sharedmem[threadIdx.x]                  = massone * _v[i] * _v[i];
      sharedmem[threadIdx.x + blockDim.x]     = massone * _v[i + _nmax] * _v[i + _nmax];
      sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax];
      sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax];
      sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax];
      sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax];
    }

  // Reduce each component stripe independently.
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  reduceBlock(&sharedmem[4 * blockDim.x]);
  reduceBlock(&sharedmem[5 * blockDim.x]);
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    buffer[(blockIdx.x * gridDim.y + blockIdx.y)]                                = sharedmem[0];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + gridDim.x * gridDim.y]        = sharedmem[blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 2 * gridDim.x * gridDim.y]    = sharedmem[2 * blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 3 * gridDim.x * gridDim.y]    = sharedmem[3 * blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 4 * gridDim.x * gridDim.y]    = sharedmem[4 * blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 5 * gridDim.x * gridDim.y]    = sharedmem[5 * blockDim.x];
  }
}
|
||||
|
||||
|
||||
/* Stage-2 reduction: block blockIdx.x folds its stripe of n partial sums
   from the scratch buffer into t[blockIdx.x]. The stripe is consumed in
   blockDim.x-sized tiles; thread 0 accumulates the per-tile block sums. */
__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  ENERGY_CFLOAT myforig = 0.0;
  ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];   // this block's component stripe

  while(i < n) {
    sharedmem[threadIdx.x] = 0;

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);
    i += blockDim.x;

    // NOTE(review): other threads may start the next iteration and overwrite
    // sharedmem while thread 0 still reads sharedmem[0] here — safe only if
    // reduceBlock() ends with a barrier; verify against crm_cuda_utils.cu.
    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    t[blockIdx.x] = myforig;
}
|
|
@ -1,164 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX compute_temp_partial_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "compute_temp_partial_cuda_cu.h"
|
||||
#include "compute_temp_partial_cuda_kernel.cu"
|
||||
|
||||
/* Grow the shared device scratch buffer, if needed, to hold the per-block
   reduction output of the partial-temperature kernels (6 ENERGY_CFLOAT
   values per 64-atom chunk) and publish its address to the `buffer` symbol. */
void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  const int required = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);

  if(sdata->buffersize < required) {
    MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(required);
    sdata->buffersize = required;
    sdata->buffer_new++;   // signal downstream users that the pointer changed
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
|
||||
|
||||
/* Refresh this translation unit's device constant symbols (via MY_AP) from
   the current host-side shared data: per-atom array pointers and counts.
   Must be called whenever the atom arrays may have been reallocated. */
void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)       , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mass)       , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));

  // Per-atom masses only exist when the atom style provides them.
  if(sdata->atom.rmass_flag)
    cudaMemcpyToSymbol(MY_AP(rmass)    , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));

  cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)       , & sdata->atom.nmax   , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(v)          , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(type)       , & sdata->atom.type .dev_data, sizeof(int*));
}
|
||||
|
||||
/* One-time setup: push the current device pointers/counts into this
   translation unit's constant symbols. */
void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
{
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
}
|
||||
|
||||
|
||||
/* Host wrapper: compute the 6-component kinetic-energy tensor of the atoms
   in `groupbit`, restricted to the velocity components selected by
   xflag/yflag/zflag, into t[0..5]. Two-stage launch: per-block partial sums,
   then a 6-block reduction. */
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
{
  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  //if(sdata->buffer_new)
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");

    int oldgrid = grid.x * grid.y;
    grid.x = 6;
    // BUGFIX: grid.y was left at layout.y (the non-partial twin resets it),
    // launching redundant blocks that race on t[] in the reduction.
    grid.y = 1;
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
  }
}
|
||||
|
||||
/* Host wrapper: compute the scalar kinetic-energy sum for atoms in
   `groupbit`, restricted to the velocity components selected by
   xflag/yflag/zflag, into *t via a two-stage block reduction. */
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
{
  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  //if(sdata->buffer_new)
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
  MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n", sdata->atom.nlocal);)
  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");
    Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");

    int oldgrid = grid.x * grid.y;
    grid.x = 1;
    // BUGFIX: grid.y was left at layout.y (the non-partial twin resets it),
    // launching redundant blocks that race on t[0] in the reduction.
    grid.y = 1;
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
  }
}
|
||||
|
||||
/* Host wrapper: zero the deselected velocity components (per xflag/yflag/
   zflag) for atoms in `groupbit`, saving the removed values to `vbiasall`
   on the device so they can be restored later. */
void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  //if(sdata->buffer_new)
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
  }
}
|
||||
|
||||
/* Host wrapper: restore the velocity components previously saved to
   `vbiasall` by Cuda_ComputeTempPartialCuda_RemoveBiasAll. */
void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  //if(sdata->buffer_new)
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
    cudaThreadSynchronize();
    // BUGFIX: error string previously said "RemoveBiasAll" (copy-paste).
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RestoreBiasAll: compute_vector Kernel execution failed");
  }
}
|
|
@ -1,30 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);
|
|
@ -1,161 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ ENERGY_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
// Stage-1 kernel for the scalar temperature compute: each thread accumulates
// m*v^2 of one atom, restricted to the components enabled by x/y/zflag, into
// dynamic shared memory; the block reduces and thread 0 writes the per-block
// partial sum into _buffer (one slot per block). Final summation happens in
// Cuda_ComputeTempPartialCuda_Reduce_Kernel.
// Velocities are stored structure-of-arrays with stride _nmax per component.
__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xflag, int yflag, int zflag)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  sharedmem[threadIdx.x] = 0;  // zero so out-of-range/unselected threads don't pollute the reduction

  if(i < _nlocal) {
    if(_rmass_flag) {
      // per-atom masses
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * _rmass[i];
    } else {
      // per-type masses
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * (_mass[_type[i]]);
    }
  }

  reduceBlock(sharedmem);  // block-wide sum ends up in sharedmem[0]
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
  }
}
|
||||
|
||||
// Stage-1 kernel for the vector (6-component KE tensor) temperature compute:
// per atom accumulates m*vx*vx, m*vy*vy, m*vz*vz, m*vx*vy, m*vx*vz, m*vy*vz
// (each multiplied by the corresponding component flags) into six slices of
// dynamic shared memory, reduces each slice over the block, and thread 0
// writes the six per-block partial sums to six strided regions of _buffer.
// Requires 6 * blockDim.x * sizeof(ENERGY_CFLOAT) dynamic shared memory.
__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xflag, int yflag, int zflag)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // zero all six slices so inactive threads contribute nothing
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 4 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 5 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      V_CFLOAT massone;

      if(_rmass_flag) massone = _rmass[i];  // per-atom mass
      else massone = _mass[_type[i]];       // per-type mass

      sharedmem[threadIdx.x] = massone * _v[i] * _v[i] * xflag;
      sharedmem[threadIdx.x + blockDim.x] = massone * _v[i + _nmax] * _v[i + _nmax] * yflag;
      sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag;
      sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax] * xflag * yflag;
      sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax] * xflag * zflag;
      sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax] * yflag * zflag;
    }

  // reduce each of the six slices independently; result in element 0 of each slice
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  reduceBlock(&sharedmem[4 * blockDim.x]);
  reduceBlock(&sharedmem[5 * blockDim.x]);
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // component c is stored at buffer[block + c * (number of blocks)]
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = sharedmem[4 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = sharedmem[5 * blockDim.x];
  }
}
|
||||
|
||||
|
||||
// Stage-2 reduction kernel: sums n per-block partial results from _buffer into
// t[blockIdx.x]. Each block handles one contiguous slice of n values (so the
// vector variant can launch 6 blocks, one per tensor component). The slice is
// consumed in chunks of blockDim.x; thread 0 accumulates the chunk sums.
__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  ENERGY_CFLOAT myforig = 0.0;  // running total, only meaningful on thread 0
  ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];   // this block's slice of partial sums

  while(i < n) {
    sharedmem[threadIdx.x] = 0;  // re-zero: last chunk may be partial

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    t[blockIdx.x] = myforig;
}
|
||||
|
||||
// For every selected atom, save each velocity component whose flag is 0 (the
// "bias" — components excluded from the partial temperature) into vbiasall and
// zero it in _v. Components with flag 1 are left untouched. The inverse
// operation is Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel.
__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      if(!xflag) {
        vbiasall[i] = _v[i];
        _v[i] = V_F(0.0);
      }

      if(!yflag) {
        vbiasall[i + _nmax] = _v[i + _nmax];
        _v[i + _nmax] = V_F(0.0);
      }

      if(!zflag) {
        vbiasall[i + 2 * _nmax] = _v[i + 2 * _nmax];
        _v[i + 2 * _nmax] = V_F(0.0);
      }
    }
}
|
||||
|
||||
// Inverse of Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel: for every
// selected atom, add the saved bias component back onto _v for each component
// whose flag is 0. Uses += (not =) since the remove step zeroed the component;
// any velocity change made while the bias was removed is preserved.
__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      if(!xflag) {
        _v[i] += vbiasall[i];
      }

      if(!yflag) {
        _v[i + _nmax] += vbiasall[i + _nmax];
      }

      if(!zflag) {
        _v[i + 2 * _nmax] += vbiasall[i + 2 * _nmax];
      }
    }
}
|
|
@ -1,919 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef CRM_CUDA_UTILS
|
||||
#define CRM_CUDA_UTILS
|
||||
|
||||
//split n threads into 2 dimensional grid + threads, return values are grid.x grid.y and threads.x
|
||||
#define MIN(a,b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// Choose a kernel launch layout for n work items.
// Returns: .x/.y = grid dimensions, .z = threads per block.
// shared_per_thread: bytes of dynamic shared memory needed per thread; caps
//   the block size so a block fits in the assumed shared-memory budget.
// threadsmax: upper bound on threads per block.
// p2: if true, force the block size to the largest power of two <= threadsmax
//   (required by callers that use power-of-two reductions).
inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
  int3 gridparams;
  int sharedsize = 16000;  // assumed usable dynamic shared memory per block, bytes

  if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? sharedsize / shared_per_thread : threadsmax;

  // heuristic: pick a block size that yields roughly >= 60 blocks for occupancy
  if((n < 60 * 32) || (threadsmax < 64))
    gridparams.z = 32;
  else if((n < 60 * 64) || (threadsmax < 128))
    gridparams.z = 64;
  else if((n < 60 * 128) || (threadsmax < 256))
    gridparams.z = 128;
  else if((n < 60 * 256) || (threadsmax < 512))
    gridparams.z = 256;
  else gridparams.z = 512;

  if(p2) {
    // override: largest power of two not exceeding threadsmax (minimum 16)
    gridparams.z = 16;

    while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2;
  }


  int blocks = (n + gridparams.z - 1) / gridparams.z;  // ceil(n / blocksize)

  // very large grids are folded into a ~square 2D grid (1D grid dim limits)
  if(blocks > 10000)
    gridparams.x = gridparams.y = int(sqrt(blocks));
  else {
    gridparams.x = blocks;
    gridparams.y = 1;
  }

  // int(sqrt(...)) may undershoot; grow x until the grid covers all n items
  while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;

  if(gridparams.x == 0) gridparams.x = 1;  // n == 0 still needs a valid launch config

  return gridparams;
}
|
||||
|
||||
//return value: 1 if f<0; else: 0
|
||||
//take care if working with values as "blockId.x-n" for f: it might be interpreted as a unsigned int
|
||||
// Branchless sign-bit test: returns 1 if f's IEEE-754 sign bit is set, else 0.
// NOTE(review): this reports 1 for -0.0f and for negative NaNs as well, since
// it only inspects the sign bit — confirm callers expect that.
static inline __device__ int negativCUDA(float f)
{
  return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
}
|
||||
|
||||
//return value: -1 if f<0; else +1
|
||||
// Sign of f as a float: -1.0f for f < 0, otherwise +1.0f (including f == 0).
static inline __device__ float fsignCUDA(float f)
{
  if(f < 0.0f)
    return -1.0f;

  return 1.0f;
}
|
||||
|
||||
//functions to copy data between global and shared memory (indeed you can copy data between two arbitrary memory regims on device - as long as you have read respectively write rights)
|
||||
//blockDim.y and blockDim.z are assumed to be 1
|
||||
// Cooperative block-wide copy of n elements from shared to global memory
// (int / float / double overloads). All threads of the 1D block stride
// through the data; the tail (n % blockDim.x elements) is handled by the
// low-numbered threads. Assumes blockDim.y == blockDim.z == 1.
// Ends with __syncthreads() so the copy is complete for all threads on return.
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;  // full strides only; remainder handled after the loop

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

// float overload; identical logic to the int version above.
static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

// double overload; identical logic to the int version above.
static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}
|
||||
|
||||
// Cooperative block-wide copy of n elements from global to shared memory
// (int / float / double overloads); mirror of copySharedToGlob.
// Assumes blockDim.y == blockDim.z == 1; ends with __syncthreads().
static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;  // full strides only; remainder handled after the loop

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

// float overload; identical logic to the int version above.
static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

// double overload; same behavior, loop bound written inline instead of via k.
static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}
|
||||
|
||||
//copy data between two memory areas on device, 3d BlockDims are allowed
|
||||
// Cooperative copy of n elements between two device memory regions
// (double / float / int / unsigned int overloads). Unlike copyGlobToShared,
// these support 3D thread blocks: each thread's offset is its linear index
// within the block. Ends with __syncthreads().
static __device__ inline void copyData(double* source, double* target, const int &n)
{
  int i;
  // linear thread index within the (possibly 3D) block
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// float overload; identical logic to the double version above.
static __device__ inline void copyData(float* source, float* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// int overload; identical logic to the double version above.
static __device__ inline void copyData(int* source, int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// unsigned int overload; identical logic to the double version above.
static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}
|
||||
|
||||
//functions in order to sum over values of one block. P2 means blockdim MUST be a power of 2 otherwise the behaviour is not well defined
|
||||
//in the end in data[0]=sum_i=0^blockDim.x data[i]
|
||||
//for reduceBlockP2 and reduceBlock blockDim.y=1 and blockDim.z=1
|
||||
// Block-wide sum reduction for blockDim.x values in shared memory; result in
// data[0]. "P2" variants REQUIRE blockDim.x to be a power of two (behavior is
// undefined otherwise). Assumes blockDim.y == blockDim.z == 1.
// (int / unsigned int / float / double overloads.)
static __device__ inline void reduceBlockP2(int* data)
{
  __syncthreads();  // make all threads' writes to data[] visible first

  // classic tree reduction: halve the active thread count each pass
  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

// unsigned int overload; identical logic to the int version above.
static __device__ inline void reduceBlockP2(unsigned int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

// float overload; identical logic to the int version above.
static __device__ inline void reduceBlockP2(float* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

// double overload; identical logic to the int version above.
static __device__ inline void reduceBlockP2(double* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}
|
||||
|
||||
// Block-wide sum reduction for blockDim.x values in shared memory; result in
// data[0]. Works for ANY blockDim.x: the values beyond the largest power of
// two p2 < blockDim.x are first folded in, then a power-of-two tree reduction
// runs over the remaining p2 values. Assumes blockDim.y == blockDim.z == 1.
// (float / int / unsigned int / double overloads.)
static __device__ inline void reduceBlock(float* data)
{
  __syncthreads();  // make all threads' writes to data[] visible first
  int p2 = 1;

  // largest power of two strictly below blockDim.x
  while(p2 * 2 < blockDim.x) p2 *= 2;

  // fold the (blockDim.x - p2) excess values into the first p2 slots
  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  // tree reduction over the first p2 slots
  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

// int overload; identical logic to the float version above.
static __device__ inline void reduceBlock(int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

// unsigned int overload; identical logic to the float version above.
static __device__ inline void reduceBlock(unsigned int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

// double overload; identical logic to the float version above.
static __device__ inline void reduceBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}
|
||||
|
||||
// Cooperative fill of n elements of data[] with value, using all threads of
// the 1D block. NOTE(review): unlike the copy helpers above, these do NOT end
// with __syncthreads() — the caller must synchronize before reading data[].
static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;  // remainder
}

// float variant; identical logic to cudaFillBlockData_int.
static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}
|
||||
|
||||
// Sum reduction over n values in data[] where n may exceed blockDim.x; result
// intended in data[0]. Marked by the original author as possibly broken.
// NOTE(review): j is not reset to 0 between the fold phase and the tree phase,
// and is carried across tree iterations — this looks suspect for n larger than
// blockDim.x; verify before relying on these (the original comment agrees).
static __device__ inline void reduce(float* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  // largest power of two strictly below n
  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  // fold values beyond p2 into the first slots, multiple strides per thread
  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}

// double overload; identical logic (and identical caveats) to the float version.
static __device__ inline void reduce(double* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}
|
||||
|
||||
// Block-wide min/max reductions over blockDim.x shared-memory values; result
// in data[0]. Same non-power-of-two structure as reduceBlock, with MIN/MAX in
// place of +=. Assumes blockDim.y == blockDim.z == 1.
// (float and double overloads of each.)
static __device__ inline void minOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  // largest power of two strictly below blockDim.x
  while(p2 * 2 < blockDim.x) p2 *= 2;

  // fold the excess values into the first p2 slots
  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  // tree reduction over the first p2 slots
  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

// float max; identical structure with MAX.
static __device__ inline void maxOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

// double min; identical structure to the float version.
static __device__ inline void minOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

// double max; identical structure to the float version.
static __device__ inline void maxOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}
|
||||
|
||||
|
||||
// Min/max reductions over n values where n may exceed blockDim.x; result
// intended in data[0]. Marked by the original author as possibly broken.
// NOTE(review): like reduce(), j is never reset between phases or tree
// iterations — verify correctness for n > blockDim.x before relying on these.
// (double and float overloads of each.)
static __device__ inline void minOfData(double* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  // largest power of two strictly below n
  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  // fold values beyond p2 into the first slots, multiple strides per thread
  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// double max; identical structure with MAX.
static __device__ inline void maxOfData(double* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// float min; identical structure to the double version.
static __device__ inline void minOfData(float* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// float max; identical structure to the double version.
static __device__ inline void maxOfData(float* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}
|
||||
|
||||
#if X_PRECISION == 2
// Double-precision texture fetch workaround: textures cannot hold doubles, so
// the data is bound as int2/int4 and reassembled via __hiloint2double.
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);  // (hi, lo) -> double
}

// Fetch an X_CFLOAT4 (4 doubles) stored as two consecutive int4 texels.
static __device__ inline X_CFLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  X_CFLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif
|
||||
|
||||
// Bind the packed position+type array (x_type) to its texture reference.
// No-op unless compiled with CUDA_USE_TEXTURE. Channel format depends on
// X_PRECISION: float4 directly, or int4 pairs reassembled by tex1Dfetch_double.
inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _x_type_tex.normalized = false;                     // access with unnormalized coordinates
  _x_type_tex.filterMode = cudaFilterModePoint;       // point sampling, no interpolation
  _x_type_tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
  const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex);

#if X_PRECISION == 1
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_CFLOAT4));
#else
  // doubles: bind as int4, two texels per X_CFLOAT4
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
|
||||
|
||||
// Read element i of the packed position+type array, through the texture path
// when CUDA_USE_TEXTURE is defined, otherwise directly from global memory.
static __device__ inline X_CFLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
  return tex1Dfetch(_x_type_tex, i);
#else
  return tex1Dfetch_double(_x_type_tex, i);  // reassemble doubles from int4 texels
#endif
#else
  return _x_type[i];
#endif
}
|
||||
|
||||
#if V_PRECISION == 2
// Double-precision texture fetch for velocity-typed data (same technique as
// tex1Dfetch_double: bind as int2/int4 and reassemble via __hiloint2double).
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);  // (hi, lo) -> double
}

// Fetch a V_CFLOAT4 (4 doubles) stored as two consecutive int4 texels.
static __device__ inline V_CFLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  V_CFLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif
|
||||
|
||||
// Bind the packed velocity+radius array (v_radius) to its texture reference.
// No-op unless compiled with CUDA_USE_TEXTURE. Channel format depends on
// V_PRECISION: float4 directly, or int4 pairs reassembled by tex1Dfetch_double_v.
inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _v_radius_tex.normalized = false;                     // access with unnormalized coordinates
  _v_radius_tex.filterMode = cudaFilterModePoint;       // point sampling, no interpolation
  _v_radius_tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
  const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
  // FIX(consistency): size was computed with sizeof(X_CFLOAT4); this array is
  // V_CFLOAT4 data. Identical size while X and V precision agree, but wrong if
  // they ever diverge.
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_CFLOAT4));
#else
  // doubles: bind as int4, two texels per V_CFLOAT4
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
|
||||
|
||||
// Read element i of the packed velocity+radius array, through the texture path
// when CUDA_USE_TEXTURE is defined, otherwise directly from global memory.
static __device__ inline V_CFLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_v_radius_tex, i);
#else
  return tex1Dfetch_double_v(_v_radius_tex, i);  // reassemble doubles from int4 texels
#endif
#else
  return _v_radius[i];
#endif
}
|
||||
|
||||
// Bind the packed angular-velocity+mass array (omega_rmass) to its texture
// reference. No-op unless compiled with CUDA_USE_TEXTURE. Channel format
// depends on V_PRECISION, matching BindVRadiusTexture.
inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _omega_rmass_tex.normalized = false;                     // access with unnormalized coordinates
  _omega_rmass_tex.filterMode = cudaFilterModePoint;       // point sampling, no interpolation
  _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
  const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
  // FIX(consistency): size was computed with sizeof(X_CFLOAT4); this array is
  // V_CFLOAT4 data. Identical size while X and V precision agree, but wrong if
  // they ever diverge.
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_CFLOAT4));
#else
  // doubles: bind as int4, two texels per V_CFLOAT4
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
|
||||
|
||||
// Read element i of the packed angular-velocity+mass array, through the
// texture path when CUDA_USE_TEXTURE is defined, otherwise from global memory.
static __device__ inline V_CFLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_omega_rmass_tex, i);
#else
  return tex1Dfetch_double_v(_omega_rmass_tex, i);  // reassemble doubles from int4 texels
#endif
#else
  return _omega_rmass[i];
#endif
}
|
||||
|
||||
#if F_PRECISION == 2
// Fetch one double from a texture bound over int2 elements.
// CUDA textures cannot hold doubles directly, so each double is stored as
// two 32-bit halves (x = low word, y = high word) and reassembled with
// __hiloint2double.
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

// Fetch one F_CFLOAT4 (four doubles) from a texture bound over int4
// elements.  Each double occupies two 32-bit words, so logical element i
// spans the two int4 texels at 2*i and 2*i+1.
static __device__ inline F_CFLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);      // words of components x and y
  int4 u = tex1Dfetch(t, 2 * i + 1);  // words of components z and w
  F_CFLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif
|
||||
|
||||
// Bind the per-atom charge array to the _q_tex texture reference so device
// code can read charges through the texture cache (see fetchQ).  Does
// nothing unless CUDA_USE_TEXTURE is defined.
inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  // element-indexed, point-sampled, wrapping texture — plain array access
  _q_tex.normalized = false;
  _q_tex.filterMode = cudaFilterModePoint;
  _q_tex.addressMode[0] = cudaAddressModeWrap;

  const textureReference* tex_ref = &MY_AP(q_tex);

#if F_PRECISION == 1
  // single precision: one float per charge
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
  cudaBindTexture(0, tex_ref, sdata->atom.q.dev_data, &desc, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
  // double precision: each charge is stored as an int2 word pair
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<int2>();
  cudaBindTexture(0, tex_ref, sdata->atom.q.dev_data, &desc, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}
|
||||
|
||||
// Read the charge (F_CFLOAT) of atom i.
// With CUDA_USE_TEXTURE the read goes through the _q_tex texture cache;
// otherwise it is a plain global-memory load from _q.
static __device__ inline F_CFLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
  // single precision: texture element is a plain float
  return tex1Dfetch(_q_tex, i);
#else
  // double precision: texture holds int2 word pairs, reassemble the double
  return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
  return _q[i];
#endif
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
||||
inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
|
||||
{
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
_coeff_tex.normalized = false; // access with normalized texture coordinates
|
||||
_coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no
|
||||
_coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
|
||||
const textureReference* coeff_texture_ptr;
|
||||
cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));
|
||||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
|
||||
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_CFLOAT4));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
|
||||
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ inline X_CFLOAT4 fetchXType(int i)
|
||||
{
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
#if X_PRECISION == 1
|
||||
return tex1Dfetch(_x_type_tex,i);
|
||||
#else
|
||||
return tex1Dfetch_double(_x_type_tex,i);
|
||||
#endif
|
||||
#else
|
||||
return _x_type[i];
|
||||
#endif
|
||||
}
|
||||
*/
|
||||
#define SBBITS 30
|
||||
|
||||
// Extract the 2-bit special-bond flag packed into the top bits
// (SBBITS = 30, i.e. bits 30-31) of an encoded neighbor index j.
static inline __device__ int sbmask(int j)
{
  return j >> SBBITS & 3;
}
|
||||
|
||||
// Apply the minimum-image convention to the displacement vector `delta`,
// folding each component that crosses half a box length back into the
// primary cell.  For orthogonal boxes the components are independent; for
// triclinic boxes wrapping z also shifts y and x by the tilt factors
// _h[3] (yz) and _h[4] (xz), and wrapping y shifts x by _h[5] (xy), so the
// wraps must be applied in z -> y -> x order.
//
// BUGFIXES vs. the original:
//  * the triclinic z-wrap was gated on _periodicity[1] instead of
//    _periodicity[2] (copy-paste slip);
//  * the tilt corrections to y/x (resp. x) tested delta.z (resp. delta.y)
//    AFTER that component had already been wrapped, so they could never
//    trigger.  The tilt corrections are now applied before the wrap of the
//    component they depend on.
static inline __device__ void minimum_image(X_CFLOAT4 &delta)
{
  if(_triclinic == 0) {
    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }
  } else {
    if(_periodicity[2]) {
      // tilt corrections first: they depend on the unwrapped delta.z
      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }

    if(_periodicity[1]) {
      // xy tilt correction depends on the unwrapped delta.y
      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }
  }
}
|
||||
|
||||
// Store in `ci` the coordinates of the periodic image of x2 that is
// closest to x1 (only the x/y/z components of ci are written).
static inline __device__ void closest_image(X_CFLOAT4 &x1, X_CFLOAT4 &x2, X_CFLOAT4 &ci)
{
  // displacement from x1 to x2
  ci.x = x2.x - x1.x;
  ci.y = x2.y - x1.y;
  ci.z = x2.z - x1.z;
  // fold the displacement to the nearest periodic image
  minimum_image(ci);
  // translate back into absolute coordinates around x1
  ci.x += x1.x;
  ci.y += x1.y;
  ci.z += x1.z;
}
|
|
@ -1,22 +0,0 @@
|
|||
#include "cuda_precision.h"
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_cu.h"
|
||||
|
||||
// Record the compile-time configuration of this CUDA library in sdata so
// the host side can check it against its own build settings.
void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata)
{
  // precision of each data class, encoded as sizeof/4:
  // 1 = single precision, 2 = double precision
  sdata->compile_settings.prec_glob = sizeof(CUDA_CFLOAT) / 4;
  sdata->compile_settings.prec_x    = sizeof(X_CFLOAT) / 4;
  sdata->compile_settings.prec_v    = sizeof(V_CFLOAT) / 4;
  sdata->compile_settings.prec_f    = sizeof(F_CFLOAT) / 4;
  sdata->compile_settings.prec_pppm = sizeof(PPPM_CFLOAT) / 4;
  sdata->compile_settings.prec_fft  = sizeof(FFT_CFLOAT) / 4;

  // whether FFTs were compiled to run through cuFFT
#ifdef FFT_CUFFT
  sdata->compile_settings.cufft = 1;
#else
  sdata->compile_settings.cufft = 0;
#endif

  // target GPU architecture this library was built for
  sdata->compile_settings.arch = CUDA_ARCH;
}
|
|
@ -1,344 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef _CUDA_COMMON_H_
|
||||
#define _CUDA_COMMON_H_
|
||||
|
||||
//#include "cutil.h"
|
||||
#include "cuda_precision.h"
|
||||
#include "cuda_wrapper_cu.h"
|
||||
|
||||
#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
|
||||
//this can not be arbitrarly large, since constant space is limited.
|
||||
//in principle one could alter potentials to use global memory for parameters, some du that already since the first examples I encountered had a high number (20+) of atom types
|
||||
//Christian
|
||||
#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
|
||||
#define CUDA_MAX_NSPECIAL 25
|
||||
|
||||
// define some easy-to-use debug and emulation macros
|
||||
#ifdef _DEBUG
|
||||
#define MYDBG(a) a
|
||||
#else
|
||||
#define MYDBG(a)
|
||||
#endif
|
||||
|
||||
#if __DEVICE_EMULATION__
|
||||
#define MYEMU(a) a
|
||||
#else
|
||||
#define MYEMU(a)
|
||||
#endif
|
||||
|
||||
#define MYEMUDBG(a) MYEMU(MYDBG(a))
|
||||
|
||||
// Add Prefix (needed as workaround, same constant's names in different files causes conflict)
|
||||
#define MY_ADD_PREFIX(prefix, var) prefix##_##var
|
||||
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
|
||||
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
|
||||
|
||||
#define MY_VAR_TO_STR(var) #var
|
||||
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
|
||||
//#define &MY_AP(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
|
||||
//#define &MY_AP(var) &(MY_AP(var))
|
||||
#define CUDA_USE_TEXTURE
|
||||
#define CUDA_USE_CFLOAT4
|
||||
|
||||
//constants used by many classes
|
||||
|
||||
//domain
|
||||
#define _boxhi MY_AP(boxhi)
|
||||
#define _boxlo MY_AP(boxlo)
|
||||
#define _subhi MY_AP(subhi)
|
||||
#define _sublo MY_AP(sublo)
|
||||
#define _box_size MY_AP(box_size)
|
||||
#define _prd MY_AP(prd)
|
||||
#define _periodicity MY_AP(periodicity)
|
||||
#define _triclinic MY_AP(triclinic)
|
||||
#define _boxhi_lamda MY_AP(boxhi_lamda)
|
||||
#define _boxlo_lamda MY_AP(boxlo_lamda)
|
||||
#define _prd_lamda MY_AP(prd_lamda)
|
||||
#define _h MY_AP(h)
|
||||
#define _h_inv MY_AP(h_inv)
|
||||
#define _h_rate MY_AP(h_rate)
|
||||
__device__ __constant__ X_CFLOAT _boxhi[3];
|
||||
__device__ __constant__ X_CFLOAT _boxlo[3];
|
||||
__device__ __constant__ X_CFLOAT _subhi[3];
|
||||
__device__ __constant__ X_CFLOAT _sublo[3];
|
||||
__device__ __constant__ X_CFLOAT _box_size[3];
|
||||
__device__ __constant__ X_CFLOAT _prd[3];
|
||||
__device__ __constant__ int _periodicity[3];
|
||||
__device__ __constant__ int _triclinic;
|
||||
__device__ __constant__ X_CFLOAT _boxhi_lamda[3];
|
||||
__device__ __constant__ X_CFLOAT _boxlo_lamda[3];
|
||||
__device__ __constant__ X_CFLOAT _prd_lamda[3];
|
||||
__device__ __constant__ X_CFLOAT _h[6];
|
||||
__device__ __constant__ X_CFLOAT _h_inv[6];
|
||||
__device__ __constant__ V_CFLOAT _h_rate[6];
|
||||
|
||||
|
||||
//atom properties
|
||||
#define _x MY_AP(x)
|
||||
#define _v MY_AP(v)
|
||||
#define _f MY_AP(f)
|
||||
#define _tag MY_AP(tag)
|
||||
#define _type MY_AP(type)
|
||||
#define _mask MY_AP(mask)
|
||||
#define _image MY_AP(image)
|
||||
#define _q MY_AP(q)
|
||||
#define _mass MY_AP(mass)
|
||||
#define _rmass MY_AP(rmass)
|
||||
#define _rmass_flag MY_AP(rmass_flag)
|
||||
#define _eatom MY_AP(eatom)
|
||||
#define _vatom MY_AP(vatom)
|
||||
#define _x_type MY_AP(x_type)
|
||||
#define _radius MY_AP(radius)
|
||||
#define _density MY_AP(density)
|
||||
#define _omega MY_AP(omega)
|
||||
#define _torque MY_AP(torque)
|
||||
#define _special MY_AP(special)
|
||||
#define _maxspecial MY_AP(maxspecial)
|
||||
#define _nspecial MY_AP(nspecial)
|
||||
#define _special_flag MY_AP(special_flag)
|
||||
#define _molecule MY_AP(molecule)
|
||||
#define _v_radius MY_AP(v_radius)
|
||||
#define _omega_rmass MY_AP(omega_rmass)
|
||||
#define _freeze_group_bit MY_AP(freeze_group_bit)
|
||||
#define _map_array MY_AP(map_array)
|
||||
__device__ __constant__ X_CFLOAT* _x; //holds pointer to positions
|
||||
__device__ __constant__ V_CFLOAT* _v;
|
||||
__device__ __constant__ F_CFLOAT* _f;
|
||||
__device__ __constant__ int* _tag;
|
||||
__device__ __constant__ int* _type;
|
||||
__device__ __constant__ int* _mask;
|
||||
__device__ __constant__ int* _image;
|
||||
__device__ __constant__ V_CFLOAT* _mass;
|
||||
__device__ __constant__ F_CFLOAT* _q;
|
||||
__device__ __constant__ V_CFLOAT* _rmass;
|
||||
__device__ __constant__ int _rmass_flag;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _eatom;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _vatom;
|
||||
__device__ __constant__ X_CFLOAT4* _x_type; //holds pointer to positions
|
||||
__device__ __constant__ X_CFLOAT* _radius;
|
||||
__device__ __constant__ F_CFLOAT* _density;
|
||||
__device__ __constant__ V_CFLOAT* _omega;
|
||||
__device__ __constant__ F_CFLOAT* _torque;
|
||||
__device__ __constant__ int* _special;
|
||||
__device__ __constant__ int _maxspecial;
|
||||
__device__ __constant__ int* _nspecial;
|
||||
__device__ __constant__ int _special_flag[4];
|
||||
__device__ __constant__ int* _molecule;
|
||||
__device__ __constant__ V_CFLOAT4* _v_radius; //holds pointer to positions
|
||||
__device__ __constant__ V_CFLOAT4* _omega_rmass; //holds pointer to positions
|
||||
__device__ __constant__ int _freeze_group_bit;
|
||||
__device__ __constant__ int* _map_array;
|
||||
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
|
||||
#define _x_tex MY_AP(x_tex)
|
||||
#if X_PRECISION == 1
|
||||
texture<float> _x_tex;
|
||||
#else
|
||||
texture<int2, 1> _x_tex;
|
||||
#endif
|
||||
|
||||
#define _type_tex MY_AP(type_tex)
|
||||
texture<int> _type_tex;
|
||||
|
||||
#define _x_type_tex MY_AP(x_type_tex)
|
||||
#if X_PRECISION == 1
|
||||
texture<float4, 1> _x_type_tex;
|
||||
#else
|
||||
texture<int4, 1> _x_type_tex;
|
||||
#endif
|
||||
|
||||
#define _v_radius_tex MY_AP(v_radius_tex)
|
||||
#if V_PRECISION == 1
|
||||
texture<float4, 1> _v_radius_tex;
|
||||
#else
|
||||
texture<int4, 1> _v_radius_tex;
|
||||
#endif
|
||||
|
||||
#define _omega_rmass_tex MY_AP(omega_rmass_tex)
|
||||
#if V_PRECISION == 1
|
||||
texture<float4, 1> _omega_rmass_tex;
|
||||
#else
|
||||
texture<int4, 1> _omega_rmass_tex;
|
||||
#endif
|
||||
|
||||
#define _q_tex MY_AP(q_tex)
|
||||
#if F_PRECISION == 1
|
||||
texture<float> _q_tex;
|
||||
#else
|
||||
texture<int2, 1> _q_tex;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
//neighbor
|
||||
#ifdef IncludeCommonNeigh
|
||||
#define _inum MY_AP(inum)
|
||||
#define _inum_border MY_AP(inum_border)
|
||||
#define _ilist MY_AP(ilist)
|
||||
#define _ilist_border MY_AP(ilist_border)
|
||||
#define _numneigh MY_AP(numneigh)
|
||||
#define _numneigh_border MY_AP(numneigh_border)
|
||||
#define _numneigh_inner MY_AP(numneigh_inner)
|
||||
#define _firstneigh MY_AP(firstneigh)
|
||||
#define _neighbors MY_AP(neighbors)
|
||||
#define _neighbors_border MY_AP(neighbors_border)
|
||||
#define _neighbors_inner MY_AP(neighbors_inner)
|
||||
#define _reneigh_flag MY_AP(reneigh_flag)
|
||||
#define _triggerneighsq MY_AP(triggerneighsq)
|
||||
#define _xhold MY_AP(xhold)
|
||||
#define _maxhold MY_AP(maxhold)
|
||||
#define _dist_check MY_AP(dist_check)
|
||||
#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
|
||||
#define _maxneighbors MY_AP(maxneighbors)
|
||||
#define _overlap_comm MY_AP(overlap_comm)
|
||||
__device__ __constant__ int _inum;
|
||||
__device__ __constant__ int* _inum_border;
|
||||
__device__ __constant__ int* _ilist;
|
||||
__device__ __constant__ int* _ilist_border;
|
||||
__device__ __constant__ int* _numneigh;
|
||||
__device__ __constant__ int* _numneigh_border;
|
||||
__device__ __constant__ int* _numneigh_inner;
|
||||
__device__ __constant__ int** _firstneigh;
|
||||
__device__ __constant__ int* _neighbors;
|
||||
__device__ __constant__ int* _neighbors_border;
|
||||
__device__ __constant__ int* _neighbors_inner;
|
||||
__device__ __constant__ int* _reneigh_flag;
|
||||
__device__ __constant__ X_CFLOAT _triggerneighsq;
|
||||
__device__ __constant__ X_CFLOAT* _xhold; //holds pointer to positions
|
||||
__device__ __constant__ int _maxhold;
|
||||
__device__ __constant__ int _dist_check;
|
||||
__device__ __constant__ int _neighbor_maxlocal;
|
||||
__device__ __constant__ int _maxneighbors;
|
||||
__device__ __constant__ int _overlap_comm;
|
||||
#endif
|
||||
|
||||
//system properties
|
||||
#define _nall MY_AP(nall)
|
||||
#define _nghost MY_AP(nghost)
|
||||
#define _nlocal MY_AP(nlocal)
|
||||
#define _nmax MY_AP(nmax)
|
||||
#define _cuda_ntypes MY_AP(cuda_ntypes)
|
||||
#define _dtf MY_AP(dtf)
|
||||
#define _dtv MY_AP(dtv)
|
||||
#define _factor MY_AP(factor)
|
||||
#define _virial MY_AP(virial)
|
||||
#define _eng_vdwl MY_AP(eng_vdwl)
|
||||
#define _eng_coul MY_AP(eng_coul)
|
||||
#define _molecular MY_AP(molecular)
|
||||
__device__ __constant__ unsigned _nall;
|
||||
__device__ __constant__ unsigned _nghost;
|
||||
__device__ __constant__ unsigned _nlocal;
|
||||
__device__ __constant__ unsigned _nmax;
|
||||
__device__ __constant__ unsigned _cuda_ntypes;
|
||||
__device__ __constant__ V_CFLOAT _dtf;
|
||||
__device__ __constant__ X_CFLOAT _dtv;
|
||||
__device__ __constant__ V_CFLOAT _factor;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _virial;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _eng_vdwl;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _eng_coul;
|
||||
__device__ __constant__ int _molecular;
|
||||
|
||||
//other general constants
|
||||
#define _buffer MY_AP(buffer)
|
||||
#define _flag MY_AP(flag)
|
||||
#define _debugdata MY_AP(debugdata)
|
||||
__device__ __constant__ void* _buffer;
|
||||
__device__ __constant__ int* _flag;
|
||||
__device__ __constant__ int* _debugdata;
|
||||
|
||||
// pointers to data fields on GPU are hold in constant space
|
||||
// -> reduces register usage and number of parameters for kernelcalls
|
||||
// will be variables of file scope in cuda files
|
||||
|
||||
|
||||
|
||||
|
||||
// maybe used to output cudaError_t
|
||||
#define MY_OUTPUT_RESULT(result) \
|
||||
switch(result) \
|
||||
{ \
|
||||
case cudaSuccess: printf(" => cudaSuccess\n"); break; \
|
||||
case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
|
||||
case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
|
||||
case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
|
||||
case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
|
||||
default: printf(" => unknown\n"); break; \
|
||||
}
|
||||
|
||||
#ifdef _DEBUG
|
||||
# define CUT_CHECK_ERROR(errorMessage) { \
|
||||
cudaError_t err = cudaGetLastError(); \
|
||||
if( cudaSuccess != err) { \
|
||||
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
|
||||
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
err = cudaThreadSynchronize(); \
|
||||
if( cudaSuccess != err) { \
|
||||
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
|
||||
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
# define CUT_CHECK_ERROR(errorMessage) { \
|
||||
cudaError_t err = cudaGetLastError(); \
|
||||
if( cudaSuccess != err) { \
|
||||
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
|
||||
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
|
||||
cudaError err = call; \
|
||||
if( cudaSuccess != err) { \
|
||||
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
|
||||
__FILE__, __LINE__, cudaGetErrorString( err) ); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} }
|
||||
|
||||
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
|
||||
|
||||
#define X_MASK 1
|
||||
#define V_MASK 2
|
||||
#define F_MASK 4
|
||||
#define TAG_MASK 8
|
||||
#define TYPE_MASK 16
|
||||
#define MASK_MASK 32
|
||||
#define IMAGE_MASK 64
|
||||
#define Q_MASK 128
|
||||
#define MOLECULE_MASK 256
|
||||
#define RMASS_MASK 512
|
||||
#define RADIUS_MASK 1024
|
||||
#define DENSITY_MASK 2048
|
||||
#define OMEGA_MASK 4096
|
||||
#define TORQUE_MASK 8192
|
||||
|
||||
|
||||
|
||||
#endif // #ifdef _CUDA_COMMON_H_
|
|
@ -1 +0,0 @@
|
|||
extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata);
|
|
@ -1,220 +0,0 @@
|
|||
enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
|
||||
|
||||
#include "cuda_data_cu.h"
|
||||
#include "cuda_wrapper_cu.h"
|
||||
#include "cuda_data_kernel.cu"
|
||||
#include <cstdio>
|
||||
|
||||
// Upload host data stored as doubles into a device array of floats
// (precision demotion), applying the index permutation selected by `mode`.
// The raw doubles are first copied into the device staging `buffer`, then
// a kernel converts/permutes them into dev_data.  n[0..2] give the array
// dimensions; n[1] or n[2] equal to 0 mean the dimension is unused.
//
// BUGFIX: removed leftover debug instrumentation — an unbounded VLA
// `float debugdata[size]` (stack-overflow risk for large arrays), a
// per-call printf, and a redundant device download/compare pass.
void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // total element count
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  // block size grows with the amount of work
  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  grid.x = ((size - 1) + threads.x) / threads.x;  // ceil(size / threads.x)

  if(grid.x > 32000)   // stay below the per-dimension grid limit
    grid.x = 32000;

  // extend into the y-dimension until the launch covers all elements
  while(grid.x * grid.y * threads.x < size) grid.y++;

  size *= sizeof(double);  // byte count of the staged host data

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_DoubleFloat <<< grid, threads>>>((double*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();  // buffer may be reused immediately after return
}
|
||||
|
||||
// Upload host double data into a device double array, applying the index
// permutation selected by `mode`.  The data is staged through the device
// `buffer`, then a kernel permutes it into dev_data.  n[0..2] give the
// array dimensions; n[1] or n[2] equal to 0 mean the dimension is unused.
void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // total element count
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  dim3 block(1, 1, 1);
  dim3 grid(1, 1, 1);

  // block size grows with the amount of work
  block.x = (nelem <= 128 * 30) ? 32 :
            (nelem <= 256 * 30) ? 64 :
            (nelem <= 512 * 30) ? 128 : 256;

  // ceil(nelem / block.x), clamped below the per-dimension grid limit
  grid.x = ((nelem - 1) + block.x) / block.x;

  if(grid.x > 32000)
    grid.x = 32000;

  // extend into the y-dimension until the launch covers all elements
  while(grid.x * grid.y * block.x < nelem) grid.y++;

  const int nbytes = nelem * sizeof(double);

  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_DoubleDouble <<< grid, block>>>((double*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();  // buffer may be reused immediately after return
}
|
||||
|
||||
// Upload host data stored as floats into a device array of doubles
// (precision promotion), applying the index permutation selected by
// `mode`.  The floats are staged through the device `buffer`, then a
// kernel converts/permutes them into dev_data.  n[0..2] give the array
// dimensions; n[1] or n[2] equal to 0 mean the dimension is unused.
void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // total element count
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  // block size grows with the amount of work
  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // ceil(size / threads.x)
  grid.x = ((size - 1) + threads.x) / threads.x;

  // stay below the per-dimension grid limit
  if(grid.x > 32000)
    grid.x = 32000;

  // extend into the y-dimension until the launch covers all elements
  while(grid.x * grid.y * threads.x < size) grid.y++;

  // byte count of the staged host data
  size *= sizeof(float);

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_FloatDouble <<< grid, threads>>>((float*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
|
||||
|
||||
// Upload host float data into a device float array, applying the index
// permutation selected by `mode`.  The data is staged through the device
// `buffer`, then a kernel permutes it into dev_data.  n[0..2] give the
// array dimensions; n[1] or n[2] equal to 0 mean the dimension is unused.
void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // total element count
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  // block size grows with the amount of work
  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // ceil(size / threads.x)
  grid.x = ((size - 1) + threads.x) / threads.x;

  // stay below the per-dimension grid limit
  if(grid.x > 32000)
    grid.x = 32000;

  // extend into the y-dimension until the launch covers all elements
  while(grid.x * grid.y * threads.x < size) grid.y++;

  // byte count of the staged host data
  size *= sizeof(float);

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_FloatFloat <<< grid, threads>>>((float*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
|
||||
|
||||
// Upload host int data into a device int array, applying the index
// permutation selected by `mode`.  The data is staged through the device
// `buffer`, then a kernel permutes it into dev_data.  n[0..2] give the
// array dimensions; n[1] or n[2] equal to 0 mean the dimension is unused.
void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // total element count
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  // block size grows with the amount of work
  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  // ceil(size / threads.x)
  grid.x = ((size - 1) + threads.x) / threads.x;

  // stay below the per-dimension grid limit
  if(grid.x > 32000)
    grid.x = 32000;

  // extend into the y-dimension until the launch covers all elements
  while(grid.x * grid.y * threads.x < size) grid.y++;

  // byte count of the staged host data
  size *= sizeof(int);

  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_IntInt <<< grid, threads>>>((int*)buffer, (int*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
|
||||
|
||||
// Download counterpart of the CudaData_Upload_* functions.
// NOTE(review): body is intentionally empty in this version — no download
// path is implemented here; confirm callers do not rely on it.
void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer)
{
}
|
|
@ -1,13 +0,0 @@
|
|||
#ifndef CUDA_DATA_CU_H_
|
||||
#define CUDA_DATA_CU_H_
|
||||
|
||||
extern "C" void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
|
||||
extern "C" void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
|
||||
extern "C" void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
|
||||
extern "C" void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
|
||||
extern "C" void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
|
||||
|
||||
extern "C" void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer);
|
||||
|
||||
|
||||
#endif /*CUDA_DATA_CU_H_*/
|
|
@ -1,195 +0,0 @@
|
|||
// Copy `length` elements from the staging buffer into dev_data, converting
// double -> float and applying the index permutation selected by `mode`.
// One thread handles one element.  nx/ny/nz are the array dimensions
// (0 = dimension unused).
// BUGFIX: the original switch had no break statements, so every case fell
// through into the transpose cases below it, overwriting the result (and
// dividing by ny, which may be 0, in the yx case).
__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1-D copy is the identity case

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  switch(mode) {
    case xx:   // identity copies: element order unchanged
    case xy:
    case xyz:
      dev_data[i] = buffer[i];
      break;

    case yx:   // 2-D transpose: buffer is (nx, ny), dev_data is (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xzy:  // swap the last two dimensions of an (nx, ny, nz) array
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;

    default:
      break;
  }
}
|
||||
|
||||
// Copy `length` doubles from the staging buffer into dev_data, applying
// the index permutation selected by `mode`.  One thread per element.
// BUGFIX: the original switch had no break statements, so every case fell
// through into the transpose cases below it, overwriting the result (and
// dividing by ny, which may be 0, in the yx case).
__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1-D copy is the identity case

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  switch(mode) {
    case xx:   // identity copies: element order unchanged
    case xy:
    case xyz:
      dev_data[i] = buffer[i];
      break;

    case yx:   // 2-D transpose: buffer is (nx, ny), dev_data is (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xzy:  // swap the last two dimensions of an (nx, ny, nz) array
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;

    default:
      break;
  }
}
|
||||
|
||||
// Copy `length` elements from the staging buffer into dev_data, converting
// float -> double and applying the index permutation selected by `mode`.
// One thread per element.
// BUGFIX: the original switch had no break statements, so every case fell
// through into the transpose cases below it, overwriting the result (and
// dividing by ny, which may be 0, in the yx case).
__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1-D copy is the identity case

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  switch(mode) {
    case xx:   // identity copies: element order unchanged
    case xy:
    case xyz:
      dev_data[i] = buffer[i];
      break;

    case yx:   // 2-D transpose: buffer is (nx, ny), dev_data is (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xzy:  // swap the last two dimensions of an (nx, ny, nz) array
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;

    default:
      break;
  }
}
|
||||
|
||||
// Copy `length` floats from the staging buffer into dev_data, applying the
// index permutation selected by `mode`.  One thread per element.
// BUGFIX: the original switch had no break statements, so every case fell
// through into the transpose cases below it, overwriting the result (and
// dividing by ny, which may be 0, in the yx case).
__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1-D copy is the identity case

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  switch(mode) {
    case xx:   // identity copies: element order unchanged
    case xy:
    case xyz:
      dev_data[i] = buffer[i];
      break;

    case yx:   // 2-D transpose: buffer is (nx, ny), dev_data is (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xzy:  // swap the last two dimensions of an (nx, ny, nz) array
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;

    default:
      break;
  }
}
|
||||
|
||||
// Copy `length` ints from the staging buffer into dev_data, applying the
// index permutation selected by `mode`.  One thread per element.
// BUGFIX: the original switch had no break statements, so every case fell
// through into the transpose cases below it, overwriting the result (and
// dividing by ny, which may be 0, in the yx case).
__global__ void CudaData_Upload_Kernel_IntInt(int* buffer, int* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx;  // plain 1-D copy is the identity case

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  switch(mode) {
    case xx:   // identity copies: element order unchanged
    case xy:
    case xyz:
      dev_data[i] = buffer[i];
      break;

    case yx:   // 2-D transpose: buffer is (nx, ny), dev_data is (ny, nx)
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xzy:  // swap the last two dimensions of an (nx, ny, nz) array
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;

    default:
      break;
  }
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,30 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag);
|
File diff suppressed because it is too large
Load Diff
|
@ -1,126 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ ENERGY_CFLOAT sharedmem[];
|
||||
|
||||
static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, int coulflag = 0)
|
||||
{
|
||||
__syncthreads();
|
||||
ENERGY_CFLOAT* shared = sharedmem;
|
||||
|
||||
if(eflag) {
|
||||
reduceBlock(shared);
|
||||
shared += blockDim.x;
|
||||
|
||||
if(coulflag) {
|
||||
reduceBlock(shared);
|
||||
shared += blockDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
if(vflag) {
|
||||
reduceBlock(shared + 0 * blockDim.x);
|
||||
reduceBlock(shared + 1 * blockDim.x);
|
||||
reduceBlock(shared + 2 * blockDim.x);
|
||||
reduceBlock(shared + 3 * blockDim.x);
|
||||
reduceBlock(shared + 4 * blockDim.x);
|
||||
reduceBlock(shared + 5 * blockDim.x);
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
shared = sharedmem;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(eflag) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
|
||||
shared += blockDim.x;
|
||||
buffer += gridDim.x * gridDim.y;
|
||||
|
||||
if(coulflag) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
|
||||
shared += blockDim.x;
|
||||
buffer += gridDim.x * gridDim.y;
|
||||
}
|
||||
}
|
||||
|
||||
if(vflag) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[0 * blockDim.x];
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[1 * blockDim.x];
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[2 * blockDim.x];
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[3 * blockDim.x];
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[4 * blockDim.x];
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[5 * blockDim.x];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__global__ void MY_AP(PairVirialCompute_reduce)(int n)
|
||||
{
|
||||
sharedmem[threadIdx.x] = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT sum = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
|
||||
buf = &buf[blockIdx.x * n];
|
||||
//if(blockIdx.x==2) buf=&buf[n];
|
||||
|
||||
for(int i = 0; i < n; i += blockDim.x) {
|
||||
sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0);
|
||||
__syncthreads();
|
||||
reduceBlock(sharedmem);
|
||||
|
||||
if(threadIdx.x == 0) sum += sharedmem[0];
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
if(gridDim.x == 1) { //evdwl
|
||||
_eng_vdwl[0] += sum;
|
||||
}
|
||||
|
||||
if(gridDim.x == 2) { //evdwl + ecoul only
|
||||
if(blockIdx.x == 0)
|
||||
_eng_vdwl[0] += sum;
|
||||
else
|
||||
_eng_coul[0] += sum;
|
||||
}
|
||||
|
||||
if(gridDim.x == 6) { //virial
|
||||
_virial[blockIdx.x] += sum;
|
||||
}
|
||||
|
||||
if(gridDim.x == 7) { //evdwl+virial
|
||||
if(blockIdx.x == 0)
|
||||
_eng_vdwl[0] += sum;
|
||||
else _virial[blockIdx.x - 1] += sum;
|
||||
}
|
||||
|
||||
if(gridDim.x == 8) { //evdwl+ecoul+virial
|
||||
if(blockIdx.x == 0)
|
||||
_eng_vdwl[0] += sum;
|
||||
else if(blockIdx.x == 1)
|
||||
_eng_coul[0] += sum;
|
||||
else
|
||||
_virial[blockIdx.x - 2] += sum;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,278 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef CUDA_PRECISION_H_
|
||||
#define CUDA_PRECISION_H_
|
||||
/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA.
|
||||
* Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation).
|
||||
* ***_CFLOAT: type definition of given property
|
||||
* ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F)
|
||||
*/
|
||||
|
||||
#ifdef CUDA_USE_BINNING
|
||||
#define CUDA_IF_BINNING(a) a
|
||||
#else
|
||||
#define CUDA_IF_BINNING(a)
|
||||
#endif
|
||||
|
||||
//GLOBAL
|
||||
|
||||
#ifdef CUDA_PRECISION
|
||||
#if CUDA_PRECISION == 1
|
||||
#define CUDA_CFLOAT float
|
||||
#define CUDA_F(x) x##f
|
||||
#endif
|
||||
#if CUDA_PRECISION == 2
|
||||
#define CUDA_CFLOAT double
|
||||
#define CUDA_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef CUDA_PRECISION
|
||||
#define CUDA_CFLOAT double
|
||||
#define CUDA_F(x) x
|
||||
#define CUDA_PRECISION 2
|
||||
#endif
|
||||
//--------------------------------
|
||||
//-----------FFT-----------------
|
||||
//--------------------------------
|
||||
|
||||
#ifdef FFT_PRECISION_CU
|
||||
#if FFT_PRECISION_CU == 1
|
||||
#define FFT_CFLOAT float
|
||||
#define FFT_F(x) x##f
|
||||
#endif
|
||||
#if FFT_PRECISION_CU == 2
|
||||
#define FFT_CFLOAT double
|
||||
#define FFT_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef FFT_PRECISION_CU
|
||||
#define FFT_CFLOAT CUDA_CFLOAT
|
||||
#define FFT_F(x) CUDA_F(x)
|
||||
#define FFT_PRECISION_CU CUDA_PRECISION
|
||||
#endif
|
||||
|
||||
//--------------------------------
|
||||
//-----------PPPM-----------------
|
||||
//--------------------------------
|
||||
|
||||
#ifndef PPPM_PRECISION
|
||||
#define PPPM_PRECISION CUDA_PRECISION
|
||||
#endif
|
||||
|
||||
#ifdef PPPM_PRECISION
|
||||
#if PPPM_PRECISION == 1
|
||||
#define PPPM_CFLOAT float
|
||||
#ifdef float3
|
||||
#define PPPM_CFLOAT3 float3
|
||||
#else
|
||||
struct PPPM_CFLOAT3 {
|
||||
PPPM_CFLOAT x;
|
||||
PPPM_CFLOAT y;
|
||||
PPPM_CFLOAT z;
|
||||
};
|
||||
#endif
|
||||
#define PPPM_F(x) x##f
|
||||
#endif
|
||||
#if PPPM_PRECISION == 2
|
||||
#define PPPM_CFLOAT double
|
||||
struct PPPM_CFLOAT3 {
|
||||
PPPM_CFLOAT x;
|
||||
PPPM_CFLOAT y;
|
||||
PPPM_CFLOAT z;
|
||||
};
|
||||
#define PPPM_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
//--------------------------------
|
||||
//-----------FORCE-----------------
|
||||
//--------------------------------
|
||||
|
||||
|
||||
#ifdef F_PRECISION
|
||||
#if F_PRECISION == 1
|
||||
#define F_CFLOAT float
|
||||
#define F_F(x) x##f
|
||||
#endif
|
||||
#if F_PRECISION == 2
|
||||
#define F_CFLOAT double
|
||||
#define F_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef F_PRECISION
|
||||
#define F_CFLOAT CUDA_CFLOAT
|
||||
#define F_F(x) CUDA_F(x)
|
||||
#define F_PRECISION CUDA_PRECISION
|
||||
#endif
|
||||
|
||||
#if F_PRECISION == 1
|
||||
#define _SQRT_ sqrtf
|
||||
#define _RSQRT_ rsqrtf
|
||||
#define _EXP_ expf
|
||||
#else
|
||||
#define _SQRT_ sqrt
|
||||
#define _RSQRT_ rsqrt
|
||||
#define _EXP_ exp
|
||||
#endif
|
||||
|
||||
#if F_PRECISION == 2
|
||||
struct F_CFLOAT2 {
|
||||
F_CFLOAT x;
|
||||
F_CFLOAT y;
|
||||
};
|
||||
struct F_CFLOAT3 {
|
||||
F_CFLOAT x;
|
||||
F_CFLOAT y;
|
||||
F_CFLOAT z;
|
||||
};
|
||||
struct F_CFLOAT4 {
|
||||
F_CFLOAT x;
|
||||
F_CFLOAT y;
|
||||
F_CFLOAT z;
|
||||
F_CFLOAT w;
|
||||
};
|
||||
#else
|
||||
#define F_CFLOAT2 float2
|
||||
#define F_CFLOAT3 float3
|
||||
#define F_CFLOAT4 float4
|
||||
#endif
|
||||
|
||||
//--------------------------------
|
||||
//-----------ENERGY-----------------
|
||||
//--------------------------------
|
||||
|
||||
#ifndef ENERGY_PRECISION
|
||||
#define ENERGY_CFLOAT CUDA_CFLOAT
|
||||
#define ENERGY_F(x) CUDA_F(x)
|
||||
#endif
|
||||
|
||||
#ifdef ENERGY_PRECISION
|
||||
#if ENERGY_PRECISION == 1
|
||||
#define ENERGY_CFLOAT float
|
||||
#define ENERGY_F(x) x##f
|
||||
#endif
|
||||
#if ENERGY_PRECISION == 2
|
||||
#define ENERGY_CFLOAT double
|
||||
#define ENERGY_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef ENERGY_PRECISION
|
||||
#define ENERGY_CFLOAT CUDA_CFLOAT
|
||||
#define ENERGY_F(x) CUDA_F(x)
|
||||
#define ENERGY_PRECISION CUDA_PRECISION
|
||||
#endif
|
||||
|
||||
//--------------------------------
|
||||
//-----------POSITIONS------------
|
||||
//--------------------------------
|
||||
|
||||
#ifdef X_PRECISION
|
||||
#if X_PRECISION == 1
|
||||
#define X_CFLOAT float
|
||||
#define X_F(x) x##f
|
||||
#endif
|
||||
#if X_PRECISION == 2
|
||||
#define X_CFLOAT double
|
||||
#define X_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef X_PRECISION
|
||||
#define X_CFLOAT CUDA_CFLOAT
|
||||
#define X_F(x) CUDA_F(x)
|
||||
#define X_PRECISION CUDA_PRECISION
|
||||
#endif
|
||||
|
||||
#if X_PRECISION == 2
|
||||
struct X_CFLOAT2 {
|
||||
X_CFLOAT x;
|
||||
X_CFLOAT y;
|
||||
};
|
||||
struct X_CFLOAT3 {
|
||||
X_CFLOAT x;
|
||||
X_CFLOAT y;
|
||||
X_CFLOAT z;
|
||||
};
|
||||
struct X_CFLOAT4 {
|
||||
X_CFLOAT x;
|
||||
X_CFLOAT y;
|
||||
X_CFLOAT z;
|
||||
X_CFLOAT w;
|
||||
};
|
||||
#else
|
||||
#define X_CFLOAT2 float2
|
||||
#define X_CFLOAT3 float3
|
||||
#define X_CFLOAT4 float4
|
||||
#endif
|
||||
|
||||
//--------------------------------
|
||||
//-----------velocities-----------
|
||||
//--------------------------------
|
||||
|
||||
#ifdef V_PRECISION
|
||||
#if V_PRECISION == 1
|
||||
#define V_CFLOAT float
|
||||
#define V_F(x) x##f
|
||||
#endif
|
||||
#if V_PRECISION == 2
|
||||
#define V_CFLOAT double
|
||||
#define V_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef V_PRECISION
|
||||
#define V_CFLOAT CUDA_CFLOAT
|
||||
#define V_F(x) CUDA_F(x)
|
||||
#define V_PRECISION CUDA_PRECISION
|
||||
#endif
|
||||
|
||||
#if V_PRECISION == 2
|
||||
struct V_CFLOAT4 {
|
||||
V_CFLOAT x;
|
||||
V_CFLOAT y;
|
||||
V_CFLOAT z;
|
||||
V_CFLOAT w;
|
||||
};
|
||||
#else
|
||||
#define V_CFLOAT4 float4
|
||||
#endif
|
||||
|
||||
#ifdef NO_PREC_TIMING
|
||||
struct my_times {
|
||||
unsigned int tv_sec;
|
||||
unsigned int tv_nsec;
|
||||
};
|
||||
|
||||
#define my_gettime(a,b)
|
||||
#else
|
||||
#define my_times timespec
|
||||
#define my_gettime(a,b) clock_gettime(a,b)
|
||||
#endif
|
||||
|
||||
#endif /*CUDA_PRECISION_H_*/
|
|
@ -1,370 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef _CUDA_SHARED_H_
|
||||
#define _CUDA_SHARED_H_
|
||||
#include "cuda_precision.h"
|
||||
|
||||
#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int)
|
||||
|
||||
struct dev_array {
|
||||
void* dev_data; // pointer to memory address on cuda device
|
||||
unsigned dim[3]; // array dimensions
|
||||
};
|
||||
|
||||
struct cuda_shared_atom { // relevent data from atom class
|
||||
dev_array dx; // cumulated distance for binning settings
|
||||
dev_array x; // position
|
||||
dev_array v; // velocity
|
||||
dev_array f; // force
|
||||
dev_array tag;
|
||||
dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1)
|
||||
dev_array mask;
|
||||
dev_array image;
|
||||
dev_array q; // charges
|
||||
dev_array mass; // per-type masses
|
||||
dev_array rmass; // per-atom masses
|
||||
dev_array radius; // per-atom radius
|
||||
dev_array density;
|
||||
dev_array omega;
|
||||
dev_array torque;
|
||||
dev_array molecule;
|
||||
|
||||
dev_array special;
|
||||
int maxspecial;
|
||||
dev_array nspecial;
|
||||
int* special_flag;
|
||||
int molecular;
|
||||
|
||||
dev_array eatom; // per-atom energy
|
||||
dev_array vatom; // per-atom virial
|
||||
int need_eatom;
|
||||
int need_vatom;
|
||||
|
||||
dev_array x_type; // position + type in X_CFLOAT4 struct
|
||||
dev_array v_radius; // velociyt + radius in V_CFLOAT4 struct currently only used for granular atom_style
|
||||
dev_array omega_rmass; // velociyt + radius in V_CFLOAT4 struct currently only used for granular atom_style
|
||||
|
||||
double* mass_host; // remember per-type host pointer to masses
|
||||
//int natoms; // total # of atoms in system, could be 0
|
||||
int nghost; // and ghost atoms on this proc
|
||||
int nlocal; // # of owned
|
||||
int nall; // total # of atoms in this proc
|
||||
int nmax; // max # of owned+ghost in arrays on this proc
|
||||
int ntypes;
|
||||
int q_flag; // do we have charges?
|
||||
int rmass_flag; // do we have per-atom masses?
|
||||
int firstgroup;
|
||||
int nfirst;
|
||||
|
||||
int update_nlocal;
|
||||
int update_nmax;
|
||||
int update_neigh;
|
||||
|
||||
dev_array xhold; // position at last neighboring
|
||||
X_CFLOAT triggerneighsq; // maximum square movement before reneighboring
|
||||
int reneigh_flag; // is reneighboring necessary
|
||||
int maxhold; // size of xhold
|
||||
int dist_check; //perform distance check for reneighboring
|
||||
dev_array binned_id; //id of each binned atom (not tag!!)
|
||||
dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]]
|
||||
float bin_extraspace;
|
||||
int bin_dim[3];
|
||||
int bin_nmax;
|
||||
dev_array map_array;
|
||||
};
|
||||
|
||||
struct cuda_shared_pair { // relevent data from pair class
|
||||
char cudable_force; // check for (cudable_force!=0)
|
||||
X_CFLOAT cut_global;
|
||||
X_CFLOAT cut_inner_global;
|
||||
X_CFLOAT cut_coul_global;
|
||||
double** cut; // type-type cutoff
|
||||
double** cutsq; // type-type cutoff
|
||||
double** cut_inner; // type-type cutoff for coul
|
||||
double** cut_coul; // type-type cutoff for coul
|
||||
double** coeff1; // tpye-type pair parameters
|
||||
double** coeff2;
|
||||
double** coeff3;
|
||||
double** coeff4;
|
||||
double** coeff5;
|
||||
double** coeff6;
|
||||
double** coeff7;
|
||||
double** coeff8;
|
||||
double** coeff9;
|
||||
double** coeff10;
|
||||
double** offset;
|
||||
double* special_lj;
|
||||
double* special_coul;
|
||||
dev_array virial; // ENERGY_CFLOAT
|
||||
dev_array eng_vdwl; // ENERGY_CFLOAT
|
||||
dev_array eng_coul; // ENERGY_CFLOAT
|
||||
X_CFLOAT cut_coulsq_global;
|
||||
F_CFLOAT g_ewald, kappa;
|
||||
int freeze_group_bit;
|
||||
|
||||
dev_array coeff1_gm;
|
||||
dev_array coeff2_gm;
|
||||
dev_array coeff3_gm;
|
||||
dev_array coeff4_gm;
|
||||
dev_array coeff5_gm;
|
||||
dev_array coeff6_gm;
|
||||
dev_array coeff7_gm;
|
||||
dev_array coeff8_gm;
|
||||
dev_array coeff9_gm;
|
||||
dev_array coeff10_gm;
|
||||
|
||||
int lastgridsize;
|
||||
int n_energy_virial;
|
||||
int collect_forces_later;
|
||||
int use_block_per_atom;
|
||||
int override_block_per_atom;
|
||||
bool neighall;
|
||||
|
||||
};
|
||||
|
||||
struct cuda_shared_domain { // relevent data from domain class
|
||||
X_CFLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
|
||||
X_CFLOAT subhi[3];
|
||||
X_CFLOAT boxlo[3];
|
||||
X_CFLOAT boxhi[3];
|
||||
X_CFLOAT prd[3];
|
||||
int periodicity[3]; // xyz periodicity as array
|
||||
|
||||
int triclinic;
|
||||
X_CFLOAT xy;
|
||||
X_CFLOAT xz;
|
||||
X_CFLOAT yz;
|
||||
X_CFLOAT boxlo_lamda[3];
|
||||
X_CFLOAT boxhi_lamda[3];
|
||||
X_CFLOAT prd_lamda[3];
|
||||
X_CFLOAT h[6];
|
||||
X_CFLOAT h_inv[6];
|
||||
V_CFLOAT h_rate[6];
|
||||
int update;
|
||||
};
|
||||
|
||||
struct cuda_shared_pppm {
|
||||
char cudable_force;
|
||||
#ifdef FFT_CUFFT
|
||||
FFT_CFLOAT* work1;
|
||||
FFT_CFLOAT* work2;
|
||||
FFT_CFLOAT* work3;
|
||||
PPPM_CFLOAT* greensfn;
|
||||
PPPM_CFLOAT* fkx;
|
||||
PPPM_CFLOAT* fky;
|
||||
PPPM_CFLOAT* fkz;
|
||||
PPPM_CFLOAT* vg;
|
||||
#endif
|
||||
int* part2grid;
|
||||
PPPM_CFLOAT* density_brick;
|
||||
int* density_brick_int;
|
||||
PPPM_CFLOAT density_intScale;
|
||||
PPPM_CFLOAT* vdx_brick;
|
||||
PPPM_CFLOAT* vdy_brick;
|
||||
PPPM_CFLOAT* vdz_brick;
|
||||
PPPM_CFLOAT* density_fft;
|
||||
ENERGY_CFLOAT* energy;
|
||||
ENERGY_CFLOAT* virial;
|
||||
int nxlo_in;
|
||||
int nxhi_in;
|
||||
int nxlo_out;
|
||||
int nxhi_out;
|
||||
int nylo_in;
|
||||
int nyhi_in;
|
||||
int nylo_out;
|
||||
int nyhi_out;
|
||||
int nzlo_in;
|
||||
int nzhi_in;
|
||||
int nzlo_out;
|
||||
int nzhi_out;
|
||||
int nx_pppm;
|
||||
int ny_pppm;
|
||||
int nz_pppm;
|
||||
PPPM_CFLOAT qqrd2e;
|
||||
int order;
|
||||
// float3 sublo;
|
||||
PPPM_CFLOAT* rho_coeff;
|
||||
int nmax;
|
||||
int nlocal;
|
||||
PPPM_CFLOAT* debugdata;
|
||||
PPPM_CFLOAT delxinv;
|
||||
PPPM_CFLOAT delyinv;
|
||||
PPPM_CFLOAT delzinv;
|
||||
int nlower;
|
||||
int nupper;
|
||||
PPPM_CFLOAT shiftone;
|
||||
PPPM_CFLOAT3* fH;
|
||||
};
|
||||
|
||||
struct cuda_shared_comm {
|
||||
int maxswap;
|
||||
int maxlistlength;
|
||||
dev_array pbc;
|
||||
dev_array slablo;
|
||||
dev_array slabhi;
|
||||
dev_array multilo;
|
||||
dev_array multihi;
|
||||
dev_array sendlist;
|
||||
int grow_flag;
|
||||
int comm_phase;
|
||||
|
||||
int nsend;
|
||||
int* nsend_swap;
|
||||
int* send_size;
|
||||
int* recv_size;
|
||||
double** buf_send;
|
||||
void** buf_send_dev;
|
||||
double** buf_recv;
|
||||
void** buf_recv_dev;
|
||||
void* buffer;
|
||||
int buffer_size;
|
||||
double overlap_split_ratio;
|
||||
};
|
||||
|
||||
struct cuda_shared_neighlist { // member of CudaNeighList, has no instance in cuda_shared_data
|
||||
int maxlocal;
|
||||
int inum; // # of I atoms neighbors are stored for local indices of I atoms
|
||||
int inum_border2;
|
||||
dev_array inum_border; // # of atoms which interact with border atoms
|
||||
dev_array ilist;
|
||||
dev_array ilist_border;
|
||||
dev_array numneigh;
|
||||
dev_array numneigh_inner;
|
||||
dev_array numneigh_border;
|
||||
dev_array firstneigh;
|
||||
dev_array neighbors;
|
||||
dev_array neighbors_border;
|
||||
dev_array neighbors_inner;
|
||||
int maxpage;
|
||||
dev_array page_pointers;
|
||||
dev_array* pages;
|
||||
int maxneighbors;
|
||||
int neigh_lists_per_page;
|
||||
double** cutneighsq;
|
||||
CUDA_CFLOAT* cu_cutneighsq;
|
||||
int* binned_id;
|
||||
int* bin_dim;
|
||||
int bin_nmax;
|
||||
float bin_extraspace;
|
||||
double maxcut;
|
||||
dev_array ex_type;
|
||||
int nex_type;
|
||||
dev_array ex1_bit;
|
||||
dev_array ex2_bit;
|
||||
int nex_group;
|
||||
dev_array ex_mol_bit;
|
||||
int nex_mol;
|
||||
|
||||
};
|
||||
|
||||
struct cuda_compile_settings { // this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files
|
||||
int prec_glob;
|
||||
int prec_x;
|
||||
int prec_v;
|
||||
int prec_f;
|
||||
int prec_pppm;
|
||||
int prec_fft;
|
||||
int cufft;
|
||||
int arch;
|
||||
};
|
||||
|
||||
struct cuda_timings_struct {
|
||||
//Debug:
|
||||
double test1;
|
||||
double test2;
|
||||
//transfers
|
||||
double transfer_upload_tmp_constr;
|
||||
double transfer_download_tmp_deconstr;
|
||||
|
||||
//communication
|
||||
double comm_forward_total;
|
||||
double comm_forward_mpi_upper;
|
||||
double comm_forward_mpi_lower;
|
||||
double comm_forward_kernel_pack;
|
||||
double comm_forward_kernel_unpack;
|
||||
double comm_forward_kernel_self;
|
||||
double comm_forward_upload;
|
||||
double comm_forward_download;
|
||||
|
||||
double comm_exchange_total;
|
||||
double comm_exchange_mpi;
|
||||
double comm_exchange_kernel_pack;
|
||||
double comm_exchange_kernel_unpack;
|
||||
double comm_exchange_kernel_fill;
|
||||
double comm_exchange_cpu_pack;
|
||||
double comm_exchange_upload;
|
||||
double comm_exchange_download;
|
||||
|
||||
double comm_border_total;
|
||||
double comm_border_mpi;
|
||||
double comm_border_kernel_pack;
|
||||
double comm_border_kernel_unpack;
|
||||
double comm_border_kernel_self;
|
||||
double comm_border_kernel_buildlist;
|
||||
double comm_border_upload;
|
||||
double comm_border_download;
|
||||
|
||||
//pair forces
|
||||
double pair_xtype_conversion;
|
||||
double pair_kernel;
|
||||
double pair_virial;
|
||||
double pair_force_collection;
|
||||
|
||||
//neighbor
|
||||
double neigh_bin;
|
||||
double neigh_build;
|
||||
double neigh_special;
|
||||
|
||||
//PPPM
|
||||
double pppm_particle_map;
|
||||
double pppm_make_rho;
|
||||
double pppm_brick2fft;
|
||||
double pppm_poisson;
|
||||
double pppm_fillbrick;
|
||||
double pppm_fieldforce;
|
||||
double pppm_compute;
|
||||
|
||||
};
|
||||
|
||||
struct cuda_shared_data { // holds space for all relevent data from the different classes
|
||||
void* buffer; //holds temporary GPU data [data used in subroutines, which has not to be consistend outside of that routine]
|
||||
int buffersize; //maxsize of buffer
|
||||
int buffer_new; //should be 1 if the pointer to buffer has changed
|
||||
void* flag;
|
||||
void* debugdata; //array for easily collecting debugdata from device class cuda contains the corresponding cu_debugdata and host array
|
||||
cuda_shared_atom atom;
|
||||
cuda_shared_pair pair;
|
||||
cuda_shared_domain domain;
|
||||
cuda_shared_pppm pppm;
|
||||
cuda_shared_comm comm;
|
||||
cuda_compile_settings compile_settings;
|
||||
cuda_timings_struct cuda_timings;
|
||||
int exchange_dim;
|
||||
int me; //mpi rank
|
||||
unsigned int datamask;
|
||||
int overlap_comm;
|
||||
};
|
||||
|
||||
|
||||
#endif // #ifndef _CUDA_SHARED_H_
|
|
@ -1,337 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "cuda_wrapper_cu.h"
|
||||
#include "cuda_wrapper_kernel.cu"
|
||||
|
||||
static int CudaWrapper_total_gpu_mem = 0;
|
||||
static double CudaWrapper_total_upload_time = 0;
|
||||
static double CudaWrapper_total_download_time = 0;
|
||||
static double CudaWrapper_cpubuffer_upload_time = 0;
|
||||
static double CudaWrapper_cpubuffer_download_time = 0;
|
||||
static cudaStream_t* streams;
|
||||
static int nstreams = 0;
|
||||
|
||||
void CudaWrapper_Init(int argc, char** argv, int me, int ppn, int* devicelist)
|
||||
{
|
||||
MYDBG(printf("# CUDA: debug mode on\n");)
|
||||
|
||||
#if __DEVICE_EMULATION__
|
||||
|
||||
printf("# CUDA: emulation mode on\n");
|
||||
|
||||
#else
|
||||
|
||||
// modified from cutil.h
|
||||
static int deviceCount = 0;
|
||||
static bool sharedmode = false;
|
||||
|
||||
if(deviceCount && !sharedmode) return;
|
||||
|
||||
if(deviceCount && sharedmode) cudaThreadExit();
|
||||
|
||||
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));
|
||||
|
||||
if(deviceCount == 0) {
|
||||
fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
MYDBG(printf("# CUDA There are %i devices supporting CUDA in this system.\n", deviceCount);)
|
||||
|
||||
cudaDeviceProp deviceProp[deviceCount];
|
||||
|
||||
for(int i = 0; i < deviceCount; i++)
|
||||
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&(deviceProp[i]), i));
|
||||
|
||||
|
||||
int dev_list[deviceCount];
|
||||
|
||||
for(int i = 0; i < deviceCount; i++) dev_list[i] = i;
|
||||
|
||||
for(int i = 0; i < deviceCount; i++) {
|
||||
for(int j = 0; j < deviceCount - 1 - i; j++)
|
||||
if(deviceProp[dev_list[j]].multiProcessorCount < deviceProp[dev_list[j + 1]].multiProcessorCount) {
|
||||
int k = dev_list[j];
|
||||
dev_list[j] = dev_list[j + 1];
|
||||
dev_list[j + 1] = k;
|
||||
}
|
||||
}
|
||||
|
||||
for(int i = 0; i < deviceCount; i++) {
|
||||
if((deviceProp[dev_list[i]].computeMode == 0)) sharedmode = true;
|
||||
|
||||
cudaSetDevice(i);
|
||||
cudaSetDeviceFlags(cudaDeviceMapHost);
|
||||
}
|
||||
|
||||
if(sharedmode) {
|
||||
if(ppn && (me % ppn + 1) > deviceCount) {
|
||||
printf("Asking for more GPUs per node when there are. Reduce gpu/node setting.\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
int devicea = me % ppn;
|
||||
|
||||
if(devicelist) devicea = devicelist[devicea];
|
||||
else
|
||||
devicea = dev_list[devicea];
|
||||
|
||||
if(devicea >= deviceCount) {
|
||||
printf("Asking for non existent GPU %i. Found only %i GPUs.\n", devicea, deviceCount);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
MYDBG(
|
||||
printf(" # CUDA myid: %i take device: %i\n", me, devicea);
|
||||
)
|
||||
CUDA_SAFE_CALL(cudaSetDevice(devicea));
|
||||
} else {
|
||||
CUDA_SAFE_CALL(cudaSetValidDevices(dev_list, deviceCount));
|
||||
}
|
||||
|
||||
cudaThreadSynchronize();
|
||||
|
||||
int dev;
|
||||
CUDA_SAFE_CALL(cudaGetDevice(&dev));
|
||||
|
||||
if(deviceProp[dev].major < 1) {
|
||||
fprintf(stderr, "CUDA error: device does not support CUDA.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
} else if((deviceProp[dev].major == 1) && (deviceProp[dev].minor != 3)) {
|
||||
fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n", dev, deviceProp[dev].name, deviceProp[dev].major, deviceProp[dev].minor);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if((deviceProp[dev].major == 2) && (CUDA_ARCH < 20)) {
|
||||
fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n", deviceProp[dev].major, deviceProp[dev].minor);
|
||||
}
|
||||
|
||||
if((deviceProp[dev].major == 1) && (CUDA_ARCH >= 20)) {
|
||||
fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n", CUDA_ARCH);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
|
||||
fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);
|
||||
MYDBG(fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);)
|
||||
|
||||
MYDBG
|
||||
(
|
||||
printf("name = %s\n", deviceProp[dev].name);
|
||||
printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem);
|
||||
printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock);
|
||||
printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock);
|
||||
printf("warpSize = %i\n", deviceProp[dev].warpSize);
|
||||
printf("memPitch = %i\n", deviceProp[dev].memPitch);
|
||||
printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock);
|
||||
printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]);
|
||||
printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]);
|
||||
printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem);
|
||||
printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor);
|
||||
printf("clockRate = %i\n", deviceProp[dev].clockRate);
|
||||
printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment);
|
||||
printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap);
|
||||
printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount);
|
||||
printf("computeMode = %i\n", deviceProp[dev].computeMode);
|
||||
)
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void* CudaWrapper_AllocCudaData(unsigned nbytes)
|
||||
{
|
||||
void* dev_data;
|
||||
CUDA_SAFE_CALL(cudaMalloc((void**)&dev_data, nbytes));
|
||||
MYDBG(printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data);)
|
||||
CudaWrapper_total_gpu_mem += nbytes;
|
||||
return dev_data;
|
||||
}
|
||||
|
||||
/* Synchronous host -> device copy. The elapsed wall-clock time of the
   cudaMemcpy is accumulated into CudaWrapper_total_upload_time. */
void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
  cudaThreadSynchronize();  // make sure the timer measures only this copy

  my_times t_begin, t_end;
  my_gettime(CLOCK_REALTIME, &t_begin);
  CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice));
  my_gettime(CLOCK_REALTIME, &t_end);

  const double elapsed = t_end.tv_sec - t_begin.tv_sec +
                         1.0 * (t_end.tv_nsec - t_begin.tv_nsec) / 1000000000;
  CudaWrapper_total_upload_time += elapsed;
}
|
||||
|
||||
/* Asynchronous host -> device copy on the given stream.
   No transfer time is accumulated for async copies. */
void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
{
  // NOTE(review): debug text says "downloading ... from device" but this is an
  // upload (HostToDevice) — message looks copy-pasted from the download path.
  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
  cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice, streams[stream]);
}
|
||||
|
||||
/* Synchronous device -> host copy. The elapsed wall-clock time of the
   cudaMemcpy is accumulated into CudaWrapper_total_download_time. */
void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
  cudaThreadSynchronize();  // make sure the timer measures only this copy

  my_times t_begin, t_end;
  my_gettime(CLOCK_REALTIME, &t_begin);
  CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost));
  my_gettime(CLOCK_REALTIME, &t_end);

  const double elapsed = t_end.tv_sec - t_begin.tv_sec +
                         1.0 * (t_end.tv_nsec - t_begin.tv_nsec) / 1000000000;
  CudaWrapper_total_download_time += elapsed;
}
|
||||
|
||||
/* Asynchronous device -> host copy on the given stream.
   No transfer time is accumulated for async copies. */
void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
{
  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
  cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost, streams[stream]);
}
|
||||
|
||||
/* Free device memory previously obtained via CudaWrapper_AllocCudaData and
   subtract nbytes from the GPU-memory accounting counter.
   nbytes must match the size passed at allocation for the counter to stay
   correct (defaults to 0 per the header prototype). */
void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes)
{
  // %u: nbytes is unsigned — was %i, a -Wformat specifier mismatch
  MYDBG(printf("# CUDA: freeing memory at dev%p with %u bytes (last adress: %p)\n", dev_data, nbytes, (char*)dev_data + nbytes);)
  CUDA_SAFE_CALL(cudaFree(dev_data));
  CudaWrapper_total_gpu_mem -= nbytes;
}
|
||||
|
||||
/* Fill nbytes of device memory at dev_data with the byte value `value`. */
void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes)
{
  MYDBG(printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data);)
  CUDA_SAFE_CALL(cudaMemset(dev_data, value, nbytes));
}
|
||||
|
||||
/* Device -> device copy of nbytes from dev_source to dev_dest. */
void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes)
{
  MYDBG(printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source, dev_dest);)
  CUDA_SAFE_CALL(cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice));
}
|
||||
|
||||
/* Allocate page-locked (pinned) host memory.
   mapped        -> request cudaHostAllocMapped (device-mappable memory)
   writeCombined -> request cudaHostAllocWriteCombined (fast host writes,
                    slow host reads)
   Returns the host pointer. */
void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped, bool writeCombined)
{
  int flags = 0;

  if(mapped) flags |= cudaHostAllocMapped;

  if(writeCombined) flags |= cudaHostAllocWriteCombined;

  void* host_ptr;
  CUDA_SAFE_CALL(cudaHostAlloc((void**)&host_ptr, nbytes, flags));
  MYDBG(printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_ptr);)
  return host_ptr;
}
|
||||
|
||||
/* Release pinned host memory obtained via CudaWrapper_AllocPinnedHostData.
   A NULL pointer is silently ignored. */
void CudaWrapper_FreePinnedHostData(void* host_data)
{
  MYDBG(printf("# CUDA: freeing pinned host memory at %p \n", host_data);)

  if(host_data == NULL) return;

  CUDA_SAFE_CALL(cudaFreeHost(host_data));
}
|
||||
|
||||
/* Print the last CUDA runtime error, prefixed with a caller-supplied tag.
   Note: cudaGetLastError() also clears the sticky error state. */
void cuda_check_error(char* comment)
{
  printf("ERROR-CUDA %s %s\n", comment, cudaGetErrorString(cudaGetLastError()));
}
|
||||
|
||||
/* Report current GPU memory usage in bytes as seen by the driver.
   NOTE(review): the size_t difference is narrowed to int by the fixed
   return type — on devices with > 2 GB in use this overflows; confirm
   callers only use it for small/diagnostic values. */
int CudaWrapper_CheckMemUsage()
{
  size_t bytes_free, bytes_total;
  cudaMemGetInfo(&bytes_free, &bytes_total);
  return bytes_total - bytes_free;
}
|
||||
|
||||
/* Return the accumulated synchronous host->device copy time in seconds.
   When reset is true the counter is zeroed first (and 0.0 is returned). */
double CudaWrapper_CheckUploadTime(bool reset)
{
  if(reset)
    CudaWrapper_total_upload_time = 0.0;

  return CudaWrapper_total_upload_time;
}
|
||||
|
||||
/* Return the accumulated synchronous device->host copy time in seconds.
   When reset is true the counter is zeroed first (and 0.0 is returned). */
double CudaWrapper_CheckDownloadTime(bool reset)
{
  if(reset)
    CudaWrapper_total_download_time = 0.0;

  return CudaWrapper_total_download_time;
}
|
||||
|
||||
/* Return the accumulated CPU-side staging-buffer upload time in seconds.
   When reset is true the counter is zeroed first (and 0.0 is returned). */
double CudaWrapper_CheckCPUBufUploadTime(bool reset)
{
  if(reset)
    CudaWrapper_cpubuffer_upload_time = 0.0;

  return CudaWrapper_cpubuffer_upload_time;
}
|
||||
|
||||
/* Return the accumulated CPU-side staging-buffer download time in seconds.
   When reset is true the counter is zeroed first (and 0.0 is returned). */
double CudaWrapper_CheckCPUBufDownloadTime(bool reset)
{
  if(reset)
    CudaWrapper_cpubuffer_download_time = 0.0;

  return CudaWrapper_cpubuffer_download_time;
}
|
||||
|
||||
/* Accumulate dt seconds into the CPU staging-buffer upload timer. */
void CudaWrapper_AddCPUBufUploadTime(double dt)
{
  CudaWrapper_cpubuffer_upload_time += dt;
}
|
||||
|
||||
/* Accumulate dt seconds into the CPU staging-buffer download timer. */
void CudaWrapper_AddCPUBufDownloadTime(double dt)
{
  CudaWrapper_cpubuffer_download_time += dt;
}
|
||||
|
||||
/* Block the host until all previously issued device work has finished. */
void CudaWrapper_Sync()
{
  cudaThreadSynchronize();
}
|
||||
|
||||
/* Block the host until all work queued on the given stream has finished. */
void CudaWrapper_SyncStream(int stream)
{
  cudaStreamSynchronize(streams[stream]);
}
|
||||
|
||||
/* Grow the global stream array by n newly created streams, keeping every
   already-existing stream. Replaces the old array in place. */
void CudaWrapper_AddStreams(int n)
{
  cudaStream_t* grown = new cudaStream_t[nstreams + n];

  // carry over existing streams, then create the new ones
  for(int i = 0; i < nstreams; i++)
    grown[i] = streams[i];

  for(int i = nstreams; i < nstreams + n; i++)
    cudaStreamCreate(&grown[i]);

  if(nstreams > 0)
    delete [] streams;

  streams = grown;
  nstreams += n;
}
|
||||
|
||||
/* Expose the raw stream array (cudaStream_t*) as an opaque pointer. */
void* CudaWrapper_returnStreams()
{
  return (void*) streams;
}
|
||||
|
||||
/* Return how many streams have been created so far. */
int CudaWrapper_returnNStreams()
{
  return nstreams;
}
|
||||
|
|
@ -1,52 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef _CUDA_DATA_WRAPPER_H_
#define _CUDA_DATA_WRAPPER_H_

// Public C interface to the USER-CUDA device/host memory wrapper.

// initialization / memory management
extern "C" void CudaWrapper_Init(int argc, char** argv, int me = 0, int ppn = 2, int* devicelist = NULL);
extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes);
extern "C" void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes = 0);
extern "C" void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes);
extern "C" void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes);
// fixed param-name typo: writeCombind -> writeCombined (matches the definition)
extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped = false, bool writeCombined = false);
extern "C" void CudaWrapper_FreePinnedHostData(void* dev_data);

// diagnostics / timing
extern "C" void cuda_check_error(char* comment);
extern "C" int CudaWrapper_CheckMemUsage();
extern "C" double CudaWrapper_CheckUploadTime(bool reset = false);
extern "C" double CudaWrapper_CheckDownloadTime(bool reset = false);
extern "C" double CudaWrapper_CheckCPUBufUploadTime(bool reset = false);
extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset = false);
extern "C" void CudaWrapper_AddCPUBufUploadTime(double dt);
extern "C" void CudaWrapper_AddCPUBufDownloadTime(double dt);

// synchronization / streams
extern "C" void CudaWrapper_Sync();
extern "C" void CudaWrapper_SyncStream(int n);
extern "C" void CudaWrapper_AddStreams(int n);
extern "C" void* CudaWrapper_returnStreams();
extern "C" int CudaWrapper_returnNStreams();

#endif // _CUDA_DATA_WRAPPER_H_
|
|
@ -1,24 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// empty file to obey common make rule
|
|
@ -1,202 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX domain
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "domain_cu.h"
|
||||
#include "domain_kernel.cu"
|
||||
|
||||
/* Ensure the shared scratch buffer holds at least `size` bytes, reallocating
   on the device if necessary, then publish its address to the kernel's
   `buffer` symbol. */
void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata, int size)
{
  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_Domain Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  // signal other modules that the buffer moved
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), &sdata->buffer, sizeof(int*));
}
|
||||
|
||||
/* Push per-atom array pointers and atom counts to the kernel's constant
   symbols; must be called whenever nmax changes (arrays reallocated). */
void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax),   &sdata->atom.nmax,   sizeof(int));
  cudaMemcpyToSymbol(MY_AP(x),      &sdata->atom.x.dev_data,     sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(v),      &sdata->atom.v.dev_data,     sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(mask),   &sdata->atom.mask.dev_data,  sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(tag),    &sdata->atom.tag.dev_data,   sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(image),  &sdata->atom.image.dev_data, sizeof(int*));
}
|
||||
|
||||
/* Push box geometry (orthogonal + triclinic representations) and diagnostic
   pointers to the kernel's constant symbols. */
void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata)
{
  // orthogonal box bounds and extents
  cudaMemcpyToSymbol(MY_AP(boxlo), sdata->domain.boxlo, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(boxhi), sdata->domain.boxhi, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(sublo), sdata->domain.sublo, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(subhi), sdata->domain.subhi, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(prd),   sdata->domain.prd,   3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, 3 * sizeof(int));
  cudaMemcpyToSymbol(MY_AP(triclinic), &sdata->domain.triclinic, sizeof(int));

  // triclinic (lamda-space) representation
  cudaMemcpyToSymbol(MY_AP(boxlo_lamda), sdata->domain.boxlo_lamda, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(boxhi_lamda), sdata->domain.boxhi_lamda, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(prd_lamda),   sdata->domain.prd_lamda,   3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(h),      sdata->domain.h,      6 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(h_inv),  sdata->domain.h_inv,  6 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(h_rate), sdata->domain.h_rate, 6 * sizeof(V_CFLOAT));

  // device-side flag / debug buffers
  cudaMemcpyToSymbol(MY_AP(flag),      &sdata->flag,      sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(debugdata), &sdata->debugdata, sizeof(int*));
}
|
||||
|
||||
/* One-time setup: publish atom-array pointers and box geometry to the GPU. */
void Cuda_Domain_Init(cuda_shared_data* sdata)
{
  Cuda_Domain_UpdateNmax(sdata);
  Cuda_Domain_UpdateDomain(sdata);
}
|
||||
|
||||
/* Reduce per-block partial extents for one dimension: buf[0..n) holds the
   per-block minima, buf[n..2n) the per-block maxima. */
static void Cuda_Domain_ReduceMinMax(X_CFLOAT* buf, int n, double* bmin, double* bmax)
{
  X_CFLOAT lo = 1.0 * BIG;
  X_CFLOAT hi = -1.0 * BIG;

  for(int i = 0; i < n; i++) {
    if(buf[i] < lo) lo = buf[i];

    if(buf[i + n] > hi) hi = buf[i + n];
  }

  *bmin = lo;
  *bmax = hi;
}

/* Apply periodic boundary conditions to all local atoms on the GPU.
   deform_remap/deform_groupbit: velocity remapping for fix deform atoms.
   extent: when non-NULL, also computes the bounding box of all local atoms
   and stores it as {xlo,xhi,ylo,yhi,zlo,zhi} (defaults to NULL per header). */
void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent)
{
  Cuda_Domain_UpdateNmax(sdata);
  Cuda_Domain_UpdateDomain(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  int box_change = 0;

  if(extent) box_change = 1;

  // extent computation needs 6 X_CFLOATs of shared memory per thread
  int sharedmem = 0;

  if(box_change) sharedmem = 6 * sizeof(X_CFLOAT);

  int3 layout = getgrid(sdata->atom.nlocal, sharedmem);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  sharedmem *= threads.x;

  // per-block results buffer: 6 values (min/max per dimension) per block
  if((box_change) && (sdata->buffer_new or (6 * sizeof(X_CFLOAT)*grid.x * grid.y > sdata->buffersize)))
    Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_CFLOAT));

  Domain_PBC_Kernel <<< grid, threads, sharedmem>>>(deform_remap, deform_groupbit, box_change);
  cudaThreadSynchronize();

  CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed");

  if(box_change) {
    const int nblocks = layout.x * layout.y;
    // NOTE(review): host-stack VLA sized by the grid; large grids could
    // overflow the stack — confirm grid sizes stay modest
    X_CFLOAT buf2[6 * layout.x * layout.y];
    X_CFLOAT* buf = buf2;
    int flag;
    cudaMemcpy(buf, sdata->buffer, 6 * nblocks * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
    cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);

    // layout per dimension: nblocks minima followed by nblocks maxima;
    // the three identical reduce loops are now one helper
    Cuda_Domain_ReduceMinMax(buf, nblocks, &extent[0], &extent[1]);
    buf += 2 * nblocks;
    Cuda_Domain_ReduceMinMax(buf, nblocks, &extent[2], &extent[3]);
    buf += 2 * nblocks;
    Cuda_Domain_ReduceMinMax(buf, nblocks, &extent[4], &extent[5]);
  }
}
|
||||
|
||||
/* Convert the first n atom coordinates from lamda (fractional) space to
   cartesian box coordinates on the GPU. */
void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n)
{
  Cuda_Domain_UpdateNmax(sdata);
  Cuda_Domain_UpdateDomain(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  int3 layout = getgrid(n);
  dim3 blockdim(layout.z, 1, 1);
  dim3 griddim(layout.x, layout.y, 1);

  Domain_lamda2x_Kernel <<< griddim, blockdim, 0>>>(n);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed");
}
|
||||
|
||||
/* Convert the first n atom coordinates from cartesian box coordinates to
   lamda (fractional) space on the GPU. */
void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n)
{
  Cuda_Domain_UpdateNmax(sdata);
  Cuda_Domain_UpdateDomain(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  int3 layout = getgrid(n);
  dim3 blockdim(layout.z, 1, 1);
  dim3 griddim(layout.x, layout.y, 1);

  Domain_x2lamda_Kernel <<< griddim, blockdim, 0>>>(n);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed");
}
|
|
@ -1,29 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"

// C interface to the GPU domain operations (PBC wrap, lamda<->x transforms).
extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata);
extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent = NULL);
extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n);
extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n);
|
|
@ -1,293 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ X_CFLOAT sharedmem[];

#define BIG 1e10
/* Wrap each local atom back into the periodic box (adjusting image flags and,
   for fix deform atoms, velocities). When box_change is set, additionally
   reduces per-block min/max coordinates into _buffer: six slots of
   gridDim.x*gridDim.y values each, ordered minx,maxx,miny,maxy,minz,maxz. */
__global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box_change)
{
  int idim, otherdims;
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  X_CFLOAT lo[3];
  X_CFLOAT hi[3];
  X_CFLOAT* period;

  if(_triclinic == 0) {
    lo[0] = _boxlo[0];
    lo[1] = _boxlo[1];
    lo[2] = _boxlo[2];

    hi[0] = _boxhi[0];
    hi[1] = _boxhi[1];
    hi[2] = _boxhi[2];
    period = _prd;
  } else {
    // triclinic boxes wrap in lamda (fractional) space
    lo[0] = _boxlo_lamda[0];
    lo[1] = _boxlo_lamda[1];
    lo[2] = _boxlo_lamda[2];

    hi[0] = _boxhi_lamda[0];
    hi[1] = _boxhi_lamda[1];
    hi[2] = _boxhi_lamda[2];
    period = _prd_lamda;
  }


  X_CFLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]);
  X_CFLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]);
  X_CFLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]);

  // pre-seed this block's six extent slots with the box center so blocks
  // whose threads all fall outside _nlocal still report harmless values
  X_CFLOAT* buf = (X_CFLOAT*) _buffer;
  buf += blockIdx.x * gridDim.y + blockIdx.y;
  buf[0] = tmpx;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpx;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpy;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpy;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpz;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpz;

  if(i < _nlocal) {

    // ---- x dimension: wrap coordinate, remap velocity, update image bits 0-9
    if(_periodicity[0]) {
      if(_x[i] < lo[0]) {
        _x[i] += period[0];

        if(deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0];

        idim = _image[i] & 1023;
        otherdims = _image[i] ^ idim;
        idim--;
        idim &= 1023;
        _image[i] = otherdims | idim;
      }

      if(_x[i] >= hi[0]) {
        _x[i] -= period[0];
        _x[i] = MAX(_x[i], lo[0]);  // guard against rounding past the lower bound

        if(deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0];

        idim = _image[i] & 1023;
        otherdims = _image[i] ^ idim;
        idim++;
        idim &= 1023;
        _image[i] = otherdims | idim;
      }
    }

    // ---- y dimension: image bits 10-19; tilt couples vx via _h_rate[5]
    if(_periodicity[1]) {
      if(_x[i + _nmax] < lo[1]) {
        _x[i + _nmax] += period[1];

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] += _h_rate[5];
          _v[i + _nmax] += _h_rate[1];
        }

        idim = (_image[i] >> 10) & 1023;
        otherdims = _image[i] ^ (idim << 10);
        idim--;
        idim &= 1023;
        _image[i] = otherdims | (idim << 10);
      }

      if(_x[i + _nmax] >= hi[1]) {
        _x[i + _nmax] -= period[1];
        _x[i + _nmax] = MAX(_x[i + _nmax], lo[1]);

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] -= _h_rate[5];
          _v[i + _nmax] -= _h_rate[1];
        }

        idim = (_image[i] >> 10) & 1023;
        otherdims = _image[i] ^ (idim << 10);
        idim++;
        idim &= 1023;
        _image[i] = otherdims | (idim << 10);
      }
    }

    // ---- z dimension: image bits 20+; tilt couples vx,vy via _h_rate[4],[3]
    if(_periodicity[2]) {
      if(_x[i + 2 * _nmax] < lo[2]) {
        _x[i + 2 * _nmax] += period[2];

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] += _h_rate[4];
          _v[i + _nmax] += _h_rate[3];
          _v[i + 2 * _nmax] += _h_rate[2];
        }

        idim = _image[i] >> 20;
        otherdims = _image[i] ^ (idim << 20);
        idim--;
        idim &= 1023;
        _image[i] = otherdims | (idim << 20);
      }

      if(_x[i + 2 * _nmax] >= hi[2]) {
        _x[i + 2 * _nmax] -= period[2];
        _x[i + 2 * _nmax] = MAX(_x[i + 2 * _nmax], lo[2]);

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] -= _h_rate[4];
          _v[i + _nmax] -= _h_rate[3];
          _v[i + 2 * _nmax] -= _h_rate[2];
        }

        idim = _image[i] >> 20;
        otherdims = _image[i] ^ (idim << 20);
        idim++;
        idim &= 1023;
        _image[i] = otherdims | (idim << 20);
      }
    }

    if(box_change) {
      // cache the wrapped coordinates for the extent reduction below
      tmpx = _x[i];
      tmpy = _x[i + _nmax];
      tmpz = _x[i + 2 * _nmax];
    }
  }

  __syncthreads();

  if(box_change) {
    X_CFLOAT minx = BIG;
    X_CFLOAT maxx = -BIG;
    X_CFLOAT miny = BIG;
    X_CFLOAT maxy = -BIG;
    X_CFLOAT minz = BIG;
    X_CFLOAT maxz = -BIG;

    // for non-periodic dimensions reduce the actual coordinates;
    // periodic dimensions are bounded by the box itself
    if(not _periodicity[0]) {
      sharedmem[threadIdx.x] = tmpx;
      minOfBlock(sharedmem);
      minx = sharedmem[0];
      __syncthreads();
      sharedmem[threadIdx.x] = tmpx;
      maxOfBlock(sharedmem);
      maxx = sharedmem[0];
      __syncthreads();
    } else {
      minx = lo[0];
      maxx = hi[0];
    }

    if(not _periodicity[1]) {
      sharedmem[threadIdx.x] = tmpy;
      minOfBlock(sharedmem);
      miny = sharedmem[0];
      __syncthreads();
      sharedmem[threadIdx.x] = tmpy;
      maxOfBlock(sharedmem);
      maxy = sharedmem[0];
      __syncthreads();
    } else {
      // bug fix: original assigned minx/maxx here, leaving miny/maxy at
      // +/-BIG for periodic-y boxes
      miny = lo[1];
      maxy = hi[1];
    }

    if(not _periodicity[2]) {
      sharedmem[threadIdx.x] = tmpz;
      minOfBlock(sharedmem);
      minz = sharedmem[0];
      __syncthreads();
      sharedmem[threadIdx.x] = tmpz;
      maxOfBlock(sharedmem);
      maxz = sharedmem[0];
      __syncthreads();
    } else {
      minz = lo[2];
      maxz = hi[2];
    }

    // thread 0 publishes this block's six extent values
    if(threadIdx.x == 0) {
      buf = (X_CFLOAT*) _buffer;
      buf += blockIdx.x * gridDim.y + blockIdx.y;
      buf[0] = minx;
      buf += gridDim.x * gridDim.y;
      buf[0] = maxx;
      buf += gridDim.x * gridDim.y;
      buf[0] = miny;
      buf += gridDim.x * gridDim.y;
      buf[0] = maxy;
      buf += gridDim.x * gridDim.y;
      buf[0] = minz;
      buf += gridDim.x * gridDim.y;
      buf[0] = maxz;
    }
  }
}
|
||||
|
||||
/* Second-stage reduction of per-block extents: each block reduces one of the
   six slots in _buffer (even blocks -> minimum, odd blocks -> maximum) and
   writes the result to extent[blockIdx.x]. */
__global__ void Domain_reduceBoxExtent(double* extent, int n)
{
  X_CFLOAT* blockbuf = (X_CFLOAT*) _buffer;
  blockbuf += blockIdx.x * n;
  copyGlobToShared(blockbuf, sharedmem, n);

  if(blockIdx.x % 2 == 0)
    minOfData(sharedmem, n);
  else
    maxOfData(sharedmem, n);

  extent[blockIdx.x] = sharedmem[0];
}
|
||||
|
||||
/* Convert atom i's coordinates from lamda (fractional) space to cartesian
   coordinates: x = h * lamda + boxlo, with h the upper-triangular box matrix
   stored as {h0..h5}. Coordinates are laid out as struct-of-arrays with
   stride _nmax. */
__global__ void Domain_lamda2x_Kernel(int n)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= n) return;

  const X_CFLOAT y = _x[i + _nmax];
  const X_CFLOAT z = _x[i + 2 * _nmax];

  _x[i]             = _h[0] * _x[i] + _h[5] * y + _h[4] * z + _boxlo[0];
  _x[i + _nmax]     = _h[1] * y + _h[3] * z + _boxlo[1];
  _x[i + 2 * _nmax] = _h[2] * z + _boxlo[2];
}
|
||||
|
||||
/* Convert atom i's coordinates from cartesian to lamda (fractional) space:
   lamda = h_inv * (x - boxlo). Coordinates are laid out as struct-of-arrays
   with stride _nmax. */
__global__ void Domain_x2lamda_Kernel(int n)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= n) return;

  const X_CFLOAT dx = _x[i] - _boxlo[0];
  const X_CFLOAT dy = _x[i + _nmax] - _boxlo[1];
  const X_CFLOAT dz = _x[i + 2 * _nmax] - _boxlo[2];

  _x[i]             = _h_inv[0] * dx + _h_inv[5] * dy + _h_inv[4] * dz;
  _x[i + _nmax]     = _h_inv[1] * dy + _h_inv[3] * dz;
  _x[i + 2 * _nmax] = _h_inv[2] * dz;
}
|
|
@ -1,103 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
//#define CUDA_PRECISION 1
|
||||
#include "cuda_precision.h"
|
||||
#include "cuda_common.h"
|
||||
struct FFT_DATA {
|
||||
FFT_CFLOAT re;
|
||||
FFT_CFLOAT im;
|
||||
};
|
||||
|
||||
#include "fft3d_cuda_cu.h"
|
||||
#include "fft3d_cuda_kernel.cu"
|
||||
#include <stdio.h>
|
||||
|
||||
/* Expand a real-valued nslow x nmid x nfast grid into interleaved complex
   FFT input (imaginary parts zeroed). One thread per fast-axis element. */
void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast, 1, 1);

  cudaThreadSynchronize();
  initfftdata_kernel <<< grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
|
||||
|
||||
|
||||
/* Transpose a complex nslow x nmid x nfast grid on the device. Threads cover
   2*nfast (real+imag interleaved) elements per (slow,mid) pair. */
void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast * 2, 1, 1);

  permute_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
|
||||
|
||||
/* Same launch as permute().
   NOTE(review): despite the name, this dispatches plain permute_kernel and
   applies no scaling — confirm whether a scaled kernel was intended. */
void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast * 2, 1, 1);

  permute_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
  cudaThreadSynchronize();
}
|
||||
/* Transpose only the sub-brick [ilo..ihi] x [jlo..jhi] x [klo..khi] of a
   complex nslow x nmid x nfast grid on the device. */
void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
{
  dim3 grid(ihi - ilo + 1, jhi - jlo + 1, 1);
  dim3 threads((khi - klo + 1) * 2, 1, 1);  // x2: real+imag interleaved

  permute_part_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
  cudaThreadSynchronize();
}
|
||||
|
||||
/* Block the host until all queued FFT device work has completed. */
void FFTsyncthreads()
{
  cudaThreadSynchronize();
}
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow);
|
||||
extern "C" void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
|
||||
extern "C" void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
|
||||
extern "C" void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo);
|
||||
extern "C" void FFTsyncthreads();
|
|
@ -1,46 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* Expand a real-valued grid into an interleaved complex array: each input
   point becomes (re = in[idx], im = 0).  One thread per grid point; the flat
   index is rebuilt from the 2-D grid of 1-D blocks. */
__global__ void initfftdata_kernel(double* in, FFT_CFLOAT* out)
{
  /* real part <- input value */
  out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x];
  /* imaginary part <- 0 */
  out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x) + 1] = 0;
}
|
||||
|
||||
|
||||
/* Permute the complex FFT grid: reads linearly by (block, thread) and writes
   with the dimension order rotated -- threadIdx.x/2 (the complex element
   index within the pencil) becomes the slowest output dimension.
   threadIdx.x - 2*(threadIdx.x/2) is 0 for the real part, 1 for the
   imaginary part of each complex value. */
__global__ void permute_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
{
  out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x];
}
|
||||
|
||||
/* Same permutation as permute_kernel, but each value is also multiplied by
   gridDim.x * gridDim.y * blockDim.x * 0.5.  With the launch configuration
   used by permute_scale (blockDim.x == 2*nfast) this equals the total number
   of grid points nslow*nmid*nfast -- the usual FFT normalization factor. */
__global__ void permute_scale_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
{
  out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x] * gridDim.x * gridDim.y * blockDim.x * 0.5;
}
|
||||
|
||||
/* Permute a sub-brick of the complex grid: blockIdx.x/.y run over the
   (i,j) extents of the sub-brick, threadIdx.x over 2*(khi-klo+1) scalar
   components; the input index offsets by (ilo, jlo, klo) into the full grid.
   NOTE(review): the output index subtracts jlo from blockIdx.y even though
   blockIdx.y already starts at 0 for the sub-brick -- confirm intended. */
__global__ void permute_part_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
{
  {
    out[2 * ((threadIdx.x / 2) * (ihi - ilo + 1) * (jhi - jlo + 1) + (blockIdx.x) * (jhi - jlo + 1) + blockIdx.y - jlo) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[2 * (blockIdx.x + ilo) * nmid * nslow + 2 * (blockIdx.y + jlo) * nmid + threadIdx.x + 2 * klo];
  }
}
|
|
@ -1,93 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_add_force_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "fix_addforce_cuda_cu.h"
|
||||
#include "fix_addforce_cuda_kernel.cu"
|
||||
|
||||
/* Ensure the shared device scratch buffer can hold one 4-element partial-sum
   row (energy + fx,fy,fz) per thread block of the PostForce kernel, growing
   it if needed, then publish its address to this module's constant symbol. */
void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  /* mirror of the launch layout used by PostForce: one partial per block */
  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);   /* computed for symmetry; unused here */
  dim3 grid(layout.x, layout.y, 1);
  int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);

  /* grow-only resize; buffer_new signals other buffer users it moved */
  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_FixAddForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
|
||||
|
||||
/* Refresh this module's device-constant copies of the per-atom array
   pointers and counts; must be re-run whenever LAMMPS reallocates its
   atom arrays (nmax changed). */
void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask),   &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal,        sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax),   &sdata->atom.nmax,          sizeof(int));
  cudaMemcpyToSymbol(MY_AP(x),      &sdata->atom.x.dev_data,    sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f),      &sdata->atom.f.dev_data,    sizeof(F_CFLOAT*));
}
|
||||
|
||||
/* One-time setup for this fix: seed the device constants from host data. */
void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixAddForceCuda_UpdateNmax(sdata);
}
|
||||
|
||||
/* GPU implementation of fix addforce post_force: adds (axvalue, ayvalue,
   azvalue) to the force of every atom in the group and accumulates into
   aforiginal[0..3] an energy-like term (-F.x) plus the pre-add force
   components.  Two launches: a per-atom kernel reducing 4 partials per
   block into _buffer, then a 4-row reduction kernel finishing the sums. */
void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal)
{
  /* re-push device constants if host-side atom arrays were resized */
  if(sdata->atom.update_nmax)
    Cuda_FixAddForceCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixAddForceCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  /* shared memory: 4 accumulator rows (energy + 3 force comps) per thread */
  Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit, axvalue, ayvalue, azvalue);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed");

  /* pass 2: one block per accumulator row sums the per-block partials.
     NOTE(review): n = old grid.x only, yet the first pass wrote
     grid.x*grid.y partials -- appears to assume layout.y == 1; confirm. */
  int oldgrid = grid.x;
  grid.x = 4;
  threads.x = 512;
  reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed");
}
|
|
@ -1,27 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal);
|
|
@ -1,90 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ F_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
/* Per-atom pass of fix addforce: adds (xvalue,yvalue,zvalue) to the force of
   each atom in the group, and block-reduces 4 quantities into _buffer --
   row 0: -(xvalue*x + yvalue*y + zvalue*z) energy-like term,
   rows 1-3: the force components BEFORE the addition. */
__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  /* zero all 4 shared accumulator slots so inactive threads contribute 0 */
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit)
      //if (iregion >= 0 &&
      //match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported
    {
      sharedmem[threadIdx.x] = -xvalue * _x[i] - yvalue * _x[i + 1 * _nmax] - zvalue * _x[i + 2 * _nmax];
      sharedmem[threadIdx.x + blockDim.x] = _f[i];
      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 1 * _nmax];
      sharedmem[threadIdx.x + 3 * blockDim.x] = _f[i + 2 * _nmax];
      _f[i] += xvalue;
      _f[i + 1 * _nmax] += yvalue;
      _f[i + 2 * _nmax] += zvalue;
    }

  /* block-wide sums, one accumulator row at a time */
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  F_CFLOAT* buffer = (F_CFLOAT*) _buffer;

  /* thread 0 writes this block's 4 partial sums, one per row of _buffer */
  if(threadIdx.x == 0) {
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
  }
}
|
||||
|
||||
|
||||
/* Final reduction pass: sums n per-block partials for one accumulator row.
   blockIdx.x selects the row (energy, fx, fy, fz); that row's partials live
   at _buffer[row*n .. row*n + n-1].  Strides through the row blockDim.x
   elements at a time, block-reducing each chunk in shared memory; thread 0
   keeps the running total and writes the final value. */
__global__ void reduce_foriginal(int n, F_CFLOAT* foriginal)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  F_CFLOAT myforig = 0.0;
  F_CFLOAT* buf = (F_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];   /* start of this row's partial sums */

  while(i < n) {
    sharedmem[threadIdx.x] = 0;

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);   /* block-wide sum into sharedmem[0]; presumably syncs internally -- confirm */
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    foriginal[blockIdx.x] = myforig;
}
|
|
@ -1,107 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_ave_force_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "fix_aveforce_cuda_cu.h"
|
||||
#include "fix_aveforce_cuda_kernel.cu"
|
||||
|
||||
/* Ensure the shared device scratch buffer can hold one 4-element partial-sum
   row (fx,fy,fz + group-atom count) per thread block of the FOrg kernel,
   growing it if needed, then publish its address to the constant symbol. */
void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  /* mirror of the launch layout used by PostForce_FOrg */
  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);   /* computed for symmetry; unused here */
  dim3 grid(layout.x, layout.y, 1);
  int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);

  /* grow-only resize; buffer_new signals other buffer users it moved */
  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
|
||||
|
||||
/* Refresh this module's device-constant copies of the per-atom array
   pointers and counts after LAMMPS reallocated its atom arrays. */
void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask),   &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal,        sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax),   &sdata->atom.nmax,          sizeof(int));
  cudaMemcpyToSymbol(MY_AP(x),      &sdata->atom.x.dev_data,    sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f),      &sdata->atom.f.dev_data,    sizeof(F_CFLOAT*));
}
|
||||
|
||||
/* One-time setup for this fix: seed the device constants from host data. */
void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixAveForceCuda_UpdateNmax(sdata);
}
|
||||
|
||||
/* First half of fix aveforce on the GPU: sum the current force components
   of all group atoms plus the group-atom count into aforiginal[0..3] via a
   two-pass reduction.  Forces themselves are not modified here; the caller
   later averages and applies them via PostForce_Set. */
void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal)
{
  /* re-push device constants if host-side atom arrays were resized */
  if(sdata->atom.update_nmax)
    Cuda_FixAveForceCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixAveForceCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  /* pass 1: per-block partials of fx, fy, fz, count into _buffer */
  Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed");

  /* pass 2: one block per accumulator row finishes the sums.
     NOTE(review): n = old grid.x only -- assumes layout.y == 1; confirm. */
  int oldgrid = grid.x;
  grid.x = 4;
  threads.x = 512;
  Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed");
}
|
||||
|
||||
/* Second half of fix aveforce: overwrite the selected force components of
   every group atom with the averaged values (per-component flags choose
   which of x/y/z are set).  Simple per-atom kernel, no reduction buffer. */
void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue)
{
  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  Cuda_FixAveForceCuda_PostForce_Set_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, axvalue, ayvalue, azvalue);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed");
}
|
|
@ -1,28 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal);
|
||||
extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue);
|
|
@ -1,96 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ F_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
/* Per-atom pass of fix aveforce: block-reduces 4 quantities into _buffer --
   rows 0-2: current force components of group atoms, row 3: number of group
   atoms in the block (each contributes 1).  Forces are not modified. */
__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  /* zero all 4 accumulator slots so non-group / out-of-range threads add 0 */
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      sharedmem[threadIdx.x] = _f[i];
      sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
      sharedmem[threadIdx.x + 3 * blockDim.x] = 1;   /* atom counter */
    }

  /* block-wide sums, one accumulator row at a time */
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  F_CFLOAT* buffer = (F_CFLOAT*) _buffer;

  /* thread 0 writes this block's 4 partial sums, one per row of _buffer */
  if(threadIdx.x == 0) {
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
  }
}
|
||||
|
||||
|
||||
/* Final reduction pass: sums n per-block partials for one accumulator row.
   blockIdx.x selects the row (fx, fy, fz, count); the row's partials live at
   _buffer[row*n .. row*n + n-1].  Strides through the row blockDim.x at a
   time, block-reducing each chunk; thread 0 accumulates and writes out. */
__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_CFLOAT* foriginal)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  F_CFLOAT myforig = 0.0;
  F_CFLOAT* buf = (F_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];   /* start of this row's partial sums */

  while(i < n) {
    sharedmem[threadIdx.x] = 0;

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);   /* block-wide sum into sharedmem[0]; presumably syncs internally -- confirm */
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    foriginal[blockIdx.x] = myforig;
}
|
||||
|
||||
/* Overwrite the flagged force components of each group atom with the given
   averaged values.  One thread per local atom; no shared memory or sync is
   used, so early returns are safe. */
__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nlocal) return;            /* past the last local atom */
  if(!(_mask[i] & groupbit)) return;  /* atom not in the fix group */

  if(xflag) _f[i] = xvalue;
  if(yflag) _f[i + 1 * _nmax] = yvalue;
  if(zflag) _f[i + 2 * _nmax] = zvalue;
}
|
|
@ -1,55 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_enforce2d_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
#include "fix_enforce2d_cuda_cu.h"
|
||||
#include "fix_enforce2d_cuda_kernel.cu"
|
||||
|
||||
/* Push the per-atom array pointers and counts into this module's device
   constants; re-run whenever LAMMPS reallocates its atom arrays. */
void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask),   &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal,        sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax),   &sdata->atom.nmax,          sizeof(int));
  cudaMemcpyToSymbol(MY_AP(v),      &sdata->atom.v.dev_data,    sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f),      &sdata->atom.f.dev_data,    sizeof(F_CFLOAT*));
}
|
||||
|
||||
/* GPU implementation of fix enforce2d post_force: zeroes the z components
   of velocity and force for every atom in the group, keeping a 2d
   simulation in-plane. */
void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit)
{
  /* re-push device constants if host-side atom arrays were resized */
  if(sdata->atom.update_nmax)
    Cuda_FixEnforce2dCuda_Init(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixEnforce2dCuda_PostForce_Kernel <<< grid, threads>>> (groupbit);
  cudaThreadSynchronize();
  /* BUGFIX: error message previously had a duplicated "Cuda_" prefix
     ("Cuda_Cuda_FixEnforce2dCuda_PostForce"). */
  CUT_CHECK_ERROR("Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed");
}
|
|
@ -1,27 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit);
|
|
@ -1,34 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
/* Zero the out-of-plane (z) velocity and force of one atom per thread.
   No shared memory or synchronization, so early returns are safe. */
__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nlocal) return;            /* past the last local atom */
  if(!(_mask[i] & groupbit)) return;  /* atom not in the fix group */

  _v[i + 2 * _nmax] = V_F(0.0);
  _f[i + 2 * _nmax] = F_F(0.0);
}
|
|
@ -1,98 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_freeze_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "fix_freeze_cuda_cu.h"
|
||||
#include "fix_freeze_cuda_kernel.cu"
|
||||
|
||||
/* Ensure the shared device scratch buffer can hold one 3-element partial-sum
   row (fx,fy,fz) per thread block of the PostForce kernel, growing it if
   needed, then publish its address to this module's constant symbol. */
void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  /* mirror of the launch layout used by PostForce: one partial per block */
  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);   /* computed for symmetry; unused here */
  dim3 grid(layout.x, layout.y, 1);
  int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);

  /* grow-only resize; buffer_new signals other buffer users it moved */
  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)

  }

  cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
|
||||
|
||||
/* Refresh this module's device-constant copies of the per-atom array
   pointers and counts (including torque, which this fix zeroes);
   re-run whenever LAMMPS reallocates its atom arrays. */
void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask),   &sdata->atom.mask.dev_data,   sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal,          sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax),   &sdata->atom.nmax,            sizeof(int));
  cudaMemcpyToSymbol(MY_AP(x),      &sdata->atom.x.dev_data,      sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f),      &sdata->atom.f.dev_data,      sizeof(F_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(torque), &sdata->atom.torque.dev_data, sizeof(F_CFLOAT*));
}
|
||||
|
||||
|
||||
/* One-time setup for this fix: seed the device constants from host data. */
void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixFreezeCuda_UpdateNmax(sdata);
}
|
||||
|
||||
|
||||
/* GPU implementation of fix freeze post_force: records the summed force on
   the group into foriginal[0..2], then zeroes force and torque of every
   group atom.  Two launches: a per-atom kernel reducing 3 partials per
   block into _buffer, then a 3-row reduction kernel finishing the sums. */
void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal)
{
  /* re-push device constants if host-side atom arrays were resized */
  if(sdata->atom.update_nmax)
    Cuda_FixFreezeCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixFreezeCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  /* shared memory: 3 accumulator rows (fx, fy, fz) per thread */
  Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit);
  cudaThreadSynchronize();
  /* BUGFIX: both error messages previously said "fix add_force" -- a
     copy-paste from fix_addforce_cuda.cu; corrected to name this fix. */
  CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force compute Kernel execution failed");

  /* pass 2: one block per accumulator row sums the per-block partials */
  int oldgrid = grid.x;
  grid.x = 3;
  threads.x = 512;
  Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force reduce Kernel execution failed");
}
|
|
@ -1,27 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal);
|
|
@ -1,87 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ F_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
/* Zero the per-atom force and torque of all atoms in the fix group while
   accumulating the original force into per-block partial sums in _buffer.
   One thread per atom; dynamic shared memory holds 3*blockDim.x F_CFLOATs
   (x, y, z slots).  A second kernel (Cuda_FixFreezeCuda_Reduce_FOriginal)
   reduces the per-block partials. */
__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
{
  // global atom index for a (gridDim.x * gridDim.y) x blockDim.x launch
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // clear this thread's three shared-memory accumulation slots
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      // record the force about to be zeroed; per-atom arrays are laid out
      // with stride _nmax per component
      sharedmem[threadIdx.x] = _f[i];
      sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];

      // freeze: zero both force and torque on this atom
      _f[i] = F_F(0.0);
      _f[i + 1 * _nmax] = F_F(0.0);
      _f[i + 2 * _nmax] = F_F(0.0);
      _torque[i] = F_F(0.0);
      _torque[i + 1 * _nmax] = F_F(0.0);
      _torque[i + 2 * _nmax] = F_F(0.0);
    }

  // block-wide reduction of each force component
  // (NOTE(review): assumes reduceBlock performs its own synchronization — verify)
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  F_CFLOAT* buffer = (F_CFLOAT*)_buffer;

  if(threadIdx.x == 0) {
    // one partial sum per block per component, packed component-major
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
  }
}
|
||||
|
||||
|
||||
/* Second-stage reduction: sum the n per-block partial sums left in _buffer
   by Cuda_FixFreezeCuda_PostForce_Kernel.  Launched with one block per
   force component (blockIdx.x selects the component's slice); the total is
   written to foriginal[blockIdx.x]. */
__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  F_CFLOAT myforig = 0.0;
  F_CFLOAT* buf = (F_CFLOAT*)_buffer;
  buf = &buf[blockIdx.x * n];  // this block's component slice

  // strided sweep over the n partials, blockDim.x entries per pass
  while(i < n) {
    sharedmem[threadIdx.x] = 0;

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);
    i += blockDim.x;

    // only thread 0 carries the running total across passes
    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    foriginal[blockIdx.x] = myforig;
}
|
||||
|
|
@ -1,92 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_gravity_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "fix_gravity_cuda_cu.h"
|
||||
#include "fix_gravity_cuda_kernel.cu"
|
||||
|
||||
/* Ensure the shared device scratch buffer holds one 3-component F_CFLOAT
   entry per launched thread, reallocating if it is too small, then upload
   the buffer pointer to this module's constant symbol. */
void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  // mirror the PostForce launch configuration to size the buffer
  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);

  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  // signal other modules that the pointer changed
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
|
||||
|
||||
/* Push the current per-atom array pointers and sizes to this module's
   device constant symbols.  Must be re-run whenever nmax changes and the
   device arrays have been reallocated. */
void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata)
{
  // scalar counts
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  // per-atom device arrays
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type), &sdata->atom.type.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(x), &sdata->atom.x.dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_CFLOAT*));
  // mass lookup: per-atom rmass or per-type mass, selected by rmass_flag
  cudaMemcpyToSymbol(MY_AP(rmass_flag), &sdata->atom.rmass_flag, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(rmass), &sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(mass), &sdata->atom.mass.dev_data, sizeof(V_CFLOAT*));
}
|
||||
|
||||
/* One-time setup for the gravity fix: seed all device constant symbols. */
void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixGravityCuda_UpdateNmax(sdata);
}
|
||||
|
||||
|
||||
/* Host driver for the gravity post-force step: refresh device constants if
   atom counts/arrays changed, then launch one thread per local atom to add
   the gravitational force m*(xacc, yacc, zacc). */
void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
{
  if(sdata->atom.update_nmax)
    Cuda_FixGravityCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixGravityCuda_UpdateBuffer(sdata);

  // one thread per local atom
  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  Cuda_FixGravityCuda_PostForce_Kernel <<< grid, threads>>> (groupbit, xacc, yacc, zacc);
  cudaThreadSynchronize();  // legacy API (cudaDeviceSynchronize in newer CUDA)
  CUT_CHECK_ERROR("Cuda_Cuda_FixGravityCuda_PostForce: fix add_force post_force compute Kernel execution failed");
}
|
|
@ -1,27 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc);
|
|
@ -1,36 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* Add the constant-acceleration gravity force m*(xacc, yacc, zacc) to
   every local atom in the fix group.  One thread per atom; component c of
   atom i lives at _f[i + c * _nmax]. */
__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // guard clauses: out-of-range thread or atom not in the group
  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  // per-atom mass if available, otherwise per-type mass
  const F_CFLOAT m = _rmass_flag ? _rmass[idx] : _mass[_type[idx]];
  _f[idx]             += m * xacc;
  _f[idx + _nmax]     += m * yacc;
  _f[idx + 2 * _nmax] += m * zacc;
}
|
||||
|
|
@ -1,255 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_nh_cuda
|
||||
#define IncludeCommonNeigh
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
#include "fix_nh_cuda_cu.h"
|
||||
#include "fix_nh_cuda_kernel.cu"
|
||||
|
||||
/* Upload current per-atom array pointers, sizes, and neighbor-rebuild
   bookkeeping to this module's device constant symbols.  Call after any
   reallocation that changes nmax. */
void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
{
  // scalar counts
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  // per-atom device arrays
  cudaMemcpyToSymbol(MY_AP(x), &sdata->atom.x.dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(v), &sdata->atom.v.dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(tag), &sdata->atom.tag.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type), &sdata->atom.type.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(rmass), &sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(mass), &sdata->atom.mass.dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(debugdata), &sdata->debugdata, sizeof(int*));
  // neighbor-rebuild trigger data (might be moved to a neighbor record in sdata)
  cudaMemcpyToSymbol(MY_AP(xhold), &sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(maxhold), &sdata->atom.maxhold, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(reneigh_flag), &sdata->buffer, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
}
|
||||
|
||||
/* Ensure the shared device scratch buffer can hold at least 10 ints (used
   for the re-neighbor flag), reallocating if needed, then upload the
   buffer pointer to both the buffer and reneigh_flag symbols. */
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int size = (unsigned)10 * sizeof(int);

  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_FixNHCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  // signal other modules that the pointer changed
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}
|
||||
|
||||
/* One-time setup for fix nh: upload the timestep factors (dtv for
   positions, dtf for velocities) and the mass/distance-check parameters,
   then sync all per-atom array pointers. */
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
{
  cudaMemcpyToSymbol(MY_AP(dtf), &dtf, sizeof(V_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(dtv), &dtv, sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(mass), &sdata->atom.mass.dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(rmass_flag), &sdata->atom.rmass_flag, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(dist_check), &sdata->atom.dist_check, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
  Cuda_FixNHCuda_UpdateNmax(sdata);
}
|
||||
|
||||
|
||||
/* Host driver: rescale group velocities by the barostat press factors.
   factor_h[0..2] are the diagonal factors; [3..5] hold the triclinic
   cross terms (used only when p_triclinic != 0). */
void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));

  // accumulate the constant-upload overhead into the test1 timing slot
  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
  F_CFLOAT3 factor2;  // only meaningful when p_triclinic is set

  if(p_triclinic) {
    factor2.x = factor_h[3], factor2.y = factor_h[4];
    factor2.z = factor_h[5];
  }

  // one thread per atom up to mynlocal
  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nh_v_press_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
|
||||
|
||||
/* Host driver: fused barostat velocity rescale plus an nve velocity
   half-step (no-bias variant); saves one kernel launch compared to
   calling the two operations separately. */
void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
  F_CFLOAT3 factor2;  // triclinic cross terms; only set when p_triclinic

  if(p_triclinic) {
    factor2.x = factor_h[3], factor2.y = factor_h[4];
    factor2.z = factor_h[5];
  }

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  // surfaces any error still pending from earlier asynchronous work
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press pre Kernel execution failed");
  FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
|
||||
|
||||
/* Host driver: scale group velocities by the thermostat factor_eta. */
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));

  // accumulate the constant-upload overhead into the test1 timing slot
  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  // one thread per atom up to mynlocal
  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nh_v_temp_Kernel <<< grid, threads>>> (groupbit, factor_eta);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed");
}
|
||||
/* Host driver: nve velocity half-step v += (dtf/m) * f for group atoms. */
void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));

  // accumulate the constant-upload overhead into the test1 timing slot
  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nve_v_Kernel <<< grid, threads>>> (groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed");
}
|
||||
|
||||
|
||||
/* Host driver for the position step x += dtv * v: zero the device
   re-neighbor flag (first int of the scratch buffer), run the kernel
   (which may set it via the distance check), then read the flag back and
   accumulate it into sdata->atom.reneigh_flag. */
void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));

  // accumulate the constant-upload overhead into the test1 timing slot
  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  cudaMemset(sdata->buffer, 0, sizeof(int));  // clear the re-neighbor flag
  FixNHCuda_nve_x_Kernel <<< grid, threads>>> (groupbit);
  cudaThreadSynchronize();
  int reneigh_flag;
  cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
  sdata->atom.reneigh_flag += reneigh_flag;
  CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed");
}
|
||||
|
||||
/* Host driver: fused nve velocity half-step followed by the barostat
   press rescale (no-bias variant) in a single kernel launch. */
void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
  if(sdata->atom.update_nmax)
    Cuda_FixNHCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixNHCuda_UpdateBuffer(sdata);

  F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
  F_CFLOAT3 factor2;  // triclinic cross terms; only set when p_triclinic

  if(p_triclinic) {
    factor2.x = factor_h[3], factor2.y = factor_h[4];
    factor2.z = factor_h[5];
  }

  int3 layout = getgrid(mynlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias: Kernel execution failed");
}
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
|
||||
extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
|
@ -1,205 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* Device helper: after a position update, raise the re-neighbor flag if
   any atom in the warp has moved farther (squared) than _triggerneighsq
   since the last neighbor build (distance to its _xhold snapshot).
   Threads beyond _nlocal or outside the group contribute d = 0, so they
   can never trigger a rebuild.  ALL threads of a warp must reach the
   __all vote, so callers invoke this outside any divergent branch. */
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
{
  if(_dist_check) {

    X_CFLOAT d = X_F(0.0);

    if(i < _nlocal) {
      // squared displacement since the last neighbor build;
      // _xhold components are strided by _maxhold
      X_CFLOAT tmp = xtmp - _xhold[i];
      d = tmp * tmp;
      tmp = ytmp - _xhold[i + _maxhold];
      d += tmp * tmp;
      tmp = ztmp - _xhold[i + 2 * _maxhold];
      d += tmp * tmp;

      // atoms outside the group are masked out of the vote
      d = ((_mask[i] & groupbit)) ? d : X_F(0.0);
    }

    // warp vote: if any lane exceeds the trigger, request re-neighboring
    // (NOTE(review): __all is the pre-CUDA-9 intrinsic; __all_sync on newer toolkits)
    if(not __all(d <= _triggerneighsq))
      _reneigh_flag[0] = 1;
  }
}
|
||||
|
||||
/* Barostat velocity rescale for group atoms: each component is multiplied
   by its factor once before and once after the triclinic cross-term
   coupling (NOTE(review): mirrors the CPU fix nh update order — verify
   against fix_nh.cpp). */
__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal && _mask[i] & groupbit) {
    V_CFLOAT* my_v = _v + i;  // velocity components strided by _nmax
    V_CFLOAT vx = my_v[0];
    V_CFLOAT vy = my_v[_nmax];
    V_CFLOAT vz = my_v[2 * _nmax];
    // first half of the rescale
    vx *= factor.x;
    vy *= factor.y;
    vz *= factor.z;

    if(p_triclinic) {
      // off-diagonal box coupling
      vx += vy * factor2.z + vz * factor2.y;
      vy += vz * factor2.x;
    }

    // second half of the rescale
    vx *= factor.x;
    vy *= factor.y;
    vz *= factor.z;
    my_v[0] = vx;
    my_v[_nmax] = vy;
    my_v[2 * _nmax] = vz;
  }

}
|
||||
|
||||
/* Thermostat rescale: multiply all three velocity components of every
   group atom by factor_eta.  One thread per atom; component c of atom i
   lives at _v[i + c * _nmax]. */
__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_CFLOAT factor_eta)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // guard clauses: out-of-range thread or atom not in the group
  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  _v[idx]             *= factor_eta;
  _v[idx + _nmax]     *= factor_eta;
  _v[idx + 2 * _nmax] *= factor_eta;
}
|
||||
|
||||
/* Fused kernel: barostat press rescale of the velocity (factor applied
   before and after the triclinic coupling, as in nh_v_press) followed by
   the nve velocity half-step v += (dtf/m) * f. */
__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal && _mask[i] & groupbit) {
    F_CFLOAT* my_f = _f + i;  // force components strided by _nmax
    V_CFLOAT* my_v = _v + i;  // velocity components strided by _nmax

    // dtf / m, using per-atom or per-type mass
    V_CFLOAT dtfm = _dtf;

    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
    else dtfm *= V_F(1.0) / _mass[_type[i]];

    V_CFLOAT vx = my_v[0];
    V_CFLOAT vy = my_v[_nmax];
    V_CFLOAT vz = my_v[2 * _nmax];
    // first half of the press rescale
    vx *= factor.x;
    vy *= factor.y;
    vz *= factor.z;

    if(p_triclinic) {
      // off-diagonal box coupling
      vx += vy * factor2.z + vz * factor2.y;
      vy += vz * factor2.x;
    }

    // second half of the press rescale
    vx *= factor.x;
    vy *= factor.y;
    vz *= factor.z;
    // nve velocity half-step
    my_v[0] = vx + dtfm * my_f[0];
    my_v[_nmax] = vy + dtfm * my_f[_nmax];
    my_v[2 * _nmax] = vz + dtfm * my_f[_nmax * 2];
  }

}
|
||||
|
||||
/* nve velocity half-step: v += (dtf/m) * f for every group atom, one
   component at a time (arrays strided by _nmax). */
__global__ void FixNHCuda_nve_v_Kernel(int groupbit)
{

  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal && _mask[i] & groupbit) {
    F_CFLOAT* my_f = _f + i;
    V_CFLOAT* my_v = _v + i;

    // dtf / m, using per-atom or per-type mass
    V_CFLOAT dtfm = _dtf;

    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
    else dtfm *= V_F(1.0) / _mass[_type[i]];

    // x, y, z components in turn
    *my_v = (*my_v + dtfm * (*my_f));
    my_f += _nmax;
    my_v += _nmax;
    *my_v = (*my_v + dtfm * (*my_f));
    my_f += _nmax;
    my_v += _nmax;
    *my_v = (*my_v + dtfm * (*my_f));
  }
}
|
||||
|
||||
/* Position step x += dtv * v for every group atom, followed by the
   warp-vote distance check that may request a neighbor-list rebuild.
   check_distance must be reached by ALL threads (it contains a warp
   vote), so it is called outside the group/bounds branch. */
__global__ void FixNHCuda_nve_x_Kernel(int groupbit)
{
  // Fix: initialize the temporaries.  Threads that fail the group/bounds
  // test below previously passed indeterminate values into
  // check_distance (the result is masked out there, but reading
  // uninitialized values is undefined behavior).
  X_CFLOAT xtmp = X_F(0.0), ytmp = X_F(0.0), ztmp = X_F(0.0);

  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal && _mask[i] & groupbit) {
    V_CFLOAT* my_v = _v + i;  // components strided by _nmax
    X_CFLOAT* my_x = _x + i;

    // advance and capture each updated coordinate for the distance check
    xtmp = *my_x += _dtv * *my_v;
    my_v += _nmax;
    my_x += _nmax;
    ytmp = *my_x += _dtv * *my_v;
    my_v += _nmax;
    my_x += _nmax;
    ztmp = *my_x += _dtv * *my_v;
  }

  check_distance(xtmp, ytmp, ztmp, i, groupbit);
}
|
||||
|
||||
|
||||
/* Fused kernel: nve velocity half-step v += (dtf/m) * f first, then the
   barostat press rescale (factor applied before and after the triclinic
   coupling) — the reverse ordering of the press_and_nve_v variant. */
__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{

  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal && _mask[i] & groupbit) {
    F_CFLOAT* my_f = _f + i;  // force components strided by _nmax
    V_CFLOAT* my_v = _v + i;  // velocity components strided by _nmax

    // dtf / m, using per-atom or per-type mass
    V_CFLOAT dtfm = _dtf;

    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
    else dtfm *= V_F(1.0) / _mass[_type[i]];

    // nve velocity half-step
    V_CFLOAT vx = my_v[0] + dtfm * my_f[0];
    V_CFLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax];
    V_CFLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax];

    // first half of the press rescale
    vx *= factor.x;
    vy *= factor.y;
    vz *= factor.z;

    if(p_triclinic) {
      // off-diagonal box coupling
      vx += vy * factor2.z + vz * factor2.y;
      vy += vz * factor2.x;
    }

    // second half of the press rescale
    vx *= factor.x;
    vy *= factor.y;
    vz *= factor.z;
    my_v[0] = vx;
    my_v[_nmax] = vy;
    my_v[2 * _nmax] = vz;

  }
}
|
||||
|
|
@ -1,134 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_nve_cuda
|
||||
#define IncludeCommonNeigh
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
#include "fix_nve_cuda_cu.h"
|
||||
#include "fix_nve_cuda_kernel.cu"
|
||||
|
||||
/* Upload current per-atom array pointers, sizes, and neighbor-rebuild
   bookkeeping to this module's device constant symbols.  Call after any
   reallocation that changes nmax. */
void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
{
  // scalar counts
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  // per-atom device arrays
  cudaMemcpyToSymbol(MY_AP(x), &sdata->atom.x.dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(v), &sdata->atom.v.dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type), &sdata->atom.type.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(rmass), &sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(mass), &sdata->atom.mass.dev_data, sizeof(V_CFLOAT*));
  // neighbor-rebuild trigger data (might be moved to a neighbor record in sdata)
  cudaMemcpyToSymbol(MY_AP(xhold), &sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(maxhold), &sdata->atom.maxhold, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(reneigh_flag), &sdata->buffer, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
}
|
||||
|
||||
/* Ensure the shared device scratch buffer can hold at least 10 ints (used
   for the re-neighbor flag), reallocating if needed, then upload the
   buffer pointer to both the buffer and reneigh_flag symbols. */
void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int size = (unsigned)10 * sizeof(int);

  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_FixNVECuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  // signal other modules that the pointer changed
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}
|
||||
|
||||
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
|
||||
Cuda_FixNVECuda_UpdateNmax(sdata);
|
||||
}
|
||||
|
||||
|
||||
void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixNVECuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
if(sdata->buffer_new)
|
||||
Cuda_FixNVECuda_UpdateBuffer(sdata);
|
||||
|
||||
int3 layout = getgrid(mynlocal);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
cudaMemset(sdata->buffer, 0, sizeof(int));
|
||||
FixNVECuda_InitialIntegrate_Kernel <<< grid, threads>>> (groupbit);
|
||||
cudaThreadSynchronize();
|
||||
int reneigh_flag;
|
||||
cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
sdata->atom.reneigh_flag += reneigh_flag;
|
||||
CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed");
|
||||
|
||||
}
|
||||
|
||||
void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixNVECuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
if(sdata->buffer_new)
|
||||
Cuda_FixNVECuda_UpdateBuffer(sdata);
|
||||
|
||||
#ifdef CUDA_USE_BINNING
|
||||
|
||||
dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
|
||||
dim3 threads(sdata->domain.bin_nmax, 1, 1);
|
||||
FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed");
|
||||
|
||||
#else
|
||||
|
||||
int3 layout = getgrid(mynlocal);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed");
|
||||
|
||||
#endif
|
||||
}
|
||||
|
|
@ -1,28 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
|
||||
extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
|
||||
extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
|
|
@ -1,166 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
|
||||
{
|
||||
if(_dist_check) {
|
||||
X_CFLOAT tmp = xtmp - _xhold[i];
|
||||
X_CFLOAT d = tmp * tmp;
|
||||
tmp = ytmp - _xhold[i + _maxhold];
|
||||
d += tmp * tmp;
|
||||
tmp = ztmp - _xhold[i + 2 * _maxhold];
|
||||
d += tmp * tmp;
|
||||
|
||||
d = ((i < _nlocal) && (_mask[i] & groupbit)) ? d : X_F(0.0);
|
||||
|
||||
if(not __all(d <= _triggerneighsq))
|
||||
_reneigh_flag[0] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
|
||||
{
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
#ifdef CUDA_USE_BINNING
|
||||
|
||||
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
|
||||
|
||||
if(threadIdx.x < _bin_count_local[bin]) {
|
||||
const int i = 3 * blockDim.x * bin + threadIdx.x;
|
||||
|
||||
if(_mask[i] & groupbit) {
|
||||
F_CFLOAT* my_f = _binned_f + i;
|
||||
V_CFLOAT* my_v = _binned_v + i;
|
||||
X_CFLOAT* my_x = _binned_x + i;
|
||||
|
||||
V_CFLOAT dtfm = _dtf
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
|
||||
|
||||
V_CFLOAT v_mem;
|
||||
v_mem = *my_v += dtfm * (*my_f);
|
||||
xtmp = *my_x += _dtv * v_mem;
|
||||
my_f += blockDim.x;
|
||||
my_v += blockDim.x;
|
||||
my_x += blockDim.x;
|
||||
v_mem = *my_v += dtfm * (*my_f);
|
||||
ytmp = *my_x += _dtv * v_mem;
|
||||
my_f += blockDim.x;
|
||||
my_v += blockDim.x;
|
||||
my_x += blockDim.x;
|
||||
v_mem = *my_v += dtfm * (*my_f);
|
||||
ztmp = *my_x += _dtv * v_mem;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
X_CFLOAT* my_x = _x + i;
|
||||
|
||||
V_CFLOAT dtfm = _dtf;
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_type[i]];
|
||||
|
||||
V_CFLOAT v_mem;
|
||||
v_mem = *my_v += dtfm * (*my_f);
|
||||
xtmp = *my_x += _dtv * v_mem;
|
||||
my_f += _nmax;
|
||||
my_v += _nmax;
|
||||
my_x += _nmax;
|
||||
v_mem = *my_v += dtfm * (*my_f);
|
||||
ytmp = *my_x += _dtv * v_mem;
|
||||
my_f += _nmax;
|
||||
my_v += _nmax;
|
||||
my_x += _nmax;
|
||||
v_mem = *my_v += dtfm * (*my_f);
|
||||
ztmp = *my_x += _dtv * v_mem;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
check_distance(xtmp, ytmp, ztmp, i, groupbit);
|
||||
}
|
||||
|
||||
__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
|
||||
{
|
||||
#ifdef CUDA_USE_BINNING
|
||||
|
||||
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
|
||||
|
||||
if(threadIdx.x < _bin_count_local[bin]) {
|
||||
const int i = 3 * blockDim.x * bin + threadIdx.x;
|
||||
|
||||
if(_mask[i] & groupbit) {
|
||||
F_CFLOAT* my_f = _binned_f + i;
|
||||
V_CFLOAT* my_v = _binned_v + i;
|
||||
|
||||
V_CFLOAT dtfm = _dtf
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
|
||||
|
||||
*my_v += dtfm * (*my_f);
|
||||
my_f += blockDim.x;
|
||||
my_v += blockDim.x;
|
||||
*my_v += dtfm * (*my_f);
|
||||
my_f += blockDim.x;
|
||||
my_v += blockDim.x;
|
||||
*my_v += dtfm * (*my_f);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
|
||||
V_CFLOAT dtfm = _dtf;
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_type[i]];
|
||||
|
||||
*my_v += dtfm * (*my_f);
|
||||
my_f += _nmax;
|
||||
my_v += _nmax;
|
||||
*my_v += dtfm * (*my_f);
|
||||
my_f += _nmax;
|
||||
my_v += _nmax;
|
||||
*my_v += dtfm * (*my_f);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1,96 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_set_force_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "fix_set_force_cuda_cu.h"
|
||||
#include "fix_set_force_cuda_kernel.cu"
|
||||
|
||||
void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
{
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
|
||||
sdata->buffer = CudaWrapper_AllocCudaData(size);
|
||||
sdata->buffersize = size;
|
||||
sdata->buffer_new++;
|
||||
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
|
||||
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
|
||||
}
|
||||
|
||||
void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_FixSetForceCuda_UpdateNmax(sdata);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixSetForceCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
if(sdata->buffer_new)
|
||||
Cuda_FixSetForceCuda_UpdateBuffer(sdata);
|
||||
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed");
|
||||
|
||||
int oldgrid = grid.x;
|
||||
grid.x = 3;
|
||||
threads.x = 512;
|
||||
Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed");
|
||||
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz);
|
|
@ -1,86 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ F_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, int flagx, int flagy, int flagz)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
sharedmem[threadIdx.x + blockDim.x] = 0;
|
||||
sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
|
||||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
sharedmem[threadIdx.x] = _f[i];
|
||||
sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
|
||||
sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
|
||||
|
||||
if(flagx) _f[i] = xvalue;
|
||||
|
||||
if(flagy) _f[i + 1 * _nmax] = yvalue;
|
||||
|
||||
if(flagz) _f[i + 2 * _nmax] = zvalue;
|
||||
}
|
||||
|
||||
|
||||
reduceBlock(sharedmem);
|
||||
reduceBlock(&sharedmem[blockDim.x]);
|
||||
reduceBlock(&sharedmem[2 * blockDim.x]);
|
||||
F_CFLOAT* buffer = (F_CFLOAT*)_buffer;
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
|
||||
{
|
||||
int i = 0;
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
F_CFLOAT myforig = 0.0;
|
||||
F_CFLOAT* buf = (F_CFLOAT*)_buffer;
|
||||
buf = &buf[blockIdx.x * n];
|
||||
|
||||
while(i < n) {
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
|
||||
if(i + threadIdx.x < n)
|
||||
sharedmem[threadIdx.x] = buf[i + threadIdx.x];
|
||||
|
||||
__syncthreads();
|
||||
reduceBlock(sharedmem);
|
||||
i += blockDim.x;
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
myforig += sharedmem[0];
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
foriginal[blockIdx.x] = myforig;
|
||||
}
|
||||
|
|
@ -1,297 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_shake_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
#include "fix_shake_cuda_cu.h"
|
||||
#include "cuda_pair_virial_kernel_nc.cu"
|
||||
|
||||
#define _shake_atom MY_AP(shake_atom)
|
||||
#define _shake_type MY_AP(shake_type)
|
||||
#define _shake_flag MY_AP(shake_flag)
|
||||
#define _xshake MY_AP(xshake)
|
||||
#define _dtfsq MY_AP(dtfsq)
|
||||
#define _bond_distance MY_AP(bond_distance)
|
||||
#define _angle_distance MY_AP(angle_distance)
|
||||
#define _max_iter MY_AP(max_iter)
|
||||
#define _tolerance MY_AP(tolerance)
|
||||
__device__ __constant__ int* _shake_atom;
|
||||
__device__ __constant__ int* _shake_type;
|
||||
__device__ __constant__ int* _shake_flag;
|
||||
__device__ __constant__ X_CFLOAT3* _xshake;
|
||||
__device__ __constant__ F_CFLOAT _dtfsq;
|
||||
__device__ __constant__ X_CFLOAT* _bond_distance;
|
||||
__device__ __constant__ X_CFLOAT* _angle_distance;
|
||||
__device__ __constant__ int _max_iter;
|
||||
__device__ __constant__ X_CFLOAT _tolerance;
|
||||
|
||||
#include "fix_shake_cuda_kernel.cu"
|
||||
|
||||
void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(debugdata), & sdata->debugdata , sizeof(int*));
|
||||
}
|
||||
|
||||
void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity , sizeof(int) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_CFLOAT) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(triclinic) , &sdata->domain.triclinic , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_CFLOAT) * 6);
|
||||
}
|
||||
|
||||
void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
|
||||
{
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_FixShakeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
|
||||
sdata->buffer = CudaWrapper_AllocCudaData(size);
|
||||
sdata->buffersize = size;
|
||||
sdata->buffer_new++;
|
||||
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
|
||||
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
|
||||
}
|
||||
|
||||
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
|
||||
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
|
||||
void* bond_distance, void* angle_distance, void* virial,
|
||||
int max_iter, X_CFLOAT tolerance)
|
||||
{
|
||||
Cuda_FixShakeCuda_UpdateNmax(sdata);
|
||||
Cuda_FixShakeCuda_UpdateDomain(sdata);
|
||||
cudaMemcpyToSymbol(MY_AP(shake_atom) , & shake_atom , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(shake_type) , & shake_type , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(shake_flag) , & shake_flag , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(xshake) , & xshake , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(bond_distance) , & bond_distance , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(angle_distance) , & angle_distance , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(virial) , & virial , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(max_iter) , &max_iter , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_CFLOAT));
|
||||
|
||||
if(sdata->atom.mass_host)
|
||||
cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag, sizeof(int*));
|
||||
|
||||
}
|
||||
|
||||
void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixShakeCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
if(sdata->buffer_new)
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, 10 * sizeof(double));
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
FixShakeCuda_UnconstrainedUpdate_Kernel <<< grid, threads>>> ();
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed");
|
||||
}
|
||||
|
||||
void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixShakeCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->domain.update)
|
||||
Cuda_FixShakeCuda_UpdateDomain(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_CFLOAT), 64);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->buffer_new)
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_CFLOAT));
|
||||
|
||||
BindXTypeTexture(sdata);
|
||||
|
||||
FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_CFLOAT)>>> (vflag, vflag_atom, list, nlist);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed");
|
||||
|
||||
if(vflag) {
|
||||
int n = grid.x * grid.y;
|
||||
grid.x = 6;
|
||||
grid.y = 1;
|
||||
threads.x = 256;
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixShakeCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
|
||||
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
dx = pbc[0] * sdata->domain.prd[0];
|
||||
dy = pbc[1] * sdata->domain.prd[1];
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
} else {
|
||||
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
|
||||
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
}
|
||||
}
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
cudaMemset(sdata->flag, 0, sizeof(int));
|
||||
FixShakeCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz);
|
||||
cudaThreadSynchronize();
|
||||
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
int aflag;
|
||||
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
|
||||
if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
|
||||
CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed");
|
||||
|
||||
}
|
||||
|
||||
return 3 * n;
|
||||
}
|
||||
|
||||
int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixShakeCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
|
||||
|
||||
static int count = -1;
|
||||
count++;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
dx = pbc[0] * sdata->domain.prd[0];
|
||||
dy = pbc[1] * sdata->domain.prd[1];
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
} else {
|
||||
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
|
||||
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
|
||||
dz = pbc[2] * sdata->domain.prd[2];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
FixShakeCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
|
||||
}
|
||||
|
||||
return 3 * n;
|
||||
}
|
||||
|
||||
void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixShakeCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
|
||||
|
||||
int3 layout = getgrid(n);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
|
||||
FixShakeCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed");
|
||||
|
||||
}
|
||||
}
|
|
@ -1,34 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
|
||||
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
|
||||
void* bond_distance, void* angle_distance, void* virial,
|
||||
int max_iter, X_CFLOAT tolerance);
|
||||
extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist);
|
||||
extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
|
||||
extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv);
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -1,66 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_temp_berendsen_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "fix_temp_berendsen_cuda_cu.h"
|
||||
#include "fix_temp_berendsen_cuda_kernel.cu"
|
||||
|
||||
|
||||
void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
|
||||
{
|
||||
V_CFLOAT factor = afactor;
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
Cuda_FixTempBerendsenCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixTempBerendsenCuda_PostForce: fix add_force post_force compute Kernel execution failed");
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor);
|
|
@ -1,37 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
|
||||
__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
_v[i] *= factor;
|
||||
_v[i + _nmax] *= factor;
|
||||
_v[i + 2 * _nmax] *= factor;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_temp_rescale_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "fix_temp_rescale_cuda_cu.h"
|
||||
#include "fix_temp_rescale_cuda_kernel.cu"
|
||||
|
||||
|
||||
void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
|
||||
{
|
||||
V_CFLOAT factor = afactor;
|
||||
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
|
||||
Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
|
||||
//if(sdata->atom.update_nlocal)
|
||||
//cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int) );
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
Cuda_FixTempRescaleCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleCuda_PostForce: fix add_force post_force compute Kernel execution failed");
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor);
|
|
@ -1,37 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
|
||||
__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
_v[i] *= factor;
|
||||
_v[i + _nmax] *= factor;
|
||||
_v[i + 2 * _nmax] *= factor;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_temp_rescale_limit_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "fix_temp_rescale_limit_cuda_cu.h"
|
||||
#include "fix_temp_rescale_limit_cuda_kernel.cu"
|
||||
|
||||
|
||||
void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit)
|
||||
{
|
||||
V_CFLOAT factor = afactor;
|
||||
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
|
||||
Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
|
||||
//if(sdata->atom.update_nlocal)
|
||||
//cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int) );
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor, limit);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleLimitCuda_PostForce: fix add_force post_force compute Kernel execution failed");
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit);
|
|
@ -1,44 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
|
||||
__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor, V_CFLOAT limit)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
V_CFLOAT vx = _v[i];
|
||||
V_CFLOAT vy = _v[i + _nmax];
|
||||
V_CFLOAT vz = _v[i + 2 * _nmax];
|
||||
vx *= factor;
|
||||
vy *= factor;
|
||||
vz *= factor;
|
||||
|
||||
_v[i] = vx > 0 ? min(vx, limit) : max(vx, -limit);
|
||||
_v[i + _nmax] = vy > 0 ? min(vy, limit) : max(vy, -limit);
|
||||
_v[i + 2 * _nmax] = vz > 0 ? min(vz, limit) : max(vz, -limit);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#define MY_PREFIX fix_viscous_cuda
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
|
||||
#include "fix_viscous_cuda_cu.h"
|
||||
#include "fix_viscous_cuda_kernel.cu"
|
||||
|
||||
void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
}
|
||||
|
||||
void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_FixViscousCuda_UpdateNmax(sdata);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixViscousCuda_UpdateNmax(sdata);
|
||||
|
||||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 0);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
Cuda_FixViscousCuda_PostForce_Kernel <<< grid, threads, 0>>> (groupbit, (F_CFLOAT*) gamma);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed");
|
||||
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma);
|
|
@ -1,35 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_CFLOAT* gamma)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
F_CFLOAT drag = gamma[_type[i]];
|
||||
_f[i] -= drag * _v[i];
|
||||
_f[i + 1 * _nmax] -= drag * _v[i + 1 * _nmax];
|
||||
_f[i + 2 * _nmax] -= drag * _v[i + 2 * _nmax];
|
||||
}
|
||||
}
|
|
@ -1,364 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#define MY_PREFIX neighbor
|
||||
#define IncludeCommonNeigh
|
||||
#include "cuda_shared.h"
|
||||
#include "cuda_common.h"
|
||||
#include "crm_cuda_utils.cu"
|
||||
#include "cuda_wrapper_cu.h"
|
||||
|
||||
#define _cutneighsq MY_AP(cutneighsq)
|
||||
#define _ex_type MY_AP(ex_type)
|
||||
#define _nex_type MY_AP(nex_type)
|
||||
#define _ex1_bit MY_AP(ex1_bit)
|
||||
#define _ex2_bit MY_AP(ex2_bit)
|
||||
#define _nex_group MY_AP(nex_group)
|
||||
#define _ex_mol_bit MY_AP(ex_mol_bit)
|
||||
#define _nex_mol MY_AP(nex_mol)
|
||||
__device__ __constant__ CUDA_CFLOAT* _cutneighsq;
|
||||
__device__ __constant__ int* _ex_type;
|
||||
__device__ __constant__ int _nex_type;
|
||||
__device__ __constant__ int* _ex1_bit;
|
||||
__device__ __constant__ int* _ex2_bit;
|
||||
__device__ __constant__ int _nex_group;
|
||||
__device__ __constant__ int* _ex_mol_bit;
|
||||
__device__ __constant__ int _nex_mol;
|
||||
|
||||
#include "neighbor_cu.h"
|
||||
#include "neighbor_kernel.cu"
|
||||
|
||||
void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
||||
{
|
||||
CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed");
|
||||
|
||||
int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_CFLOAT)));
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
||||
if(sdata->buffer != NULL) CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
|
||||
|
||||
sdata->buffer = CudaWrapper_AllocCudaData(size);
|
||||
sdata->buffersize = size;
|
||||
sdata->buffer_new++;
|
||||
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
|
||||
CUT_CHECK_ERROR("Cuda_PairLJCutCuda: updateBuffer failed");
|
||||
}
|
||||
|
||||
int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
||||
{
|
||||
if(sdata->buffer_new)
|
||||
Cuda_Neighbor_UpdateBuffer(sdata, sneighlist);
|
||||
|
||||
// initialize only on first call
|
||||
CUDA_CFLOAT rez_bin_size[3] = {
|
||||
(1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
|
||||
(1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
|
||||
(1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
|
||||
};
|
||||
|
||||
short init = 0;
|
||||
|
||||
if(! init) {
|
||||
init = 0;
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , sizeof(X_CFLOAT) * 3);
|
||||
}
|
||||
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
my_times starttime, endtime;
|
||||
my_gettime(CLOCK_REALTIME, &starttime);
|
||||
|
||||
cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_CFLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
|
||||
|
||||
Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &endtime);
|
||||
sdata->cuda_timings.neigh_bin +=
|
||||
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
|
||||
|
||||
|
||||
int binning_error;
|
||||
cudaMemcpy((void*) &binning_error, (void*) sdata->buffer, 1 * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
|
||||
if(binning_error) {
|
||||
sneighlist->bin_extraspace += 0.05;
|
||||
} else {
|
||||
MYDBG(printf("CUDA: binning successful\n");)
|
||||
}
|
||||
CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
|
||||
return binning_error;
|
||||
}
|
||||
|
||||
int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
||||
{
|
||||
//Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
|
||||
CUDA_CFLOAT globcutoff = -1.0;
|
||||
|
||||
short init = 0;
|
||||
|
||||
if(! init) {
|
||||
init = 1;
|
||||
|
||||
// !! LAMMPS indexes atom types starting with 1 !!
|
||||
|
||||
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
|
||||
|
||||
unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
|
||||
|
||||
CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
|
||||
//printf("Allocate: %i\n",nx);
|
||||
sneighlist->cu_cutneighsq = (CUDA_CFLOAT*) CudaWrapper_AllocCudaData(nx);
|
||||
|
||||
if(sneighlist->cutneighsq) {
|
||||
int cutoffsdiffer = 0;
|
||||
double cutoff0 = sneighlist->cutneighsq[1][1];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
|
||||
|
||||
if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++;
|
||||
}
|
||||
}
|
||||
|
||||
if(not cutoffsdiffer) globcutoff = (CUDA_CFLOAT) cutoff0;
|
||||
} else {
|
||||
MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int size = 100;
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
|
||||
sdata->buffer = CudaWrapper_AllocCudaData(size);
|
||||
sdata->buffersize = size;
|
||||
sdata->buffer_new++;
|
||||
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
|
||||
}
|
||||
|
||||
CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
|
||||
cudaMemcpyToSymbol(MY_AP(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_CFLOAT*));
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(special_flag) , sdata->atom.special_flag , 4 * sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(molecular) , & sdata->atom.molecular , sizeof(int));
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
|
||||
//cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) );
|
||||
cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(special) , & sdata->atom.special .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(maxspecial) , & sdata->atom.maxspecial , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nspecial) , & sdata->atom.nspecial .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors.dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(ex_type) , & sneighlist->ex_type.dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(ex1_bit) , & sneighlist->ex1_bit.dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(ex2_bit) , & sneighlist->ex2_bit.dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(ex_mol_bit) , & sneighlist->ex_mol_bit.dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nex_type) , & sneighlist->nex_type, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nex_group) , & sneighlist->nex_group, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nex_mol) , & sneighlist->nex_mol, sizeof(int));
|
||||
|
||||
if(sdata->overlap_comm) {
|
||||
cudaMemcpyToSymbol(MY_AP(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*));
|
||||
}
|
||||
|
||||
//dim3 threads(sneighlist->bin_nmax,1,1);
|
||||
dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1);
|
||||
dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1);
|
||||
|
||||
//printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_CFLOAT))*threads.x,sneighlist->bin_nmax);
|
||||
int buffer[20];
|
||||
buffer[0] = 1;
|
||||
buffer[1] = 0;
|
||||
CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int));
|
||||
CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
|
||||
//cudaMemset(sdata->debugdata,0,100*sizeof(int));
|
||||
unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_CFLOAT)) * threads.x;
|
||||
MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
|
||||
//shared_size=2056;
|
||||
my_times starttime, endtime;
|
||||
my_gettime(CLOCK_REALTIME, &starttime);
|
||||
//for(int i=0;i<100;i++)
|
||||
{
|
||||
if(sdata->overlap_comm)
|
||||
NeighborBuildFullBin_OverlapComm_Kernel <<< grid, threads, shared_size>>>
|
||||
(sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom);
|
||||
else {
|
||||
int exclude = sneighlist->nex_mol | sneighlist->nex_group | sneighlist->nex_type;
|
||||
|
||||
if(exclude)
|
||||
NeighborBuildFullBin_Kernel<1> <<< grid, threads, shared_size>>>
|
||||
(sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
|
||||
else
|
||||
NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>>
|
||||
(sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
|
||||
}
|
||||
//NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_CFLOAT))*threads.x+sizeof(int)>>>
|
||||
// (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);
|
||||
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
|
||||
my_gettime(CLOCK_REALTIME, &endtime);
|
||||
sdata->cuda_timings.neigh_build +=
|
||||
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
|
||||
//dim3 threads,grid;
|
||||
CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int));
|
||||
|
||||
if(buffer[0] >= 0 && true && sdata->atom.molecular) {
|
||||
//printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall);
|
||||
my_gettime(CLOCK_REALTIME, &starttime);
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 0, 512);
|
||||
threads.x = layout.z;
|
||||
threads.y = 1;
|
||||
threads.z = 1;
|
||||
grid.x = layout.x;
|
||||
grid.y = layout.y;
|
||||
grid.z = 1;
|
||||
FindSpecial <<< grid, threads>>>(sdata->pair.use_block_per_atom);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed");
|
||||
my_gettime(CLOCK_REALTIME, &endtime);
|
||||
sdata->cuda_timings.neigh_special +=
|
||||
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
|
||||
}
|
||||
}
|
||||
//printf("Neightime: %lf\n",sdata->cuda_timings.test1);
|
||||
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
|
||||
|
||||
//CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int));
|
||||
|
||||
MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");)
|
||||
return buffer[0];
|
||||
}
|
||||
|
||||
int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
||||
{
|
||||
MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");)
|
||||
// initialize only on first call
|
||||
/*static*/ short init = 0;
|
||||
|
||||
if(! init) {
|
||||
init = 1;
|
||||
|
||||
// !! LAMMPS indexes atom types starting with 1 !!
|
||||
|
||||
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
|
||||
|
||||
if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
|
||||
printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u "
|
||||
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
|
||||
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
|
||||
|
||||
unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
|
||||
CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
|
||||
|
||||
if(sneighlist->cutneighsq) {
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
|
||||
//printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int size = 100;
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
|
||||
sdata->buffer = CudaWrapper_AllocCudaData(size);
|
||||
sdata->buffersize = size;
|
||||
sdata->buffer_new++;
|
||||
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer , sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(cutneighsq) , acutneighsq , nx);
|
||||
cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
|
||||
|
||||
free(acutneighsq);
|
||||
}
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
int return_value = 1;
|
||||
CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int));
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed");
|
||||
NeighborBuildFullNsq_Kernel <<< grid, threads>>> ();
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
|
||||
|
||||
int buffer[20];
|
||||
CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int) * 20);
|
||||
MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");)
|
||||
return return_value = buffer[0];
|
||||
}
|
|
@ -1,32 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef NEIGHBOR_CU_H_
|
||||
#define NEIGHBOR_CU_H_
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
|
||||
extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
|
||||
extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
|
||||
|
||||
#endif /*NEIGHBOR_CU_H_*/
|
|
@ -1,660 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#define SBBITS 30
|
||||
|
||||
__global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
|
||||
CUDA_CFLOAT rez_bin_size_x, CUDA_CFLOAT rez_bin_size_y, CUDA_CFLOAT rez_bin_size_z)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
/*int* bin_count=(int*) _buffer;
|
||||
bin_count=bin_count+20;
|
||||
CUDA_CFLOAT* binned_x=(CUDA_CFLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
|
||||
CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
|
||||
binned_x = &binned_x[2];
|
||||
int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
|
||||
|
||||
if(i < _nall) {
|
||||
// copy atom position from global device memory to local register
|
||||
// in this 3 steps to get as much coalesced access as possible
|
||||
X_CFLOAT* my_x = _x + i;
|
||||
CUDA_CFLOAT x_i = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_CFLOAT y_i = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_CFLOAT z_i = *my_x;
|
||||
|
||||
|
||||
// calculate flat bin index
|
||||
int bx = __float2int_rd(rez_bin_size_x * (x_i - _sublo[0])) + 2;
|
||||
int by = __float2int_rd(rez_bin_size_y * (y_i - _sublo[1])) + 2;
|
||||
int bz = __float2int_rd(rez_bin_size_z * (z_i - _sublo[2])) + 2;
|
||||
|
||||
bx -= bx * negativCUDA(1.0f * bx);
|
||||
bx -= (bx - bin_dim_x + 1) * negativCUDA(1.0f * bin_dim_x - 1.0f - 1.0f * bx);
|
||||
by -= by * negativCUDA(1.0f * by);
|
||||
by -= (by - bin_dim_y + 1) * negativCUDA(1.0f * bin_dim_y - 1.0f - 1.0f * by);
|
||||
bz -= bz * negativCUDA(1.0f * bz);
|
||||
bz -= (bz - bin_dim_z + 1) * negativCUDA(1.0f * bin_dim_z - 1.0f - 1.0f * bz);
|
||||
|
||||
|
||||
const unsigned j = bin_dim_z * (bin_dim_y * bx + by) + bz;
|
||||
|
||||
// add new atom to bin, get bin-array position
|
||||
const unsigned k = atomicAdd(& bin_count[j], 1);
|
||||
|
||||
if(k < bin_nmax) {
|
||||
binned_id [bin_nmax * j + k] = i;
|
||||
binned_x [3 * bin_nmax * j + k] = x_i;
|
||||
binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i;
|
||||
binned_x [3 * bin_nmax * j + k + 2 * bin_nmax] = z_i;
|
||||
} else {
|
||||
// normally, this should not happen:
|
||||
int errorn = atomicAdd((int*) _buffer, 1);
|
||||
MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j);)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
|
||||
{
|
||||
int m;
|
||||
|
||||
if(_nex_type)
|
||||
if(_ex_type[itype * _cuda_ntypes + jtype]) return 1;
|
||||
|
||||
if(_nex_group) {
|
||||
for(m = 0; m < _nex_group; m++) {
|
||||
if(_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1;
|
||||
|
||||
if(_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if(_nex_mol) {
|
||||
if(_molecule[i] == _molecule[j])
|
||||
for(m = 0; m < _nex_mol; m++)
|
||||
if(_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m]) return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern __shared__ CUDA_CFLOAT shared[];
|
||||
|
||||
__device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
|
||||
{
|
||||
int k = n.z;
|
||||
|
||||
for(int l = 0; l < n.z; l++) k = ((list[l] == tag) ? l : k);
|
||||
|
||||
return k < n.x ? flag.x : (k < n.y ? flag.y : (k < n.z ? flag.z : 0));
|
||||
}
|
||||
|
||||
template <const unsigned int exclude>
|
||||
__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style, bool neighall)
|
||||
{
|
||||
int natoms = neighall ? _nall : _nlocal;
|
||||
//const bool domol=false;
|
||||
int bin_dim_z = gridDim.y;
|
||||
CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
|
||||
binned_x = &binned_x[2];
|
||||
int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
|
||||
int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
|
||||
int bin_x = blockIdx.x / bin_dim_y;
|
||||
int bin_y = blockIdx.x - bin_x * bin_dim_y;
|
||||
int bin_z = blockIdx.y;
|
||||
int bin_c = bin_count[bin];
|
||||
|
||||
|
||||
CUDA_CFLOAT cut;
|
||||
|
||||
if(globcutoff > 0)
|
||||
cut = globcutoff;
|
||||
|
||||
int i = _nall;
|
||||
CUDA_CFLOAT* my_x;
|
||||
CUDA_CFLOAT x_i, y_i, z_i;
|
||||
|
||||
for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
|
||||
|
||||
int actIdx = threadIdx.x + actOffset;
|
||||
CUDA_CFLOAT* other_x = shared;
|
||||
int* other_id = (int*) &other_x[3 * blockDim.x];
|
||||
|
||||
if(actIdx < bin_c) {
|
||||
i = binned_id[__mul24(bin, bin_nmax) + actIdx];
|
||||
my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
|
||||
x_i = *my_x;
|
||||
my_x += bin_nmax;
|
||||
y_i = *my_x;
|
||||
my_x += bin_nmax;
|
||||
z_i = *my_x;
|
||||
} else
|
||||
i = 2 * _nall;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int jnum = 0;
|
||||
int itype;
|
||||
|
||||
if(i < natoms) {
|
||||
jnum = 0;
|
||||
_ilist[i] = i;
|
||||
itype = _type[i];
|
||||
}
|
||||
|
||||
//__syncthreads();
|
||||
|
||||
|
||||
for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
|
||||
int otherActIdx = threadIdx.x + otherActOffset;
|
||||
|
||||
if(otherActIdx < bin_c) {
|
||||
if(otherActOffset == actOffset) {
|
||||
other_id[threadIdx.x] = i;
|
||||
other_x[threadIdx.x] = x_i;
|
||||
other_x[threadIdx.x + blockDim.x] = y_i;
|
||||
other_x[threadIdx.x + 2 * blockDim.x] = z_i;
|
||||
} else {
|
||||
other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
|
||||
my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
|
||||
other_x[threadIdx.x] = *my_x;
|
||||
my_x += bin_nmax;
|
||||
other_x[threadIdx.x + blockDim.x] = *my_x;
|
||||
my_x += bin_nmax;
|
||||
other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
int kk = threadIdx.x;
|
||||
|
||||
for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
|
||||
if(i < natoms) {
|
||||
kk++;
|
||||
kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
|
||||
int j = other_id[kk];
|
||||
|
||||
if(exclude && exclusion(i, j, itype, _type[j])) continue;
|
||||
|
||||
if(globcutoff < 0) {
|
||||
int jtype = _type[j];
|
||||
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
|
||||
}
|
||||
|
||||
CUDA_CFLOAT delx = x_i - other_x[kk];
|
||||
CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
|
||||
CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
|
||||
if(rsq <= cut && i != j) {
|
||||
if(jnum < _maxneighbors) {
|
||||
if(block_style)
|
||||
_neighbors[i * _maxneighbors + jnum] = j;
|
||||
else
|
||||
_neighbors[i + jnum * natoms] = j;
|
||||
}
|
||||
|
||||
++jnum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
}
|
||||
|
||||
for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
|
||||
for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
|
||||
for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
|
||||
if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;
|
||||
|
||||
if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;
|
||||
|
||||
int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;
|
||||
|
||||
if(other_bin == bin) continue;
|
||||
|
||||
int obin_c = bin_count[other_bin];
|
||||
|
||||
for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
|
||||
int otherActIdx = otherActOffset + threadIdx.x;
|
||||
|
||||
if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
|
||||
other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
|
||||
my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
|
||||
other_x[threadIdx.x] = *my_x;
|
||||
my_x += bin_nmax;
|
||||
other_x[threadIdx.x + blockDim.x] = *my_x;
|
||||
my_x += bin_nmax;
|
||||
other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
|
||||
if(i < natoms) {
|
||||
int j = other_id[k];
|
||||
|
||||
if(exclude && exclusion(i, j, itype, _type[j])) continue;
|
||||
|
||||
if(globcutoff < 0) {
|
||||
int jtype = _type[j];
|
||||
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
|
||||
}
|
||||
|
||||
CUDA_CFLOAT delx = x_i - other_x[k];
|
||||
CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
|
||||
CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq <= cut && i != j) {
|
||||
if(jnum < _maxneighbors) {
|
||||
if(block_style)
|
||||
_neighbors[i * _maxneighbors + jnum] = j;
|
||||
else
|
||||
_neighbors[i + jnum * natoms] = j;
|
||||
}
|
||||
|
||||
++jnum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;
|
||||
|
||||
if(i < natoms)
|
||||
_numneigh[i] = jnum;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void FindSpecial(int block_style)
|
||||
{
|
||||
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int which;
|
||||
int tag_mask = 0;
|
||||
int3 spec_flag;
|
||||
|
||||
int3 mynspecial = {0, 0, 1};
|
||||
|
||||
if(ii >= _nlocal) return;
|
||||
|
||||
int special_id[CUDA_MAX_NSPECIAL];
|
||||
|
||||
int i = _ilist[ii];
|
||||
|
||||
if(i >= _nlocal) return;
|
||||
|
||||
int jnum = _numneigh[i];
|
||||
|
||||
if(_special_flag[1] == 0) spec_flag.x = -1;
|
||||
else if(_special_flag[1] == 1) spec_flag.x = 0;
|
||||
else spec_flag.x = 1;
|
||||
|
||||
if(_special_flag[2] == 0) spec_flag.y = -1;
|
||||
else if(_special_flag[2] == 1) spec_flag.y = 0;
|
||||
else spec_flag.y = 2;
|
||||
|
||||
if(_special_flag[3] == 0) spec_flag.z = -1;
|
||||
else if(_special_flag[3] == 1) spec_flag.z = 0;
|
||||
else spec_flag.z = 3;
|
||||
|
||||
mynspecial.x = _nspecial[i];
|
||||
mynspecial.y = _nspecial[i + _nmax];
|
||||
mynspecial.z = _nspecial[i + 2 * _nmax];
|
||||
|
||||
if(i < _nlocal) {
|
||||
int* list = &_special[i];
|
||||
|
||||
for(int k = 0; k < mynspecial.z; k++) {
|
||||
special_id[k] = list[k * _nmax];
|
||||
tag_mask = tag_mask | special_id[k];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for(int k = 0; k < MIN(jnum, _maxneighbors); k++) {
|
||||
int j;
|
||||
|
||||
if(block_style)
|
||||
j = _neighbors[i * _maxneighbors + k];
|
||||
else
|
||||
j = _neighbors[i + k * _nlocal];
|
||||
|
||||
int tag_j = _tag[j];
|
||||
which = 0;
|
||||
|
||||
if((tag_mask & tag_j) == tag_j) {
|
||||
which = find_special(mynspecial, special_id, tag_j, spec_flag);
|
||||
|
||||
if(which > 0) {
|
||||
if(block_style)
|
||||
_neighbors[i * _maxneighbors + k] = j ^ (which << SBBITS);
|
||||
else
|
||||
_neighbors[i + k * _nlocal] = j ^ (which << SBBITS);
|
||||
} else if(which < 0) {
|
||||
if(block_style)
|
||||
_neighbors[i * _maxneighbors + k] = _neighbors[i * _maxneighbors + jnum - 1];
|
||||
else
|
||||
_neighbors[i + k * _nlocal] = _neighbors[i + (jnum - 1) * _nlocal];
|
||||
|
||||
jnum--;
|
||||
k--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_numneigh[i] = jnum;
|
||||
}
|
||||
|
||||
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style)
|
||||
{
|
||||
int bin_dim_z = gridDim.y;
|
||||
CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
|
||||
binned_x = &binned_x[2];
|
||||
int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
|
||||
int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
|
||||
int bin_x = blockIdx.x / bin_dim_y;
|
||||
int bin_y = blockIdx.x - bin_x * bin_dim_y;
|
||||
int bin_z = blockIdx.y;
|
||||
int bin_c = bin_count[bin];
|
||||
|
||||
|
||||
CUDA_CFLOAT cut;
|
||||
|
||||
if(globcutoff > 0)
|
||||
cut = globcutoff;
|
||||
|
||||
int i = _nall;
|
||||
CUDA_CFLOAT* my_x;
|
||||
CUDA_CFLOAT x_i, y_i, z_i;
|
||||
|
||||
for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
|
||||
|
||||
int actIdx = threadIdx.x + actOffset;
|
||||
CUDA_CFLOAT* other_x = shared;
|
||||
int* other_id = (int*) &other_x[3 * blockDim.x];
|
||||
|
||||
if(actIdx < bin_c) {
|
||||
i = binned_id[__mul24(bin, bin_nmax) + actIdx];
|
||||
my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
|
||||
x_i = *my_x;
|
||||
my_x += bin_nmax;
|
||||
y_i = *my_x;
|
||||
my_x += bin_nmax;
|
||||
z_i = *my_x;
|
||||
} else
|
||||
i = 2 * _nall;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int jnum = 0;
|
||||
int jnum_border = 0;
|
||||
int jnum_inner = 0;
|
||||
int i_border = -1;
|
||||
int itype;
|
||||
|
||||
if(i < _nlocal) {
|
||||
jnum = 0;
|
||||
_ilist[i] = i;
|
||||
itype = _type[i];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
|
||||
int otherActIdx = threadIdx.x + otherActOffset;
|
||||
|
||||
if(otherActIdx < bin_c) {
|
||||
if(otherActOffset == actOffset) {
|
||||
other_id[threadIdx.x] = i;
|
||||
other_x[threadIdx.x] = x_i;
|
||||
other_x[threadIdx.x + blockDim.x] = y_i;
|
||||
other_x[threadIdx.x + 2 * blockDim.x] = z_i;
|
||||
} else {
|
||||
other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
|
||||
my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
|
||||
other_x[threadIdx.x] = *my_x;
|
||||
my_x += bin_nmax;
|
||||
other_x[threadIdx.x + blockDim.x] = *my_x;
|
||||
my_x += bin_nmax;
|
||||
other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
int kk = threadIdx.x;
|
||||
|
||||
for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
|
||||
if(i < _nlocal) {
|
||||
kk++;
|
||||
kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
|
||||
int j = other_id[kk];
|
||||
|
||||
if(globcutoff < 0) {
|
||||
int jtype = _type[j];
|
||||
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
|
||||
}
|
||||
|
||||
CUDA_CFLOAT delx = x_i - other_x[kk];
|
||||
CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
|
||||
CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
|
||||
if(rsq <= cut && i != j) {
|
||||
if((j >= _nlocal) && (i_border < 0))
|
||||
i_border = atomicAdd(_inum_border, 1);
|
||||
|
||||
if(jnum < _maxneighbors) {
|
||||
if(block_style) {
|
||||
_neighbors[i * _maxneighbors + jnum] = j;
|
||||
|
||||
if(j >= _nlocal) {
|
||||
_neighbors_border[i_border * _maxneighbors + jnum_border] = j;
|
||||
} else {
|
||||
_neighbors_inner[i * _maxneighbors + jnum_inner] = j;
|
||||
}
|
||||
} else {
|
||||
_neighbors[i + jnum * _nlocal] = j;
|
||||
|
||||
if(j >= _nlocal) {
|
||||
_neighbors_border[i_border + jnum_border * _nlocal] = j;
|
||||
} else {
|
||||
_neighbors_inner[i + jnum_inner * _nlocal] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
++jnum;
|
||||
|
||||
if(j >= _nlocal)
|
||||
jnum_border++;
|
||||
else
|
||||
jnum_inner++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
|
||||
for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
|
||||
for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
|
||||
if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;
|
||||
|
||||
if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;
|
||||
|
||||
int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;
|
||||
|
||||
if(other_bin == bin) continue;
|
||||
|
||||
int obin_c = bin_count[other_bin];
|
||||
|
||||
for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
|
||||
int otherActIdx = otherActOffset + threadIdx.x;
|
||||
|
||||
if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
|
||||
other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
|
||||
my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
|
||||
other_x[threadIdx.x] = *my_x;
|
||||
my_x += bin_nmax;
|
||||
other_x[threadIdx.x + blockDim.x] = *my_x;
|
||||
my_x += bin_nmax;
|
||||
other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
|
||||
if(i < _nlocal) {
|
||||
int j = other_id[k];
|
||||
|
||||
if(globcutoff < 0) {
|
||||
int jtype = _type[j];
|
||||
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
|
||||
}
|
||||
|
||||
CUDA_CFLOAT delx = x_i - other_x[k];
|
||||
CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
|
||||
CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq <= cut && i != j) {
|
||||
if((j >= _nlocal) && (i_border < 0))
|
||||
i_border = atomicAdd(_inum_border, 1);
|
||||
|
||||
if(jnum < _maxneighbors) {
|
||||
if(block_style) {
|
||||
_neighbors[i * _maxneighbors + jnum] = j;
|
||||
|
||||
if(j >= _nlocal) {
|
||||
_neighbors_border[i_border * _maxneighbors + jnum_border] = j;
|
||||
} else {
|
||||
_neighbors_inner[i * _maxneighbors + jnum_inner] = j;
|
||||
}
|
||||
} else {
|
||||
_neighbors[i + jnum * _nlocal] = j;
|
||||
|
||||
if(j >= _nlocal) {
|
||||
_neighbors_border[i_border + jnum_border * _nlocal] = j;
|
||||
} else {
|
||||
_neighbors_inner[i + jnum_inner * _nlocal] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
++jnum;
|
||||
|
||||
if(j >= _nlocal)
|
||||
jnum_border++;
|
||||
else
|
||||
jnum_inner++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;
|
||||
|
||||
if(i < _nlocal) {
|
||||
_numneigh[i] = jnum;
|
||||
_numneigh_inner[i] = jnum_inner;
|
||||
|
||||
if(i_border >= 0) _numneigh_border[i_border] = jnum_border;
|
||||
|
||||
if(i_border >= 0) _ilist_border[i_border] = i;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void NeighborBuildFullNsq_Kernel()
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* buffer = (int*) _buffer;
|
||||
|
||||
if(i < _nlocal) {
|
||||
X_CFLOAT* my_x = _x + i;
|
||||
CUDA_CFLOAT x_i = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_CFLOAT y_i = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_CFLOAT z_i = *my_x;
|
||||
int jnum = 0;
|
||||
int* jlist = _firstneigh[i];
|
||||
_ilist[i] = i;
|
||||
|
||||
int itype = _type[i];
|
||||
__syncthreads();
|
||||
|
||||
for(int j = 0; j < _nall; ++j) {
|
||||
my_x = _x + j;
|
||||
CUDA_CFLOAT x_j = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_CFLOAT y_j = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_CFLOAT z_j = *my_x;
|
||||
CUDA_CFLOAT delx = x_i - x_j;
|
||||
CUDA_CFLOAT dely = y_i - y_j;
|
||||
CUDA_CFLOAT delz = z_i - z_j;
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
int jtype = _type[j];
|
||||
|
||||
if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {
|
||||
if(jnum < _maxneighbors)
|
||||
jlist[jnum] = j;
|
||||
|
||||
if(i == 151)((int*)_buffer)[jnum + 2] = j;
|
||||
|
||||
++jnum;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(jnum > _maxneighbors) buffer[0] = 0;
|
||||
|
||||
_numneigh[i] = jnum;
|
||||
|
||||
if(i == 151)((int*)_buffer)[1] = jnum;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#define _rhoinv MY_AP(coeff1)
|
||||
#define _sigma MY_AP(coeff2)
|
||||
#define _a MY_AP(coeff3)
|
||||
#define _c MY_AP(coeff4)
|
||||
#define _d MY_AP(coeff5)
|
||||
|
||||
#include "pair_born_coul_long_cuda_cu.h"
|
||||
#include "pair_born_coul_long_cuda_kernel_nc.cu"
|
||||
|
||||
#include <time.h>
|
||||
|
||||
void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_Pair_Init_AllStyles(sdata, 5, true);
|
||||
}
|
||||
|
||||
void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
|
||||
{
|
||||
|
||||
|
||||
static short init = 0;
|
||||
|
||||
if(! init) {
|
||||
init = 1;
|
||||
Cuda_PairBornCoulLongCuda_Init(sdata);
|
||||
}
|
||||
|
||||
dim3 grid, threads;
|
||||
int sharedperproc;
|
||||
|
||||
|
||||
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
|
||||
|
||||
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
|
||||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_BORN, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_BORN, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#undef _rhoinv
|
||||
#undef _sigma
|
||||
#undef _a
|
||||
#undef _c
|
||||
#undef _d
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
#ifdef CUDA_USE_BINNING
|
||||
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
|
||||
#else
|
||||
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
|
||||
#endif
|
|
@ -1,36 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
__device__ inline F_CFLOAT PairBornCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
|
||||
{
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r = _RSQRT_(r2inv);
|
||||
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_CFLOAT rexp = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]);
|
||||
const F_CFLOAT forceborn = _a[ij_type] * _rhoinv[ij_type] * r * rexp -
|
||||
F_F(6.0) * _c[ij_type] * r6inv + F_F(8.0) * _d[ij_type] * r2inv * r6inv;
|
||||
|
||||
if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv
|
||||
+ _d[ij_type] * r2inv * r6inv - _offset[ij_type]);
|
||||
|
||||
return factor_lj * forceborn * r2inv;
|
||||
}
|
|
@ -1,75 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#define _rhoinv MY_AP(coeff1)
|
||||
#define _buck1 MY_AP(coeff2)
|
||||
#define _buck2 MY_AP(coeff3)
|
||||
#define _a MY_AP(coeff4)
|
||||
#define _c MY_AP(coeff5)
|
||||
|
||||
#include "pair_buck_coul_cut_cuda_cu.h"
|
||||
|
||||
#include <time.h>
|
||||
void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_Pair_Init_AllStyles(sdata, 5, true);
|
||||
}
|
||||
|
||||
void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
|
||||
{
|
||||
|
||||
|
||||
static short init = 0;
|
||||
|
||||
if(! init) {
|
||||
init = 1;
|
||||
Cuda_PairBuckCoulCutCuda_Init(sdata);
|
||||
}
|
||||
|
||||
dim3 grid, threads;
|
||||
int sharedperproc;
|
||||
|
||||
|
||||
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
|
||||
|
||||
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
|
||||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
||||
|
||||
#undef _rhoinv
|
||||
#undef _buck1
|
||||
#undef _buck2
|
||||
#undef _a
|
||||
#undef _c
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
#ifdef CUDA_USE_BINNING
|
||||
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag);
|
||||
#else
|
||||
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
|
||||
#endif
|
|
@ -1,78 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#define _rhoinv MY_AP(coeff1)
|
||||
#define _buck1 MY_AP(coeff2)
|
||||
#define _buck2 MY_AP(coeff3)
|
||||
#define _a MY_AP(coeff4)
|
||||
#define _c MY_AP(coeff5)
|
||||
|
||||
#include "pair_buck_coul_long_cuda_cu.h"
|
||||
|
||||
#include <time.h>
|
||||
|
||||
void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_Pair_Init_AllStyles(sdata, 5, true);
|
||||
}
|
||||
|
||||
void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
|
||||
{
|
||||
|
||||
|
||||
static short init = 0;
|
||||
|
||||
if(! init) {
|
||||
init = 1;
|
||||
Cuda_PairBuckCoulLongCuda_Init(sdata);
|
||||
}
|
||||
|
||||
dim3 grid, threads;
|
||||
int sharedperproc;
|
||||
|
||||
|
||||
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
|
||||
|
||||
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
|
||||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#undef _rhoinv
|
||||
#undef _buck1
|
||||
#undef _buck2
|
||||
#undef _a
|
||||
#undef _c
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
#ifdef CUDA_USE_BINNING
|
||||
extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
|
||||
#else
|
||||
extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
|
||||
#endif
|
|
@ -1,77 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#define _rhoinv MY_AP(coeff1)
|
||||
#define _buck1 MY_AP(coeff2)
|
||||
#define _buck2 MY_AP(coeff3)
|
||||
#define _a MY_AP(coeff4)
|
||||
#define _c MY_AP(coeff5)
|
||||
|
||||
#include "pair_buck_cuda_cu.h"
|
||||
#include "pair_buck_cuda_kernel_nc.cu"
|
||||
|
||||
#include <time.h>
|
||||
|
||||
void Cuda_PairBuckCuda_Init(cuda_shared_data* sdata)
|
||||
{
|
||||
Cuda_Pair_Init_AllStyles(sdata, 5);
|
||||
}
|
||||
|
||||
void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
|
||||
{
|
||||
|
||||
|
||||
static short init = 0;
|
||||
|
||||
if(! init) {
|
||||
init = 1;
|
||||
Cuda_PairBuckCuda_Init(sdata);
|
||||
}
|
||||
|
||||
dim3 grid, threads;
|
||||
int sharedperproc;
|
||||
|
||||
|
||||
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
|
||||
|
||||
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
|
||||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
||||
|
||||
#undef _rhoinv
|
||||
#undef _buck1
|
||||
#undef _buck2
|
||||
#undef _a
|
||||
#undef _c
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
|
||||
Original Version:
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
|
||||
USER-CUDA Package and associated modifications:
|
||||
https://sourceforge.net/projects/lammpscuda/
|
||||
|
||||
Christian Trott, christian.trott@tu-ilmenau.de
|
||||
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||||
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||||
|
||||
See the README file in the USER-CUDA directory.
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
#ifdef CUDA_USE_BINNING
|
||||
extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, int eflag, int vflag);
|
||||
#else
|
||||
extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
|
||||
#endif
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue