git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8921 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2012-10-08 15:29:39 +00:00
parent abc9afbec6
commit 856a237400
186 changed files with 0 additions and 22340 deletions

View File

@ -1,4 +0,0 @@
#Makefile for liblammpscuda.a
#No need to modify anything here! The CUDA path is inserted into Makefile.common
include Makefile.cudalib

View File

@ -1,124 +0,0 @@
#Common command-line argument interpreter for compilation with lammpscuda (USER-CUDA) installed
# make options:
# emu=1 switch to CUDA emulation mode (otherwise: use the GPU)
# dbg=1 print a lot of debugging output during runtime
# verbose=1 output the nvcc command line during compilation
# keep=1 do not delete temporary compilation files (.ii, .cubin, ...)
# cufft=1 use CUDA's fast Fourier transform library "cufft" where possible (otherwise: use CPU FFTW)
# binning=1 create a virtual particle grid (otherwise: neighbor lists); currently not supported
# precision=1 single precision (global setting)
# precision=2 double precision (global setting)
# precision=3 double precision for positions only (see below)
# precision=4 double precision for positions and velocities (see below)
SHELL = /bin/sh
# System-specific settings
#CUDA_INSTALL_PATH = /usr/local/cuda
CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
# e.g. in Gentoo
# CUDA_INSTALL_PATH = /opt/cuda
#//////////////////////////////////////////////////////////////////////////////////////////////
# no need to change anything below this line
#//////////////////////////////////////////////////////////////////////////////////////////////
#use CPU FFT if cufft=0 is requested.
FALLBACK_FFT = 1
#default settings for compiler switches
ifdef COMPILELIB
include Makefile.defaults
else
include ../../lib/cuda/Makefile.defaults
endif
#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer}
CUDA_FLAGS := -I${CUDA_INSTALL_PATH}/include -DUNIX
CUDA_USRLIB_CONDITIONAL := -L${CUDA_INSTALL_PATH}/lib -L${CUDA_INSTALL_PATH}/lib64
# debug setting
ifeq ($(strip $(dbg)), 1)
CUDA_FLAGS += -D_DEBUG -g
NVCC_FLAGS += -g -G
else
NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O2
endif
# precise timing must be disabled manually on Mac and Windows (prec_timer=0)
ifeq ($(strip $(prec_timer)), 0)
CUDA_FLAGS += -DNO_PREC_TIMING
endif
# set fft routine
ifeq ($(strip $(cufft)), 0)
ifneq ($(FALLBACK_FFT), 1)
FFT_INC = -DFFT_NONE
FFT_PATH =
FFT_LIB =
CUDA_FLAGS += -DFFT_NONE
endif
else
CUDA_FLAGS += -DFFT_CUFFT
CUDA_USRLIB_CONDITIONAL += -lcufft
endif
# make global precision setting
ifeq ($(strip $(precision)), 1)
CUDA_FLAGS += -DCUDA_PRECISION=1
else
ifeq ($(strip $(precision)), 3)
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2
else
ifeq ($(strip $(precision)), 4)
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2
else
CUDA_FLAGS += -DCUDA_PRECISION=2
endif
endif
endif
# make architecture settings
ifeq ($(strip $(arch)), 13)
CUDA_FLAGS += -DCUDA_ARCH=13
SMVERSIONFLAGS := -arch sm_13
else
ifeq ($(strip $(arch)), 20)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_20
else
ifeq ($(strip $(arch)), 21)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_21
else
ifeq ($(strip $(arch)), 30)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_30
else
ifeq ($(strip $(arch)), 35)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_35
else
CUDA_FLAGS += -DCUDA_ARCH=99
SMVERSIONFLAGS := -arch sm_13
endif
endif
endif
endif
endif
CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \
-I$(CUDA_INSTALL_PATH)/include
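For reference, the options documented at the top of Makefile.common above are plain command-line overrides to make; a couple of hypothetical invocations might look like:

  make precision=2 arch=20 cufft=1    # double precision, sm_20, FFTs via CUFFT
  make precision=1 arch=35 dbg=1      # single precision, sm_35, with debug output

Note that an unrecognized arch value falls through to the default branch above (CUDA_ARCH=99 with sm_13), so a typo in arch surfaces at run time rather than as a make error.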

View File

@ -1,87 +0,0 @@
#Makefile for liblammpscuda.a
#No need to modify anything here! The CUDA path is inserted into Makefile.common
.DEFAULT: lib
COMPILELIB := 1
SHELL = /bin/sh
CUDA_SRC_DIR = ../cuda
CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake
CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) )
include $(CUDA_TEMP)
CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu)
CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o)
CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO))
CUDA_DEP = $(CUDA_OBJ:.o=.d)
NVCC_FLAGS :=
VPATH = $(CUDA_SRC_DIR)
#rewriting default settings if new ones are specified
ifdef precision
tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults)
endif
ifdef arch
tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults)
endif
ifdef cufft
tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults)
endif
ifdef dbg
tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults)
endif
ifdef prec_timer
tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults)
endif
include Makefile.common
tmp := $(shell sed -i '2 d' Makefile.lammps)
tmp := $(shell sed -i '2 d' Makefile.lammps)
tmp := $(shell sed -i '1a CUDA_FLAGS := ${CUDA_FLAGS}' Makefile.lammps)
tmp := $(shell sed -i '2a CUDA_USRLIB_CONDITIONAL := ${CUDA_USRLIB_CONDITIONAL}' Makefile.lammps)
# verbose nvcc output during compilation
ifeq ($(verbose), 1)
VERBOSE :=
NVCC_FLAGS += --ptxas-options=-v
else
VERBOSE := @
endif
# keep temporary compilation files of nvcc
ifeq ($(keep), 1)
NVCC_FLAGS += -keep -Xptxas="--verbose"
endif
NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
CUDA_INCLUDES = -I./ -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA
CUDA_USRLIB =
# Link target
lib: $(CUDA_OBJ)
$(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a
clean:
rm $(CUDA_SRC_DIR)/*.o
rm liblammpscuda.a
# CUDA compilation rules
%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h
$(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) $(SMVERSIONFLAGS) -o $@ -c $<
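One consequence of the sed calls above: options passed on the command line are written back into Makefile.defaults, so they persist for subsequent builds. For example, a hypothetical "make precision=2" effectively runs

  sed -i 's|precision ?= [0-9]|precision ?= 2|g' Makefile.defaults

which turns the line "precision ?= 1" into "precision ?= 2"; the next plain "make" then reuses double precision without being told.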

View File

@ -1,19 +0,0 @@
#precision setting: 1 single, 2 double, 3 double positions, 4 double positions and velocities
precision ?= 1
#verbose setting: 0 no, 1 yes
verbose ?= 1
#GPU architecture (compute capability): 13, 20, 21, 30, 35
arch ?= 20
#use cufft (should not be changed)
cufft ?= 1
#debug mode: 0 off, 1 on
dbg ?= 0
#on Mac machines set this to 0 to avoid using the Linux-specific precision timer
prec_timer ?= 1

View File

@ -1,8 +0,0 @@
# Settings that the LAMMPS build will import when this package library is used
CUDA_FLAGS := -I/usr/local/cuda/include -DUNIX -DFFT_CUFFT -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2 -DCUDA_ARCH=20
CUDA_USRLIB_CONDITIONAL := -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -lcufft
user-cuda_SYSINC = ${CUDA_FLAGS}
user-cuda_SYSLIB = -lcuda -lcudart -lrt
user-cuda_SYSPATH = $(CUDA_USRLIB_CONDITIONAL)

View File

@ -1,26 +0,0 @@
This directory has source files to build a library that LAMMPS
links against when using the USER-CUDA package.
When you are done building this library, two files should
exist in this directory:
liblammpscuda.a the library LAMMPS will link against
Makefile.lammps settings the LAMMPS Makefile will import
The latter file will have settings like the following (a setting can be omitted if blank):
user-cuda_SYSINC = -I$(CUDA_INSTALL_PATH)/include
user-cuda_SYSLIB = -lcuda -lcudart -lrt
user-cuda_SYSPATH = -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_INSTALL_PATH)/lib $(CUDA_USRLIB_CONDITIONAL)
SYSINC is for settings needed to compile LAMMPS source files
SYSLIB is for additional system libraries needed by this package
SYSPATH is the path(s) to where those libraries are
You must ensure these settings are correct for your system, else
the LAMMPS build will likely fail.
-------------------------------------------------------------------------
Christian - additional info is needed here about how
to build the lammpscuda lib.
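In the meantime, a plausible build sequence based on the Makefiles in this directory (a sketch, not an official recipe; the "make yes-user-cuda" step assumes the standard LAMMPS package mechanism):

  cd lib/cuda
  # set CUDA_INSTALL_PATH in Makefile.common to your CUDA toolkit
  make precision=2 arch=20     # builds liblammpscuda.a and regenerates Makefile.lammps
  cd ../../src
  make yes-user-cuda
  make <machine>               # your usual LAMMPS machine target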

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int ANGLE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
#include "atom_vec_angle_cuda_cu.h"
void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata)
{
return Cuda_AtomVecCuda_Init<ANGLE_DATA_MASK>(sdata);
}
int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata,n,dim,buf_send);
}
int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
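Every atom-style wrapper in this file follows the same shape: compose a bitmask of the per-atom arrays the style carries, then forward to the shared Cuda_AtomVecCuda_* templates. As a hypothetical sketch, a new style that only adds a molecule ID to the atomic fields would look like this (the Cuda_AtomVecMolCuda_* names are invented for illustration):

  const unsigned int MOL_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;

  void Cuda_AtomVecMolCuda_Init(cuda_shared_data* sdata)
  {
    // the mask selects which device arrays the shared template touches
    return Cuda_AtomVecCuda_Init<MOL_DATA_MASK>(sdata);
  }

  int Cuda_AtomVecMolCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
  {
    // border communication omits forces, images, and velocities
    const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
    return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
  }

Because data_mask is a compile-time template parameter, the unused branches in the shared kernels compile away, so each style pays only for the fields it actually communicates.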

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_
#define ATOM_VEC_ANGLE_CUDA_CU_H_
extern "C" void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send);
extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv);
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv);
#endif /*ATOM_VEC_ANGLE_CUDA_CU_H_*/

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int ATOMIC_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK;
#include "atom_vec_atomic_cuda_cu.h"
void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata)
{
return Cuda_AtomVecCuda_Init<ATOMIC_DATA_MASK>(sdata);
}
int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK;
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata,n,dim,buf_send);
}
int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK;
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK;
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_
#define ATOM_VEC_ATOMIC_CUDA_CU_H_
extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send);
extern "C" int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv);
#endif /*ATOM_VEC_ATOMIC_CUDA_CU_H_*/

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int CHARGE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK;
#include "atom_vec_charge_cuda_cu.h"
void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata)
{
return Cuda_AtomVecCuda_Init<CHARGE_DATA_MASK>(sdata);
}
int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata,n,dim,buf_send);
}
int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK;
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_
#define ATOM_VEC_CHARGE_CUDA_CU_H_
extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send);
extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv);
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv);
#endif /*ATOM_VEC_CHARGE_CUDA_CU_H_*/

View File

@ -1,564 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX atom_vec_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "cuda_wrapper_cu.h"
#include "crm_cuda_utils.cu"
#include "atom_vec_cuda_kernel.cu"
int AtomVecCuda_CountDataItems(unsigned int data_mask)
{
int n=0;
if(data_mask & X_MASK) n+=3;
if(data_mask & V_MASK) n+=3;
if(data_mask & F_MASK) n+=3;
if(data_mask & TAG_MASK) n++;
if(data_mask & TYPE_MASK) n++;
if(data_mask & MASK_MASK) n++;
if(data_mask & IMAGE_MASK) n++;
if(data_mask & Q_MASK) n++;
if(data_mask & MOLECULE_MASK) n++;
if(data_mask & RMASS_MASK) n++;
if(data_mask & RADIUS_MASK) n++;
if(data_mask & DENSITY_MASK) n++;
if(data_mask & OMEGA_MASK) n+=3;
if(data_mask & TORQUE_MASK) n+=3; // torque is a 3-vector, like omega
//if(data_mask & NSPECIAL_MASK) n+=3;
return n;
}
void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata,int size)
{
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_AtomVecCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) );
if(data_mask & Q_MASK) cudaMemcpyToSymbolAsync(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbolAsync(MY_CONST(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*) );
if(data_mask & RADIUS_MASK) cudaMemcpyToSymbolAsync(MY_CONST(radius) , & sdata->atom.radius.dev_data, sizeof(int*) );
if(data_mask & DENSITY_MASK) cudaMemcpyToSymbolAsync(MY_CONST(density) , & sdata->atom.density.dev_data, sizeof(int*) );
if(data_mask & RMASS_MASK) cudaMemcpyToSymbolAsync(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*) );
if(data_mask & OMEGA_MASK) cudaMemcpyToSymbolAsync(MY_CONST(omega) , & sdata->atom.omega.dev_data, sizeof(int*) );
//if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) );
}
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
{
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n"); )
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n"); )
cudaMemcpyToSymbolAsync(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbolAsync(MY_CONST(sublo) , & sdata->domain.sublo, 3*sizeof(X_FLOAT) );
cudaMemcpyToSymbolAsync(MY_CONST(subhi) , & sdata->domain.subhi, 3*sizeof(X_FLOAT) );
cudaMemcpyToSymbolAsync(MY_CONST(flag) , & sdata->flag, sizeof(int*) );
cudaThreadSynchronize();
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n"); )
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=(n*n_data_items)*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemset( sdata->flag,0,sizeof(int));
clock_gettime(CLOCK_REALTIME,&time1);
void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
Cuda_AtomVecCuda_PackComm_Kernel<data_mask><<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n
,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n*n_data_items*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
}
return n_data_items*n;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");)
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=(n*n_data_items)*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
static int count=-1;
count++;
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed");
Cuda_AtomVecCuda_PackComm_Self_Kernel<data_mask><<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_self+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel execution failed");
}
return n_data_items*n;
}
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=(n*n_data_items)*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
if(not sdata->overlap_comm||iswap<0)
cudaMemcpy(sdata->buffer,(void*)buf_recv, n_data_items*n*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
Cuda_AtomVecCuda_UnpackComm_Kernel<data_mask><<<grid, threads,0>>>(n,first,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed");
}
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... start dim %i \n",dim); )
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: pre Kernel execution failed");
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
Cuda_AtomVecCuda_Init<data_mask>(sdata);
int size=n*sizeof(double);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
cudaMemset((int*) (sdata->buffer),0,sizeof(int));
int3 layout=getgrid(sdata->atom.nlocal,sizeof(int),256,true);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_AtomVecCuda_PackExchangeList_Kernel<<<grid, threads,(threads.x+1)*sizeof(int)>>>(n-1,dim);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed");
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_exchange_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost);
int return_value = ((int*) buf_send)[0];
if(n>1+return_value)
cudaMemcpy(buf_send, sdata->buffer, (1+return_value)*sizeof(double), cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed");
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_exchange_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n"); )
return return_value;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... start \n"); )
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1;
int size=(nsend*n_data_items+1)*sizeof(double);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
cudaMemset((int*) (sdata->buffer),0,sizeof(int));
int3 layout=getgrid(nsend,0);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_AtomVecCuda_PackExchange_Kernel<data_mask><<<grid, threads,0>>>(nsend,(int*) copylist);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed");
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_exchange_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_exchange_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... done\n"); )
return nsend*n_data_items+1;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1;
int size=(nsend*n_data_items+1)*sizeof(double);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) );
cudaMemset((int*) (sdata->flag),0,sizeof(int));
if(nsend)
{
int3 layout=getgrid(nsend,0);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
cudaMemcpy(sdata->buffer,buf_send , size, cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_exchange_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
Cuda_AtomVecCuda_UnpackExchange_Kernel<data_mask><<<grid, threads,0>>>(sdata->exchange_dim,nsend,(int*) copylist);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_exchange_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed");
}
}
int naccept;
cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
return naccept;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=nsend*n_data_items*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0];
dy = pbc[1];
dz = pbc[2];
}}
int3 layout=getgrid(nsend);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_AtomVecCuda_PackBorder_Kernel<data_mask><<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,nsend,sdata->comm.maxlistlength,iswap,dx,dy,dz);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_border_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed");
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_border_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
}
return nsend*n_data_items;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=n*n_data_items*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0];
dy = pbc[1];
dz = pbc[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_AtomVecCuda_PackBorder_Self_Kernel<data_mask><<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_border_kernel_self+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed");
}
return n*n_data_items;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=n*n_data_items*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
cudaMemset((int*) (sdata->flag),0,sizeof(int));
cudaMemcpy(sdata->buffer,(void*)buf_recv, size, cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_border_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
Cuda_AtomVecCuda_UnpackBorder_Kernel<data_mask><<<grid, threads,0>>>(n,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_border_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
cudaMemcpy(&sdata->comm.grow_flag,sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed");
}
return sdata->comm.grow_flag;
}
#include "atom_vec_angle_cuda.cu"
#include "atom_vec_atomic_cuda.cu"
#include "atom_vec_charge_cuda.cu"
#include "atom_vec_full_cuda.cu"
//#include "atom_vec_granular_cuda.cu"

View File

@ -1,371 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define RIMLARGER 1.000001
#define RIMSMALLER 0.999999
#define SMALL 1e-5
extern __shared__ int shared[];
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>_nmax) _flag[0]=1;
int k=0;
if(data_mask & X_MASK){
((X_FLOAT*) buffer)[i+k*n]=_x[j] + dx; k++;
((X_FLOAT*) buffer)[i+k*n] = _x[j+_nmax] + dy; k++;
((X_FLOAT*) buffer)[i+k*n] = _x[j+2*_nmax] + dz; k++;}
if(data_mask & V_MASK){
((X_FLOAT*) buffer)[i+k*n]=_v[j]; k++;
((X_FLOAT*) buffer)[i+k*n] = _v[j+_nmax]; k++;
((X_FLOAT*) buffer)[i+k*n] = _v[j+2*_nmax]; k++;}
if(data_mask & OMEGA_MASK){
((X_FLOAT*) buffer)[i+k*n]=_omega[j]; k++;
((X_FLOAT*) buffer)[i+k*n] = _omega[j+_nmax]; k++;
((X_FLOAT*) buffer)[i+k*n] = _omega[j+2*_nmax]; k++;}
if(data_mask & RADIUS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_radius[j]; k++;
if(data_mask & RMASS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_rmass[j]; k++;
}
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(data_mask & X_MASK){
_x[i+first]=_x[j] + dx;
_x[i+first+_nmax] = _x[j+_nmax] + dy;
_x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;}
if(data_mask & V_MASK){
_v[i+first]=_v[j];
_v[i+first+_nmax] = _v[j+_nmax];
_v[i+first+2*_nmax] = _v[j+2*_nmax];}
if(data_mask & OMEGA_MASK) {
_omega[i+first] = _omega[j];
_omega[i+first+_nmax] = _omega[j+_nmax];
_omega[i+first+2*_nmax] = _omega[j+2*_nmax];}
if(data_mask & RADIUS_MASK) _radius[i+first]=_radius[j];
if(data_mask & RMASS_MASK) _rmass[i+first]=_rmass[j];
}
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n,int first,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
int k=0;
if(data_mask & X_MASK){
_x[i+first]=((X_FLOAT*) buffer)[i+k*n]; k++;
_x[i+first+_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;
_x[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;}
if(data_mask & V_MASK){
_v[i+first]=((X_FLOAT*) buffer)[i+k*n]; k++;
_v[i+first+_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;
_v[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;}
if(data_mask & OMEGA_MASK){
_omega[i+first]=((X_FLOAT*) buffer)[i+k*n]; k++;
_omega[i+first+_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;
_omega[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;}
if(data_mask & RADIUS_MASK) _radius[i+first] = ((X_FLOAT*) buffer)[i+k*n]; k++;
if(data_mask & RMASS_MASK) _rmass[i+first] = ((X_FLOAT*) buffer)[i+k*n]; k++;
}
}
__global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n,int dim)
{
double* buf=(double*) _buffer;
buf=&buf[1];
//X_FLOAT lo=slablo[iswap];
//X_FLOAT hi=slabhi[iswap];
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
bool add=false;
if(i<_nlocal)
{
double xdim_tmp=static_cast <double> (_x[i+dim*_nmax]);
if (xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim])
{
add=true;
}
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend+1<n)
buf[nsend]=i;
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist)
{
double* buf=(double*) _buffer;
int k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(k>=nsend) return;
buf=&buf[1+k];
int i=static_cast <int> (buf[0]);
int j=copylist[k];
int m=1;
if(data_mask & X_MASK){
buf[(m++)*nsend] = static_cast <double> (_x[i]);
buf[(m++)*nsend] = static_cast <double> (_x[i+_nmax]);
buf[(m++)*nsend] = static_cast <double> (_x[i+2*_nmax]);}
if(data_mask & V_MASK){
buf[(m++)*nsend] = _v[i];
buf[(m++)*nsend] = _v[i+_nmax];
buf[(m++)*nsend] = _v[i+2*_nmax];}
if(data_mask & TAG_MASK) buf[(m++)*nsend] = _tag[i];
if(data_mask & TYPE_MASK) buf[(m++)*nsend] = _type[i];
if(data_mask & MASK_MASK) buf[(m++)*nsend] = _mask[i];
if(data_mask & IMAGE_MASK) buf[(m++)*nsend] = _image[i];
if(data_mask & Q_MASK) buf[(m++)*nsend] = _q[i];
if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i];
if(data_mask & RADIUS_MASK) buf[(m++)*nsend] = _radius[i];
if(data_mask & DENSITY_MASK) buf[(m++)*nsend] = _density[i];
if(data_mask & RMASS_MASK) buf[(m++)*nsend] = _rmass[i];
if(data_mask & OMEGA_MASK) {
buf[(m++)*nsend] = _omega[i];
buf[(m++)*nsend] = _omega[i+_nmax];
buf[(m++)*nsend] = _omega[i+2*_nmax];}
/* if(data_mask & NSPECIAL_MASK)
{
buf[(m++)*nsend] = _nspecial[i];
buf[(m++)*nsend] = _nspecial[i+_nmax];
buf[(m++)*nsend] = _nspecial[i+2* _nmax];
}*/
if(i>=_nlocal) return;
if(data_mask & X_MASK){
_x[i] = _x[j];
_x[i+_nmax] = _x[j+_nmax];
_x[i+2*_nmax] = _x[j+2*_nmax];}
if(data_mask & V_MASK){
_v[i] = _v[j];
_v[i+_nmax] = _v[j+_nmax];
_v[i+2*_nmax] = _v[j+2*_nmax];}
if(data_mask & TAG_MASK) _tag[i] = _tag[j];
if(data_mask & TYPE_MASK) _type[i] = _type[j];
if(data_mask & MASK_MASK) _mask[i] = _mask[j];
if(data_mask & IMAGE_MASK) _image[i] = _image[j];
if(data_mask & Q_MASK) _q[i] = _q[j];
if(data_mask & MOLECULE_MASK) _molecule[i]= _molecule[j];
if(data_mask & RADIUS_MASK) _radius[i] = _radius[j];
if(data_mask & DENSITY_MASK) _density[i] = _density[j];
if(data_mask & RMASS_MASK) _rmass[i] = _rmass[j];
if(data_mask & OMEGA_MASK)
{
_omega[i] = _omega[j];
_omega[i+_nmax] = _omega[j+_nmax];
_omega[i+2*_nmax] = _omega[j+2*_nmax];
}
/* if(data_mask & NSPECIAL_MASK)
{
_nspecial[i] = _nspecial[j];
_nspecial[i+_nmax] = _nspecial[j+_nmax];
_nspecial[i+2* _nmax] = _nspecial[j+2* _nmax];
}*/
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim,int nsend,int* copylist)
{
double* buf=(double*) _buffer;
int k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(k>=nsend) return;
buf=&buf[1+k];
int i=-1;
double xdim_tmp = buf[(1+dim)*nsend];
if(xdim_tmp>=_sublo[dim]-SMALL && xdim_tmp<_subhi[dim]+SMALL)
{
i=atomicAdd(_flag,1)+_nlocal;
int m=1;
if(data_mask & X_MASK){
_x[i] = buf[(m++)*nsend];
_x[i+_nmax] = buf[(m++)*nsend];
_x[i+2*_nmax] = buf[(m++)*nsend];}
if(data_mask & V_MASK){
_v[i] = buf[(m++)*nsend];
_v[i+_nmax] = buf[(m++)*nsend];
_v[i+2*_nmax] = buf[(m++)*nsend];}
if(data_mask & TAG_MASK) _tag[i] = buf[(m++)*nsend];
if(data_mask & TYPE_MASK) _type[i] = buf[(m++)*nsend];
if(data_mask & MASK_MASK) _mask[i] = buf[(m++)*nsend];
if(data_mask & IMAGE_MASK) _image[i] = buf[(m++)*nsend];
if(data_mask & Q_MASK) _q[i] = buf[(m++)*nsend];
if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++)*nsend];
if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++)*nsend];
if(data_mask & DENSITY_MASK) _density[i] = buf[(m++)*nsend];
if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++)*nsend];
if(data_mask & OMEGA_MASK)
{
_omega[i] = buf[(m++)*nsend];
_omega[i+_nmax] = buf[(m++)*nsend];
_omega[i+2*_nmax] = buf[(m++)*nsend];
}
/* if(data_mask & NSPECIAL_MASK)
{
_nspecial[i] = buf[(m++)*nsend];
_nspecial[i+_nmax] = buf[(m++)*nsend];
_nspecial[i+2*_nmax] = buf[(m++)*nsend];
}*/
}
copylist[k]=i;
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
int m=0;
if(data_mask & X_MASK) {
((X_FLOAT*) _buffer)[i+(m++)*n]= _x[j] + dx;
((X_FLOAT*) _buffer)[i+(m++)*n] = _x[j+_nmax] + dy;
((X_FLOAT*) _buffer)[i+(m++)*n] = _x[j+2*_nmax] + dz;}
if(data_mask & V_MASK) {
((X_FLOAT*) _buffer)[i+(m++)*n]= _v[j];
((X_FLOAT*) _buffer)[i+(m++)*n] = _v[j+_nmax];
((X_FLOAT*) _buffer)[i+(m++)*n] = _v[j+2*_nmax];}
if(data_mask & TAG_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _tag[j];
if(data_mask & TYPE_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _type[j];
if(data_mask & MASK_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _mask[j];
if(data_mask & Q_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _q[j];
if(data_mask & MOLECULE_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _molecule[j];
if(data_mask & RADIUS_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _radius[i];
if(data_mask & DENSITY_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _density[i];
if(data_mask & RMASS_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _rmass[i];
if(data_mask & OMEGA_MASK) {
((X_FLOAT*) _buffer)[i+(m++)*n] = _omega[i];
((X_FLOAT*) _buffer)[i+(m++)*n] = _omega[i+_nmax];
((X_FLOAT*) _buffer)[i+(m++)*n] = _omega[i+2*_nmax];}
}
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(data_mask & X_MASK) {
_x[i+first]= _x[j] + dx;
_x[i+first+_nmax] = _x[j+_nmax] + dy;
_x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;}
if(data_mask & V_MASK) {
_v[i+first]= _v[j];
_v[i+first+_nmax] = _v[j+_nmax];
_v[i+first+2*_nmax] = _v[j+2*_nmax];}
if(data_mask & TAG_MASK) _tag[i+first] = _tag[j];
if(data_mask & TYPE_MASK) _type[i+first] = _type[j];
if(data_mask & MASK_MASK) _mask[i+first] = _mask[j];
if(data_mask & Q_MASK) _q[i+first] = _q[j];
if(data_mask & MOLECULE_MASK) _molecule[i+first] = _molecule[j];
if(data_mask & RADIUS_MASK) _radius[i+first] = _radius[j];
if(data_mask & DENSITY_MASK) _density[i+first] = _density[j];
if(data_mask & RMASS_MASK) _rmass[i+first] = _rmass[j];
if(data_mask & OMEGA_MASK) {
_omega[i+first]= _omega[j];
_omega[i+first+_nmax] = _omega[j+_nmax];
_omega[i+first+2*_nmax] = _omega[j+2*_nmax];}
}
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
if(i+first<_nmax)
{
int m=0;
if(data_mask & X_MASK) {
_x[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
_x[i+first+_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];
_x[i+first+2*_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];}
if(data_mask & V_MASK) {
_v[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
_v[i+first+_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];
_v[i+first+2*_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];}
if(data_mask & TAG_MASK) _tag[i+first] = static_cast<int> (((X_FLOAT*) _buffer)[i+(m++)*n]);
if(data_mask & TYPE_MASK) _type[i+first] = static_cast<int> (((X_FLOAT*) _buffer)[i+(m++)*n]);
if(data_mask & MASK_MASK) _mask[i+first] = static_cast<int> (((X_FLOAT*) _buffer)[i+(m++)*n]);
if(data_mask & Q_MASK) _q[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
if(data_mask & MOLECULE_MASK) _molecule[i+first] = static_cast<int> (((X_FLOAT*) _buffer)[i+(m++)*n]);
if(data_mask & RADIUS_MASK) _radius[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
if(data_mask & DENSITY_MASK) _density[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
if(data_mask & RMASS_MASK) _rmass[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
if(data_mask & OMEGA_MASK) {
_omega[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
_omega[i+first+_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];
_omega[i+first+2*_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];}
}
else
{
_flag[0]=1;
}
}
}
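The exchange-list kernel above (Cuda_AtomVecCuda_PackExchangeList_Kernel) is worth unpacking: each thread flags whether its atom leaves the subdomain, thread 0 turns the flags into 1-based ranks within the block, and a single atomicAdd reserves the block's slice of the output buffer. A stripped-down sketch of that stream-compaction pattern (a standalone illustration; compact, out, and count are invented names):

  __global__ void compact(const double* x, int n, double lo, double hi, int* out, int* count)
  {
    extern __shared__ int flags[];                    // launch with (blockDim.x+1)*sizeof(int)
    int i = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x + threadIdx.x;
    bool add = (i < n) && (x[i] < lo || x[i] >= hi);  // the "leaves the subdomain" predicate
    flags[threadIdx.x] = add ? 1 : 0;
    __syncthreads();
    if (threadIdx.x == 0) {                           // serial in-block prefix count
      int nsend = 0;
      for (int k = 0; k < blockDim.x; k++)
        if (flags[k]) flags[k] = ++nsend;             // 1-based rank within the block
      flags[blockDim.x] = atomicAdd(count, nsend);    // block's base offset in the output
    }
    __syncthreads();
    if (add) out[flags[blockDim.x] + flags[threadIdx.x] - 1] = i;
  }

The serial loop in thread 0 costs O(blockDim.x), but it avoids one atomicAdd per leaving atom, which is the same trade-off the original kernel makes.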

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int FULL_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK;
#include "atom_vec_full_cuda_cu.h"
void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata)
{
return Cuda_AtomVecCuda_Init<FULL_DATA_MASK>(sdata);
}
int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata,n,dim,buf_send);
}
int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}

View File

@@ -1,15 +0,0 @@
#ifndef ATOM_VEC_FULL_CUDA_CU_H_
#define ATOM_VEC_FULL_CUDA_CU_H_
extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send);
extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv);
extern "C" int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv);
#endif /*ATOM_VEC_FULL_CUDA_CU_H_*/

View File

@@ -1,196 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
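// Host-side drivers for spatial binning on the GPU (compiled only when
// CUDA_USE_BINNING is defined): atoms are sorted into bins, and copied back
// out of them, entirely on the device.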
#ifdef CUDA_USE_BINNING
#include <stdio.h>
#define MY_PREFIX binning
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "binning_cu.h"
#include "binning_kernel.cu"
void Cuda_PreBinning(cuda_shared_data* sdata)
{
// initialize only on first call
static short init = 0;
if(! init)
{
init = 1;
int cuda_dummy_type = sdata->atom.ntypes + 1;
X_FLOAT outside[3] =
{
(sdata->domain.subhi[0] - sdata->domain.sublo[0])/1000.0,
(sdata->domain.subhi[1] - sdata->domain.sublo[1])/1000.0,
(sdata->domain.subhi[2] - sdata->domain.sublo[2])/1000.0
};
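// dummy atoms pad unused bin slots: they carry a type no real atom has and
// coordinates slightly outside the subdomain, so later kernels can treat
// every bin as fully occupied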
cudaMemcpyToSymbol("binned_size_all" , & sdata->atom.binned_type.dim[0] , sizeof(unsigned) );
cudaMemcpyToSymbol("cuda_dummy_type" , & cuda_dummy_type , sizeof(int) );
cudaMemcpyToSymbol("outside" , & outside , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(binned_type), & sdata->atom.binned_type.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_x) , & sdata->atom.binned_x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , sizeof(X_FLOAT)*3);
// bin_nmax == blockDim.x
// printf("# CUDA: MY_CONST(binned_type) = %s\n", MY_CONST(binned_type));
// int* p = pre_binning_binned_type; // pre_binning_binned_type is defined here!!
}
dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_dim[2], 1);
dim3 threads(sdata->domain.bin_nmax, 1, 1);
MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning grid = (%u, %u, %u)\n", grid.x, grid.y, grid.z);)
MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning threads = (%u, %u, %u)\n", threads.x, threads.y, threads.z); )
PreBinning_Kernel<<<grid, threads>>> ();
cudaThreadSynchronize();
MYDBG(printf("ERROR-CUDA pre_binning: %s\n",cudaGetErrorString(cudaGetLastError())));
CUT_CHECK_ERROR("Cuda_PreBinning: binning Kernel execution failed");
}
void Cuda_Binning(cuda_shared_data* sdata)
{
MYDBG( // check assumption in debug mode
if(sdata->atom.x.dim[1] != 3)
{
printf("# CUDA: Cuda_Binning: binning error: atom array dimensions not Nx3\n");
return;
}
)
// initialize only on first call
static short init = 0;
if(! init)
{
init = 1;
X_FLOAT const_rez_bin_size[3] =
{
(1.0 * sdata->domain.bin_dim[0]-4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
(1.0 * sdata->domain.bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
(1.0 * sdata->domain.bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
};
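// reciprocal bin widths (bins per unit length); the -4 leaves two layers of
// ghost bins on each side of the subdomain, matching the +2 offset applied
// to the bin coordinates in Binning_Kernel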
cudaMemcpyToSymbol("bin_error_count" , & sdata->atom.bin_error_count.dev_data, sizeof(X_FLOAT)*1);
cudaMemcpyToSymbol("rez_bin_size" , & const_rez_bin_size , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) );
cudaMemcpyToSymbol(MY_CONST(bin_nmax) , & sdata->domain.bin_nmax , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_rmass) , & sdata->atom.binned_rmass .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binpos) , & sdata->atom.binpos .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nghost) , & sdata->atom.nghost , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
}
dim3 grid((unsigned)(1 + sdata->atom.nlocal/64.0), 1, 1);
MYDBG( printf("# CUDA: Cuda_Binning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); )
dim3 threads(64, 1, 1);
cudaMemset((int*) (sdata->atom.bin_count_all.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2]));
cudaMemset((int*) (sdata->atom.bin_count_local.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2]));
cudaMemset(sdata->atom.bin_error_count.dev_data,0,sizeof(int)*1);
int binning_error_l[1];
Binning_Kernel<<<grid, threads>>> (
(X_FLOAT*) (sdata->atom. x.dev_data),
(X_FLOAT*) (sdata->atom.binned_x.dev_data),
sdata->atom.q_flag,
0,
sdata->atom.rmass_flag
);
cudaThreadSynchronize();
cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost);
if(binning_error_l[0]!=0)
{
printf("CUDA-ERROR: binning local: could not bin %i atoms\n",binning_error_l[0]);
}
CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
grid.x=(unsigned)(1 + (sdata->atom.nall-sdata->atom.nlocal)/32.0);
MYDBG( printf("# CUDA: Cuda_Binning Ghost: grid dim.x = %u\n", grid.x); )
Binning_Kernel<<<grid, threads>>> (
(X_FLOAT*) (sdata->atom. x.dev_data),
(X_FLOAT*) (sdata->atom.binned_x.dev_data),
sdata->atom.q_flag,
sdata->atom.nlocal,
sdata->atom.rmass_flag
);
cudaThreadSynchronize();
cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost);
if(binning_error_l[0]!=0) printf("CUDA-ERROR: binning ghost: could not bin %i atoms\n",binning_error_l[0]);
}
void Cuda_ReverseBinning(cuda_shared_data* sdata)
{
// initialize only on first call
static short init = 0;
if(! init)
{
init = 1;
cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) );
cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
}
dim3 grid((unsigned)(1 + sdata->atom.nlocal/32.0), 1, 1);
MYDBG( printf("# CUDA: Cuda_ReverseBinning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); )
dim3 threads(32, 1, 1);
ReverseBinning_Kernel<<<grid, threads>>> (
(X_FLOAT*) (sdata->atom. x.dev_data),
(X_FLOAT*) (sdata->atom.binned_x.dev_data),
sdata->atom.q_flag
);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Binning: reverse binning Kernel execution failed");
}
#endif

View File

@@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_PreBinning(cuda_shared_data* sdata);
extern "C" void Cuda_Binning(cuda_shared_data* sdata);
extern "C" void Cuda_ReverseBinning(cuda_shared_data* sdata);

View File

@@ -1,149 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// load some variables from shared cuda data into device's constant memory:
__device__ __constant__ X_FLOAT rez_bin_size[3];
__device__ __constant__ unsigned* bin_error_count;
__device__ __constant__ int cuda_dummy_type;
__device__ __constant__ unsigned binned_size_all;
__device__ __constant__ X_FLOAT outside[3];
__global__ void PreBinning_Kernel()
{
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(bin < gridDim.x * gridDim.y) // TODO: suspected always to be true
{
_binned_type[blockDim.x * bin + threadIdx.x] = cuda_dummy_type;
const int i = 3*blockDim.x * bin + threadIdx.x;
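// per-bin coordinate layout: x, y and z each occupy a blockDim.x-wide block
// (blockDim.x == bin_nmax), so component c of slot s lives at
// 3*bin_nmax*bin + c*bin_nmax + s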
X_FLOAT* binned_x = _binned_x + i; *binned_x = _subhi[0] + outside[0] * (1+i);
binned_x += blockDim.x; *binned_x = _subhi[1] + outside[1] * (1+i);
binned_x += blockDim.x; *binned_x = _subhi[2] + outside[2] * (1+i);
_binned_tag[blockDim.x * bin + threadIdx.x]=-1; // tag uses the scalar per-bin layout (bin_nmax*bin + slot), not the 3-component coordinate index
}
}
__global__ void Binning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag, int offset, int rmass_flag)
{
const unsigned i = blockDim.x * blockIdx.x + threadIdx.x+offset;
int binatoms=_natoms;
if(offset==0) binatoms=_nlocal ;
if(i < binatoms)
{
// copy atom position from global device memory to local register
// in this 3 steps to get as much coalesced access as possible
X_FLOAT my_xX, my_xY, my_xZ;
x += i; my_xX = *x;
x += _nmax; my_xY = *x;
x += _nmax; my_xZ = *x;
//my_xX=x[i];
//my_xY=x[i+_nmax];
//my_xZ=x[i+2*_nmax];
// calculate flat bin index
int bx=__float2int_rd(rez_bin_size[0] * (my_xX - _sublo[0]))+2;
int by=__float2int_rd(rez_bin_size[1] * (my_xY - _sublo[1]))+2;
int bz=__float2int_rd(rez_bin_size[2] * (my_xZ - _sublo[2]))+2;
bx-=bx*negativCUDA(1.0f*bx);
bx-=(bx-_bin_dim.x+1)*negativCUDA(1.0f*_bin_dim.x-1.0f-1.0f*bx);
by-=by*negativCUDA(1.0f*by);
by-=(by-_bin_dim.y+1)*negativCUDA(1.0f*_bin_dim.y-1.0f-1.0f*by);
bz-=bz*negativCUDA(1.0f*bz);
bz-=(bz-_bin_dim.z+1)*negativCUDA(1.0f*_bin_dim.z-1.0f-1.0f*bz);
const unsigned j = _bin_dim.z * ( _bin_dim.y *bx+by)+bz;
// add new atom to bin, get bin-array position
const unsigned k = atomicAdd(& _bin_count_all[j], 1);
if(offset==0) atomicAdd(& _bin_count_local[j], 1);
if(k < _bin_nmax)
{
// copy register values back to global device memory
unsigned pos = 3*_bin_nmax * j + k;
_binpos[i]=pos;
binned_x += pos; *binned_x = my_xX;
binned_x += _bin_nmax; *binned_x = my_xY;
binned_x += _bin_nmax; *binned_x = my_xZ;
// also copy velocity and force accordingly
binned_x = _binned_v + pos; x = _v + i; *binned_x = *x;
binned_x += _bin_nmax; x += _nmax; *binned_x = *x;
binned_x += _bin_nmax; x += _nmax; *binned_x = *x;
binned_x = _binned_f + pos; x = _f + i; *binned_x = *x;
binned_x += _bin_nmax; x += _nmax; *binned_x = *x;
binned_x += _bin_nmax; x += _nmax; *binned_x = *x;
pos = _bin_nmax * j + k;
_binned_type [pos] = _type[i];
_binned_tag [pos] = _tag[i];
if(rmass_flag)
_binned_rmass[pos] = _rmass[i];
if(q_flag)
_binned_q [pos] = _q[i];
}
else
{ // normally, this should not happen:
atomicAdd(bin_error_count, 1);
MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j); )
}
}
}
__global__ void ReverseBinning_Kernel(X_FLOAT* x, X_FLOAT* binned_x,int q_flag)
{
const unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < _nlocal)
{
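// _binpos holds the coordinate index bin_pos3 = 3*_bin_nmax*j + k for bin j,
// slot k; the arithmetic below recovers the scalar index _bin_nmax*j + k
// used by the type/tag/q arrays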
unsigned bin_pos3 = _binpos[i];
unsigned bin_pos=bin_pos3/(3*_bin_nmax);
bin_pos*=_bin_nmax;
bin_pos+=bin_pos3-bin_pos*3;
binned_x = _binned_x + bin_pos3; x = x + i; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x = _binned_v + bin_pos3; x = _v + i; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x = _binned_f + bin_pos3; x = _f + i; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
_type[i] = _binned_type[bin_pos];
_tag[i] = _binned_tag[bin_pos];
if(q_flag) _q[i] = _binned_q[bin_pos];
}
}

View File

@@ -1,485 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX comm_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "comm_cuda_cu.h"
#include "comm_cuda_kernel.cu"
#include <ctime>
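// Host drivers for forward/reverse halo communication; the clock_gettime
// pairs below accumulate per-phase timings in sdata->cuda_timings.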
void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n)
{
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbolAsync(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbolAsync(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbolAsync(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbolAsync(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbolAsync(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
}
void Cuda_CommCuda_Init(cuda_shared_data* sdata)
{
Cuda_CommCuda_UpdateNmax(sdata);
int ntypesp=sdata->atom.ntypes+1;
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , &ntypesp, sizeof(int));
cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata, sizeof(int*));
}
int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
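// dx/dy/dz shift atoms that are sent across periodic boundaries; triclinic
// boxes add the tilt-factor (xy, xz, yz) contributions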
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemset( sdata->flag,0,sizeof(int));
clock_gettime(CLOCK_REALTIME,&time1);
void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
Cuda_CommCuda_PackComm_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n
,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
}
return 3*n;
}
int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*6*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,2*n); // UpdateBuffer sizes for 3 X_FLOATs per atom; positions plus velocities need 6
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemset( sdata->flag,0,sizeof(int));
clock_gettime(CLOCK_REALTIME,&time1);
void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
Cuda_CommCuda_PackCommVel_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n
,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n*6*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag!=0) printf("aflag PackCommVel: %i\n",aflag);
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
}
return 6*n;
}
int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_CommCuda_PackComm_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_self+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
}
return 3*n;
}
int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*6*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,2*n); // UpdateBuffer sizes for 3 X_FLOATs per atom; positions plus velocities need 6
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_CommCuda_PackCommVel_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_self+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
}
return 6*n;
}
void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
if(not sdata->overlap_comm||iswap<0)
cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
Cuda_CommCuda_UnpackComm_Kernel<<<grid, threads,0>>>(n,first,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
}
}
void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*6*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,2*n); // UpdateBuffer sizes for 3 X_FLOATs per atom; positions plus velocities need 6
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
if(not sdata->overlap_comm||iswap<0)
cudaMemcpy(sdata->buffer,(void*)buf_recv, n*6*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
Cuda_CommCuda_UnpackCommVel_Kernel<<<grid, threads,0>>>(n,first,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
}
}
int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send)
{
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(F_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
F_FLOAT* buf=(F_FLOAT*)buf_send;
F_FLOAT* f_dev=(F_FLOAT*)sdata->atom.f.dev_data;
f_dev+=first;
cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
buf+=n; f_dev+=sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
buf+=n; f_dev+=sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
return n*3;
}
void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv)
{
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(F_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemcpy(sdata->buffer,buf_recv, size, cudaMemcpyHostToDevice);
Cuda_CommCuda_UnpackReverse_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
}
}
void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first)
{
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_CommCuda_UnpackReverse_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed");
}
}
int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap)
{
MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new or (80>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,10);
int n;
if (!bordergroup || ineed >= 2)
n=nlast-nfirst+1;
else
{
n=atom_nfirst;
if(nlast-sdata->atom.nlocal+1>n) n=nlast-sdata->atom.nlocal+1;
}
int3 layout=getgrid(n,0,512,true);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x+1, layout.y, 1);
cudaMemset((int*) (sdata->buffer),0,sizeof(int));
clock_gettime(CLOCK_REALTIME,&time1);
if(style==1)
Cuda_CommCuda_BuildSendlist_Single<<<grid, threads,(threads.x+1)*sizeof(int)>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.slablo.dev_data,(X_FLOAT*) sdata->comm.slabhi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength);
else
Cuda_CommCuda_BuildSendlist_Multi<<<grid, threads,(threads.x+1)*sizeof(int)>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.multilo.dev_data,(X_FLOAT*) sdata->comm.multihi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_border_kernel_buildlist+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed");
int nsend;
cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
return nsend;
}

View File

@@ -1,35 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag);
extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag);
extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag);
extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag);
extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1);
extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1);
extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send);
extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv);
extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first);
extern "C" int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap);

View File

@@ -1,353 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
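// Device kernels for forward (positions, optionally velocities) and reverse
// (forces) communication. Buffers are component-major: n x-values, then n
// y-values, then n z-values; velocities follow in the same layout.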
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>=_nmax) _flag[0]=1;
((X_FLOAT*) buffer)[i]=_x[j] + dx;
((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy;
((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz;
}
}
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>=_nmax) _flag[0]=1;
((X_FLOAT*) buffer)[i]=_x[j] + dx;
((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy;
((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz;
((X_FLOAT*) buffer)[i+3*n]=_v[j];
((X_FLOAT*) buffer)[i+4*n] = _v[j+_nmax];
((X_FLOAT*) buffer)[i+5*n] = _v[j+2*_nmax];
}
}
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
_x[i+first]=_x[j] + dx;
_x[i+first+_nmax] = _x[j+_nmax] + dy;
_x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;
}
}
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
_x[i+first]=_x[j] + dx;
_x[i+first+_nmax] = _x[j+_nmax] + dy;
_x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;
_v[i+first]=_v[j];
_v[i+first+_nmax] = _v[j+_nmax];
_v[i+first+2*_nmax] = _v[j+2*_nmax];
}
}
__global__ void Cuda_CommCuda_UnpackComm_Kernel(int n,int first,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
_x[i+first]=((X_FLOAT*) buffer)[i];
_x[i+first+_nmax]=((X_FLOAT*) buffer)[i+1*n];
_x[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+2*n];
}
}
__global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n,int first,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
_x[i+first]=((X_FLOAT*) buffer)[i];
_x[i+first+_nmax]=((X_FLOAT*) buffer)[i+1*n];
_x[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+2*n];
_v[i+first]=((X_FLOAT*) buffer)[i+3*n];
_v[i+first+_nmax]=((X_FLOAT*) buffer)[i+4*n];
_v[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+5*n];
}
}
__global__ void Cuda_CommCuda_PackReverse_Kernel(int n,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
((F_FLOAT*) _buffer)[i]=_f[i+first];
((F_FLOAT*) _buffer)[i+n] = _f[i+first+_nmax];
((F_FLOAT*) _buffer)[i+2*n] = _f[i+first+2*_nmax];
}
}
__global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist,int n,int maxlistlength,int iswap)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
_f[j]+=((F_FLOAT*)_buffer)[i];
_f[j+_nmax]+=((F_FLOAT*) _buffer)[i+n];
_f[j+2*_nmax]+=((F_FLOAT*) _buffer)[i+2*n];
}
}
__global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
_f[j]+=_f[i+first];
_f[j+_nmax]+=_f[i+first+_nmax];
_f[j+2*_nmax]+=_f[i+first+2*_nmax];
}
}
extern __shared__ int shared[];
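// Sendlist construction: every thread flags its atom in shared memory,
// thread 0 serially turns the flags into 1-based ranks and reserves a
// contiguous segment of the global list via atomicAdd on _buffer (which
// holds the running send count), then each flagged thread writes its atom
// index into its reserved slot.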
__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup,int ineed,int atom_nfirst,
int nfirst,int nlast,int dim,int iswap,X_FLOAT* slablo, X_FLOAT* slabhi,int* sendlist,int maxlistlength)
{
int* list=sendlist+iswap*maxlistlength;
X_FLOAT lo=slablo[iswap];
X_FLOAT hi=slabhi[iswap];
bool add=false;
if (!bordergroup || ineed >= 2) {
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst;
if(i<nlast)
if (_x[i+dim*_nmax] >= lo && _x[i+dim*_nmax] <= hi) {
add=true;
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
} else {
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<atom_nfirst)
if (_x[i+dim*_nmax] >= lo && _x[i+dim*_nmax] <= hi) {
add=true;
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
__syncthreads();
add=false;
i+=_nlocal;
if(i < nlast)
if (_x[i+dim*_nmax] >= lo && _x[i+dim*_nmax] <= hi) {
add=true;
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
}
}
__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup,int ineed,int atom_nfirst
,int nfirst,int nlast,int dim,int iswap,X_FLOAT* multilo, X_FLOAT* multihi,int* sendlist,int maxlistlength)
{
int* list=sendlist+iswap*maxlistlength;
X_FLOAT* mlo=&multilo[iswap*_cuda_ntypes];
X_FLOAT* mhi=&multihi[iswap*_cuda_ntypes];
int itype=0;
bool add=false;
if (!bordergroup || ineed >= 2) {
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst;
if(i<nlast)
{
itype=_type[i];
if (_x[i+dim*_nmax] >= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) {
add=true;
}
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
} else {
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<atom_nfirst)
{
itype=_type[i];
if (_x[i+dim*_nmax] >= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) {
add=true;
}
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
__syncthreads();
add=false;
i+=_nlocal;
if(i < nlast)
{
itype = _type[i];
if (_x[i+dim*_nmax] >= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) {
add=true;
}
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
}
}

View File

@@ -1,123 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX compute_temp_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "compute_temp_cuda_cu.h"
#include "compute_temp_cuda_kernel.cu"
void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) );
if(sdata->atom.rmass_flag)
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
}
void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata)
{
Cuda_ComputeTempCuda_UpdateNmax(sdata);
}
void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_ComputeTempCuda_Vector_Kernel<<<grid, threads,threads.x*6*sizeof(ENERGY_FLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=6;
grid.y=1;
threads.x=512;
Cuda_ComputeTempCuda_Reduce_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed");
}
}
void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempCuda_UpdateBuffer(sdata);
MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n",sdata->atom.nlocal);)
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel");
Cuda_ComputeTempCuda_Scalar_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=1;
grid.y=1;
threads.x=512;
Cuda_ComputeTempCuda_Reduce_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed");
}
}

View File

@@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t);
extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t);

View File

@@ -1,109 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_FLOAT sharedmem[];
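// Temperature computation: per-block partial sums of m*v*v terms are written
// to _buffer and combined by Cuda_ComputeTempCuda_Reduce_Kernel.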
__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
if(i < _nlocal)
{
if (_rmass_flag) {
if (_mask[i] & groupbit)
sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * _rmass[i];
} else {
if (_mask[i] & groupbit)
sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * (_mass[_type[i]]);
}
}
reduceBlock(sharedmem);
ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[(blockIdx.x*gridDim.y+blockIdx.y)]=sharedmem[0];
}
}
__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
sharedmem[threadIdx.x+3*blockDim.x]=0;
sharedmem[threadIdx.x+4*blockDim.x]=0;
sharedmem[threadIdx.x+5*blockDim.x]=0;
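// six accumulators: the kinetic tensor components xx, yy, zz, xy, xz, yz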
if(i < _nlocal)
if (_mask[i] & groupbit) {
V_FLOAT massone;
if (_rmass_flag) massone = _rmass[i];
else massone = _mass[_type[i]];
sharedmem[threadIdx.x] = massone * _v[i]*_v[i];
sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax];
sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax];
sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax];
sharedmem[threadIdx.x+4*blockDim.x] = massone * _v[i]*_v[i+2*_nmax];
sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax];
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
reduceBlock(&sharedmem[3*blockDim.x]);
reduceBlock(&sharedmem[4*blockDim.x]);
reduceBlock(&sharedmem[5*blockDim.x]);
ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[(blockIdx.x*gridDim.y+blockIdx.y)]=sharedmem[0];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+4*gridDim.x*gridDim.y]=sharedmem[4*blockDim.x];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+5*gridDim.x*gridDim.y]=sharedmem[5*blockDim.x];
}
}
__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t)
{
int i=0;
sharedmem[threadIdx.x]=0;
ENERGY_FLOAT myforig=0.0;
ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer;
buf=&buf[blockIdx.x*n];
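// each block reduces one n-element array of per-block partial sums,
// striding through it blockDim.x elements at a time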
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
t[blockIdx.x]=myforig;
}

View File

@@ -1,161 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX compute_temp_partial_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "compute_temp_partial_cuda_cu.h"
#include "compute_temp_partial_cuda_kernel.cu"
void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) );
if(sdata->atom.rmass_flag)
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
}
void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
{
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
}
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_ComputeTempPartialCuda_Vector_Kernel<<<grid, threads,threads.x*6*sizeof(ENERGY_FLOAT)>>> (groupbit,xflag,yflag,zflag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=6;
grid.y=1;
threads.x=512;
Cuda_ComputeTempPartialCuda_Reduce_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
}
}
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n",sdata->atom.nlocal);)
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");
Cuda_ComputeTempPartialCuda_Scalar_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (groupbit,xflag,yflag,zflag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=1;
threads.x=512;
Cuda_ComputeTempPartialCuda_Reduce_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
}
}
void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall)
{
//if(sdata->atom.update_nmax) // check disabled: this compute is not necessarily called every timestep, so the constants must always be refreshed
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel<<<grid, threads,0>>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
}
}
void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall)
{
//if(sdata->atom.update_nmax) // check disabled: this compute is not necessarily called every timestep, so the constants must always be refreshed
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel<<<grid, threads,0>>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
}
}

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall);
extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall);

View File

@ -1,152 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_FLOAT sharedmem[];
__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit,int xflag,int yflag,int zflag)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
if(i < _nlocal)
{
if (_rmass_flag) {
if (_mask[i] & groupbit)
sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * _rmass[i];
} else {
if (_mask[i] & groupbit)
sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * (_mass[_type[i]]);
}
}
reduceBlock(sharedmem);
ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
}
}
__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit,int xflag,int yflag,int zflag)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
sharedmem[threadIdx.x+3*blockDim.x]=0;
sharedmem[threadIdx.x+4*blockDim.x]=0;
sharedmem[threadIdx.x+5*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit) {
V_FLOAT massone;
if (_rmass_flag) massone = _rmass[i];
else massone = _mass[_type[i]];
sharedmem[threadIdx.x] = massone * _v[i]*_v[i]*xflag;
sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax]*yflag;
sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax]*zflag;
sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax]*xflag*yflag;
sharedmem[threadIdx.x+4*blockDim.x] = massone * _v[i]*_v[i+2*_nmax]*xflag*zflag;
sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax]*yflag*zflag;
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
reduceBlock(&sharedmem[3*blockDim.x]);
reduceBlock(&sharedmem[4*blockDim.x]);
reduceBlock(&sharedmem[5*blockDim.x]);
ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+4*gridDim.x*gridDim.y]=sharedmem[4*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+5*gridDim.x*gridDim.y]=sharedmem[5*blockDim.x];
}
}
__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t)
{
int i=0;
sharedmem[threadIdx.x]=0;
ENERGY_FLOAT myforig=0.0;
ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
t[blockIdx.x]=myforig;
}
__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit,int xflag,int yflag,int zflag,V_FLOAT* vbiasall)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
if(!xflag)
{
vbiasall[i] = _v[i];
_v[i] = V_F(0.0);
}
if(!yflag)
{
vbiasall[i+_nmax] = _v[i+_nmax];
_v[i+_nmax] = V_F(0.0);
}
if(!zflag)
{
vbiasall[i+2*_nmax] = _v[i+2*_nmax];
_v[i+2*_nmax] = V_F(0.0);
}
}
}
__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit,int xflag,int yflag,int zflag,V_FLOAT* vbiasall)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
if(!xflag)
{
_v[i] += vbiasall[i];
}
if(!yflag)
{
_v[i+_nmax] += vbiasall[i+_nmax];
}
if(!zflag)
{
_v[i+2*_nmax] += vbiasall[i+2*_nmax];
}
}
}

View File

@ -1,919 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CRM_CUDA_UTILS
#define CRM_CUDA_UTILS
//split n work items into a 2-dimensional grid of blocks; the returned int3 holds the grid dimensions in .x and .y and the threads per block in .z
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
int3 gridparams;
int sharedsize = 16000;
if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? sharedsize / shared_per_thread : threadsmax;
if((n < 60 * 32) || (threadsmax < 64))
gridparams.z = 32;
else if((n < 60 * 64) || (threadsmax < 128))
gridparams.z = 64;
else if((n < 60 * 128) || (threadsmax < 256))
gridparams.z = 128;
else if((n < 60 * 256) || (threadsmax < 512))
gridparams.z = 256;
else gridparams.z = 512;
if(p2) {
gridparams.z = 16;
while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2;
}
int blocks = (n + gridparams.z - 1) / gridparams.z;
if(blocks > 10000)
gridparams.x = gridparams.y = int(sqrt(blocks));
else {
gridparams.x = blocks;
gridparams.y = 1;
}
while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;
if(gridparams.x == 0) gridparams.x = 1;
return gridparams;
}
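// Typical use (a sketch of the launch pattern the host wrappers in this
// package follow; n is the number of work items, SomeKernel stands in for
// any of the kernels of this package):
//   int3 layout = getgrid(n);
//   dim3 threads(layout.z, 1, 1);
//   dim3 grid(layout.x, layout.y, 1);
//   SomeKernel<<<grid, threads>>>(...);
// inside the kernel the flat item index is then recovered as
//   int i = (blockIdx.x*gridDim.y + blockIdx.y)*blockDim.x + threadIdx.x;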
//return value: 1 if f<0; else: 0
//take care when passing an expression such as blockIdx.x-n for f: blockIdx.x is unsigned, so the subtraction is evaluated as an unsigned int (cast to int first)
static inline __device__ int negativCUDA(float f)
{
return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
}
//return value: -1 if f<0; else +1
static inline __device__ float fsignCUDA(float f)
{
return f < 0.0f ? -1.0f : 1.0f;
}
//functions to copy data between global and shared memory (in fact they can copy between any two memory regions on the device, as long as read and write access is given)
//blockDim.y and blockDim.z are assumed to be 1
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
__syncthreads();
}
static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
__syncthreads();
}
static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
__syncthreads();
}
static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
__syncthreads();
}
static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
__syncthreads();
}
static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
int i;
for(i = 0; i < n - blockDim.x; i += blockDim.x) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
__syncthreads();
}
//copy data between two memory areas on device, 3d BlockDims are allowed
static __device__ inline void copyData(double* source, double* target, const int &n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset < n - i) {
target[i + offset] = source[i + offset];
}
__syncthreads();
}
static __device__ inline void copyData(float* source, float* target, const int &n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset < n - i) {
target[i + offset] = source[i + offset];
}
__syncthreads();
}
static __device__ inline void copyData(int* source, int* target, const int &n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset < n - i) {
target[i + offset] = source[i + offset];
}
__syncthreads();
}
static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset < n - i) {
target[i + offset] = source[i + offset];
}
__syncthreads();
}
//functions to sum the values of one block. The P2 variants require blockDim.x to be a power of 2, otherwise the behaviour is not well defined
//afterwards data[0] holds the sum of data[i] for i = 0 .. blockDim.x-1
//reduceBlockP2 and reduceBlock assume blockDim.y == 1 and blockDim.z == 1
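// Minimal usage sketch (the pattern the compute kernels in this package
// follow): each thread deposits one value in dynamic shared memory, the
// block is reduced, and thread 0 writes the block's sum out:
//   sharedmem[threadIdx.x] = myvalue;            // myvalue: per-thread contribution
//   reduceBlock(sharedmem);
//   if(threadIdx.x == 0) buffer[blockIdx.x*gridDim.y+blockIdx.y] = sharedmem[0];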
static __device__ inline void reduceBlockP2(int* data)
{
__syncthreads();
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
__syncthreads();
}
}
static __device__ inline void reduceBlockP2(unsigned int* data)
{
__syncthreads();
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
__syncthreads();
}
}
static __device__ inline void reduceBlockP2(float* data)
{
__syncthreads();
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
__syncthreads();
}
}
static __device__ inline void reduceBlockP2(double* data)
{
__syncthreads();
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
__syncthreads();
}
}
static __device__ inline void reduceBlock(float* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];
__syncthreads();
}
}
static __device__ inline void reduceBlock(int* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];
__syncthreads();
}
}
static __device__ inline void reduceBlock(unsigned int* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];
__syncthreads();
}
}
static __device__ inline void reduceBlock(double* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];
__syncthreads();
}
}
static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
int i;
for(i = 0; i < n - blockDim.x; i += blockDim.x) {
data[i + threadIdx.x] = value;
}
if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}
static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
int i;
for(i = 0; i < n - blockDim.x; i += blockDim.x) {
data[i + threadIdx.x] = value;
}
if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}
static __device__ inline void reduce(float* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
j++;
}
__syncthreads();
}
}
static __device__ inline void reduce(double* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
j++;
}
__syncthreads();
}
}
static __device__ inline void minOfBlock(float* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);
__syncthreads();
}
}
static __device__ inline void maxOfBlock(float* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);
__syncthreads();
}
}
static __device__ inline void minOfBlock(double* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);
__syncthreads();
}
}
static __device__ inline void maxOfBlock(double* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);
__syncthreads();
}
}
static __device__ inline void minOfData(double* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}
__syncthreads();
}
}
static __device__ inline void maxOfData(double* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}
__syncthreads();
}
}
static __device__ inline void minOfData(float* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}
__syncthreads();
}
}
static __device__ inline void maxOfData(float* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}
__syncthreads();
}
}
#if X_PRECISION == 2
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
int2 v = tex1Dfetch(t, i);
return __hiloint2double(v.y, v.x);
}
static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t, 2 * i);
int4 u = tex1Dfetch(t, 2 * i + 1);
X_FLOAT4 w;
w.x = __hiloint2double(v.y, v.x);
w.y = __hiloint2double(v.w, v.z);
w.z = __hiloint2double(u.y, u.x);
w.w = __hiloint2double(u.w, u.z);
return w;
}
#endif
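// Texture fetches are limited to 32-bit channels on this hardware, so a
// double is read as an int2 and reassembled with __hiloint2double; an
// X_FLOAT4 correspondingly takes two int4 fetches. This is the standard
// CUDA idiom for double-precision texture reads, not specific to LAMMPS.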
inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
_x_type_tex.normalized = false; // do not use normalized texture coordinates
_x_type_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex);
#if X_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
return tex1Dfetch(_x_type_tex, i);
#else
return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
return _x_type[i];
#endif
}
#if V_PRECISION == 2
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
int2 v = tex1Dfetch(t, i);
return __hiloint2double(v.y, v.x);
}
static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t, 2 * i);
int4 u = tex1Dfetch(t, 2 * i + 1);
V_FLOAT4 w;
w.x = __hiloint2double(v.y, v.x);
w.y = __hiloint2double(v.w, v.z);
w.z = __hiloint2double(u.y, u.x);
w.w = __hiloint2double(u.w, u.z);
return w;
}
#endif
inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
_v_radius_tex.normalized = false; // do not use normalized texture coordinates
_v_radius_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);
#if V_PRECISION == 1
cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
static __device__ inline V_FLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
return tex1Dfetch(_v_radius_tex, i);
#else
return tex1Dfetch_double_v(_v_radius_tex, i);
#endif
#else
return _v_radius[i];
#endif
}
inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
_omega_rmass_tex.normalized = false; // do not use normalized texture coordinates
_omega_rmass_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);
#if V_PRECISION == 1
cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
return tex1Dfetch(_omega_rmass_tex, i);
#else
return tex1Dfetch_double_v(_omega_rmass_tex, i);
#endif
#else
return _omega_rmass[i];
#endif
}
#if F_PRECISION == 2
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
int2 v = tex1Dfetch(t, i);
return __hiloint2double(v.y, v.x);
}
static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t, 2 * i);
int4 u = tex1Dfetch(t, 2 * i + 1);
F_FLOAT4 w;
w.x = __hiloint2double(v.y, v.x);
w.y = __hiloint2double(v.w, v.z);
w.z = __hiloint2double(u.y, u.x);
w.w = __hiloint2double(u.w, u.z);
return w;
}
#endif
inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
_q_tex.normalized = false; // do not use normalized texture coordinates
_q_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* q_texture_ptr = &MY_AP(q_tex);
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
#else
cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}
static __device__ inline F_FLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
return tex1Dfetch(_q_tex, i);
#else
return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
return _q[i];
#endif
}
#endif
/*
inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
{
#ifdef CUDA_USE_TEXTURE
_coeff_tex.normalized = false; // do not use normalized texture coordinates
_coeff_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff_texture_ptr;
cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
#endif
#endif
}
static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
return tex1Dfetch(_x_type_tex,i);
#else
return tex1Dfetch_double(_x_type_tex,i);
#endif
#else
return _x_type[i];
#endif
}
*/
#define SBBITS 30
static inline __device__ int sbmask(int j)
{
return j >> SBBITS & 3;
}
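// Example: with SBBITS == 30, sbmask(j) extracts the two high bits that
// LAMMPS packs into a neighbor index to mark special (1-2, 1-3, 1-4)
// bonded neighbors; a return value of 0 means the pair is not special.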
static inline __device__ void minimum_image(X_FLOAT4 &delta)
{
if(_triclinic == 0) {
if(_periodicity[0]) {
delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
(delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
}
if(_periodicity[1]) {
delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
(delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
}
if(_periodicity[2]) {
delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
(delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
}
} else {
// apply the tilt-coupled shifts before wrapping the component they depend on
if(_periodicity[2]) {
delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
(delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
(delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
(delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
}
if(_periodicity[1]) {
delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
(delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
(delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
}
if(_periodicity[0]) {
delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
(delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
}
}
}
static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
{
ci.x = x2.x - x1.x;
ci.y = x2.y - x1.y;
ci.z = x2.z - x1.z;
minimum_image(ci);
ci.x += x1.x;
ci.y += x1.y;
ci.z += x1.z;
}

View File

@ -1,22 +0,0 @@
#include "cuda_precision.h"
#include "cuda_shared.h"
#include "cuda_cu.h"
void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata)
{
sdata->compile_settings.prec_glob=sizeof(CUDA_FLOAT)/4;
sdata->compile_settings.prec_x=sizeof(X_FLOAT)/4;
sdata->compile_settings.prec_v=sizeof(V_FLOAT)/4;
sdata->compile_settings.prec_f=sizeof(F_FLOAT)/4;
sdata->compile_settings.prec_pppm=sizeof(PPPM_FLOAT)/4;
sdata->compile_settings.prec_fft=sizeof(FFT_FLOAT)/4;
#ifdef FFT_CUFFT
sdata->compile_settings.cufft=1;
#else
sdata->compile_settings.cufft=0;
#endif
sdata->compile_settings.arch=CUDA_ARCH;
}

View File

@ -1,344 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_
//#include "cutil.h"
#include "cuda_precision.h"
#include "cuda_wrapper_cu.h"
#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
//this cannot be arbitrarily large, since constant space is limited.
//in principle one could alter potentials to use global memory for parameters; some do that already, since the first examples I encountered had a high number (20+) of atom types
//Christian
#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
#define CUDA_MAX_NSPECIAL 25
// define some easy-to-use debug and emulation macros
#ifdef _DEBUG
#define MYDBG(a) a
#else
#define MYDBG(a)
#endif
#if __DEVICE_EMULATION__
#define MYEMU(a) a
#else
#define MYEMU(a)
#endif
#define MYEMUDBG(a) MYEMU(MYDBG(a))
// Add Prefix (needed as a workaround: identical constant names in different files would otherwise conflict)
#define MY_ADD_PREFIX(prefix, var) prefix##_##var
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
#define MY_VAR_TO_STR(var) #var
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
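// Example expansion with MY_PREFIX defined as compute_temp_partial_cuda:
//   MY_AP(buffer)    -> compute_temp_partial_cuda_buffer  (the symbol itself)
//   MY_CONST(buffer) -> "compute_temp_partial_cuda_buffer" (the string name
//   passed to cudaMemcpyToSymbol by the host wrappers)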
#define CUDA_USE_TEXTURE
#define CUDA_USE_FLOAT4
//constants used by many classes
//domain
#define _boxhi MY_AP(boxhi)
#define _boxlo MY_AP(boxlo)
#define _subhi MY_AP(subhi)
#define _sublo MY_AP(sublo)
#define _box_size MY_AP(box_size)
#define _prd MY_AP(prd)
#define _periodicity MY_AP(periodicity)
#define _triclinic MY_AP(triclinic)
#define _boxhi_lamda MY_AP(boxhi_lamda)
#define _boxlo_lamda MY_AP(boxlo_lamda)
#define _prd_lamda MY_AP(prd_lamda)
#define _h MY_AP(h)
#define _h_inv MY_AP(h_inv)
#define _h_rate MY_AP(h_rate)
__device__ __constant__ X_FLOAT _boxhi[3];
__device__ __constant__ X_FLOAT _boxlo[3];
__device__ __constant__ X_FLOAT _subhi[3];
__device__ __constant__ X_FLOAT _sublo[3];
__device__ __constant__ X_FLOAT _box_size[3];
__device__ __constant__ X_FLOAT _prd[3];
__device__ __constant__ int _periodicity[3];
__device__ __constant__ int _triclinic;
__device__ __constant__ X_FLOAT _boxhi_lamda[3];
__device__ __constant__ X_FLOAT _boxlo_lamda[3];
__device__ __constant__ X_FLOAT _prd_lamda[3];
__device__ __constant__ X_FLOAT _h[6];
__device__ __constant__ X_FLOAT _h_inv[6];
__device__ __constant__ V_FLOAT _h_rate[6];
//atom properties
#define _x MY_AP(x)
#define _v MY_AP(v)
#define _f MY_AP(f)
#define _tag MY_AP(tag)
#define _type MY_AP(type)
#define _mask MY_AP(mask)
#define _image MY_AP(image)
#define _q MY_AP(q)
#define _mass MY_AP(mass)
#define _rmass MY_AP(rmass)
#define _rmass_flag MY_AP(rmass_flag)
#define _eatom MY_AP(eatom)
#define _vatom MY_AP(vatom)
#define _x_type MY_AP(x_type)
#define _radius MY_AP(radius)
#define _density MY_AP(density)
#define _omega MY_AP(omega)
#define _torque MY_AP(torque)
#define _special MY_AP(special)
#define _maxspecial MY_AP(maxspecial)
#define _nspecial MY_AP(nspecial)
#define _special_flag MY_AP(special_flag)
#define _molecule MY_AP(molecule)
#define _v_radius MY_AP(v_radius)
#define _omega_rmass MY_AP(omega_rmass)
#define _freeze_group_bit MY_AP(freeze_group_bit)
#define _map_array MY_AP(map_array)
__device__ __constant__ X_FLOAT* _x; //holds pointer to positions
__device__ __constant__ V_FLOAT* _v;
__device__ __constant__ F_FLOAT* _f;
__device__ __constant__ int* _tag;
__device__ __constant__ int* _type;
__device__ __constant__ int* _mask;
__device__ __constant__ int* _image;
__device__ __constant__ V_FLOAT* _mass;
__device__ __constant__ F_FLOAT* _q;
__device__ __constant__ V_FLOAT* _rmass;
__device__ __constant__ int _rmass_flag;
__device__ __constant__ ENERGY_FLOAT* _eatom;
__device__ __constant__ ENERGY_FLOAT* _vatom;
__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to packed positions and type
__device__ __constant__ X_FLOAT* _radius;
__device__ __constant__ F_FLOAT* _density;
__device__ __constant__ V_FLOAT* _omega;
__device__ __constant__ F_FLOAT* _torque;
__device__ __constant__ int* _special;
__device__ __constant__ int _maxspecial;
__device__ __constant__ int* _nspecial;
__device__ __constant__ int _special_flag[4];
__device__ __constant__ int* _molecule;
__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to packed velocities and radius
__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to packed omega and rmass
__device__ __constant__ int _freeze_group_bit;
__device__ __constant__ int* _map_array;
#ifdef CUDA_USE_TEXTURE
#define _x_tex MY_AP(x_tex)
#if X_PRECISION == 1
texture<float> _x_tex;
#else
texture<int2,1> _x_tex;
#endif
#define _type_tex MY_AP(type_tex)
texture<int> _type_tex;
#define _x_type_tex MY_AP(x_type_tex)
#if X_PRECISION == 1
texture<float4,1> _x_type_tex;
#else
texture<int4,1> _x_type_tex;
#endif
#define _v_radius_tex MY_AP(v_radius_tex)
#if V_PRECISION == 1
texture<float4,1> _v_radius_tex;
#else
texture<int4,1> _v_radius_tex;
#endif
#define _omega_rmass_tex MY_AP(omega_rmass_tex)
#if V_PRECISION == 1
texture<float4,1> _omega_rmass_tex;
#else
texture<int4,1> _omega_rmass_tex;
#endif
#define _q_tex MY_AP(q_tex)
#if F_PRECISION == 1
texture<float> _q_tex;
#else
texture<int2,1> _q_tex;
#endif
#endif
//neighbor
#ifdef IncludeCommonNeigh
#define _inum MY_AP(inum)
#define _inum_border MY_AP(inum_border)
#define _ilist MY_AP(ilist)
#define _ilist_border MY_AP(ilist_border)
#define _numneigh MY_AP(numneigh)
#define _numneigh_border MY_AP(numneigh_border)
#define _numneigh_inner MY_AP(numneigh_inner)
#define _firstneigh MY_AP(firstneigh)
#define _neighbors MY_AP(neighbors)
#define _neighbors_border MY_AP(neighbors_border)
#define _neighbors_inner MY_AP(neighbors_inner)
#define _reneigh_flag MY_AP(reneigh_flag)
#define _triggerneighsq MY_AP(triggerneighsq)
#define _xhold MY_AP(xhold)
#define _maxhold MY_AP(maxhold)
#define _dist_check MY_AP(dist_check)
#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
#define _maxneighbors MY_AP(maxneighbors)
#define _overlap_comm MY_AP(overlap_comm)
__device__ __constant__ int _inum;
__device__ __constant__ int* _inum_border;
__device__ __constant__ int* _ilist;
__device__ __constant__ int* _ilist_border;
__device__ __constant__ int* _numneigh;
__device__ __constant__ int* _numneigh_border;
__device__ __constant__ int* _numneigh_inner;
__device__ __constant__ int** _firstneigh;
__device__ __constant__ int* _neighbors;
__device__ __constant__ int* _neighbors_border;
__device__ __constant__ int* _neighbors_inner;
__device__ __constant__ int* _reneigh_flag;
__device__ __constant__ X_FLOAT _triggerneighsq;
__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions at the last neighbor list build
__device__ __constant__ int _maxhold;
__device__ __constant__ int _dist_check;
__device__ __constant__ int _neighbor_maxlocal;
__device__ __constant__ int _maxneighbors;
__device__ __constant__ int _overlap_comm;
#endif
//system properties
#define _nall MY_AP(nall)
#define _nghost MY_AP(nghost)
#define _nlocal MY_AP(nlocal)
#define _nmax MY_AP(nmax)
#define _cuda_ntypes MY_AP(cuda_ntypes)
#define _dtf MY_AP(dtf)
#define _dtv MY_AP(dtv)
#define _factor MY_AP(factor)
#define _virial MY_AP(virial)
#define _eng_vdwl MY_AP(eng_vdwl)
#define _eng_coul MY_AP(eng_coul)
#define _molecular MY_AP(molecular)
__device__ __constant__ unsigned _nall;
__device__ __constant__ unsigned _nghost;
__device__ __constant__ unsigned _nlocal;
__device__ __constant__ unsigned _nmax;
__device__ __constant__ unsigned _cuda_ntypes;
__device__ __constant__ V_FLOAT _dtf;
__device__ __constant__ X_FLOAT _dtv;
__device__ __constant__ V_FLOAT _factor;
__device__ __constant__ ENERGY_FLOAT* _virial;
__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
__device__ __constant__ ENERGY_FLOAT* _eng_coul;
__device__ __constant__ int _molecular;
//other general constants
#define _buffer MY_AP(buffer)
#define _flag MY_AP(flag)
#define _debugdata MY_AP(debugdata)
__device__ __constant__ void* _buffer;
__device__ __constant__ int* _flag;
__device__ __constant__ int* _debugdata;
// pointers to data fields on the GPU are held in constant space
// -> reduces register usage and the number of parameters for kernel calls
// they will be variables of file scope in cuda files
// macro that may be used to print a cudaError_t
#define MY_OUTPUT_RESULT(result) \
switch(result) \
{ \
case cudaSuccess: printf(" => cudaSuccess\n"); break; \
case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
default: printf(" => unknown\n"); break; \
}
#ifdef _DEBUG
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#else
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#endif
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
#define X_MASK 1
#define V_MASK 2
#define F_MASK 4
#define TAG_MASK 8
#define TYPE_MASK 16
#define MASK_MASK 32
#define IMAGE_MASK 64
#define Q_MASK 128
#define MOLECULE_MASK 256
#define RMASS_MASK 512
#define RADIUS_MASK 1024
#define DENSITY_MASK 2048
#define OMEGA_MASK 4096
#define TORQUE_MASK 8192
#endif // _CUDA_COMMON_H_

View File

@ -1 +0,0 @@
extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata);

View File

@ -1,168 +0,0 @@
enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx are not implemented since they have not been needed so far
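// The mode names give host layout followed by device layout: xx/xy/xyz are
// plain element-wise copies, yx uploads the transpose of an nx-by-ny array
// (host index j*ny+k goes to device index k*nx+j), and xzy swaps the two
// innermost dimensions of a 3d array; x is shorthand for the 1d copy xx.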
#include "cuda_data_cu.h"
#include "cuda_wrapper_cu.h"
#include "cuda_data_kernel.cu"
#include <cstdio>
void CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
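// the remainder of this variant carries debug instrumentation (a size
// printout and a round-trip checksum of the uploaded data) that the other
// upload variants below do not have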
float debugdata[size];
//int* cu_debug=(int*) CudaWrapper_AllocCudaData(size*sizeof(FLOAT));
size*=sizeof(double);
printf("size: %i (%i %i %i) (%i %i %i) %p\n",size,grid.x,grid.y,threads.x,n[0],n[1],n[2],buffer);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_DoubleFloat<<<grid,threads>>>((double*)buffer,(float*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
CudaWrapper_DownloadCudaData(debugdata, dev_data, size/2);
double sum=0;
printf("debugdata: ");
for(int i=0;i<size/sizeof(double);i++) sum+=(debugdata[i]-((double*) host_data)[i])*(debugdata[i]-((double*) host_data)[i]);
printf("%lf \n",sum);
}
void CudaData_Upload_DoubleDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
size*=sizeof(double);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_DoubleDouble<<<grid,threads>>>((double*)buffer,(double*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
}
void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
size*=sizeof(float);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_FloatDouble<<<grid,threads>>>((float*)buffer,(double*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
}
void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
size*=sizeof(float);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_FloatFloat<<<grid,threads>>>((float*)buffer,(float*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
}
void CudaData_Upload_IntInt(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
size*=sizeof(int);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_IntInt<<<grid,threads>>>((int*)buffer,(int*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
}
void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer)
{
// stub: downloading with layout or precision conversion has not been implemented
}

View File

@ -1,13 +0,0 @@
#ifndef CUDA_DATA_CU_H_
#define CUDA_DATA_CU_H_
extern "C" void CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Upload_DoubleDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Upload_IntInt(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer);
#endif /*CUDA_DATA_CU_H_*/

View File

@ -1,156 +0,0 @@
__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer,float* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}
__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer,double* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}
__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer,double* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}
__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer,float* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}
__global__ void CudaData_Upload_Kernel_IntInt(int* buffer,int* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}

File diff suppressed because it is too large

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata,int eflag, int vflag);

File diff suppressed because it is too large

View File

@ -1,126 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_FLOAT sharedmem[];
static inline __device__ void PairVirialCompute_A_Kernel(int eflag,int vflag,int coulflag=0)
{
__syncthreads();
ENERGY_FLOAT* shared=sharedmem;
if(eflag)
{
reduceBlock(shared);
shared+=blockDim.x;
if(coulflag)
{
reduceBlock(shared);
shared+=blockDim.x;
}
}
if(vflag)
{
reduceBlock(shared + 0 * blockDim.x);
reduceBlock(shared + 1 * blockDim.x);
reduceBlock(shared + 2 * blockDim.x);
reduceBlock(shared + 3 * blockDim.x);
reduceBlock(shared + 4 * blockDim.x);
reduceBlock(shared + 5 * blockDim.x);
}
if(threadIdx.x == 0)
{
shared=sharedmem;
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
if(eflag)
{
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0];
shared+=blockDim.x; buffer+=gridDim.x * gridDim.y;
if(coulflag)
{
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0];
shared+=blockDim.x; buffer+=gridDim.x * gridDim.y;
}
}
if(vflag)
{
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[0 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[1 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[2 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[3 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[4 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[5 * blockDim.x];
}
}
__syncthreads();
}
__global__ void MY_AP(PairVirialCompute_reduce)(int n)
{
sharedmem[threadIdx.x] = ENERGY_F(0.0);
ENERGY_FLOAT sum = ENERGY_F(0.0);
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
buf = &buf[blockIdx.x * n];
//if(blockIdx.x==2) buf=&buf[n];
for(int i = 0; i < n; i += blockDim.x)
{
sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0);
__syncthreads();
reduceBlock(sharedmem);
if(threadIdx.x == 0) sum += sharedmem[0];
}
if(threadIdx.x==0)
{
if(gridDim.x == 1) //evdwl
{
_eng_vdwl[0]+=sum;
}
if(gridDim.x == 2) //evdwl + ecoul only
{
if(blockIdx.x==0)
_eng_vdwl[0]+=sum;
else
_eng_coul[0]+=sum;
}
if(gridDim.x == 6) //virial
{
_virial[blockIdx.x] += sum;
}
if(gridDim.x == 7) //evdwl+virial
{
if(blockIdx.x==0)
_eng_vdwl[0]+=sum;
else _virial[blockIdx.x-1] += sum;
}
if(gridDim.x == 8) //evdwl+ecoul+virial
{
if(blockIdx.x==0)
_eng_vdwl[0]+=sum;
else
if(blockIdx.x==1)
_eng_coul[0]+=sum;
else
_virial[blockIdx.x-2] += sum;
}
}
}

View File

@ -1,284 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_PRECISION_H_
#define CUDA_PRECISION_H_
/* This file gives the type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA.
 * The default behaviour is set by the global CUDA_PRECISION (it can be overridden during compilation).
 * ***_FLOAT: type definition of the given property
 * ***_F: literal-suffix macro (1.0 is interpreted as double while 1.0f is interpreted as float; write CUDA_F(1.0) to get the literal matching CUDA_FLOAT)
*/
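/* Illustrative only (these variables are hypothetical, not part of this header):
 *   X_FLOAT dx = x1 - x2;                        // positions use X_PRECISION
 *   F_FLOAT fpair = F_F(2.0) * dx;               // forces use F_PRECISION, literal via F_F()
 *   ENERGY_FLOAT e = ENERGY_F(0.5) * fpair * dx; // energies use ENERGY_PRECISION
 */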
#ifdef CUDA_USE_BINNING
#define CUDA_IF_BINNING(a) a
#else
#define CUDA_IF_BINNING(a)
#endif
//GLOBAL
#ifdef CUDA_PRECISION
#if CUDA_PRECISION == 1
#define CUDA_FLOAT float
#define CUDA_F(x) x##f
#endif
#if CUDA_PRECISION == 2
#define CUDA_FLOAT double
#define CUDA_F(x) x
#endif
#endif
#ifndef CUDA_PRECISION
#define CUDA_FLOAT double
#define CUDA_F(x) x
#define CUDA_PRECISION 2
#endif
//--------------------------------
//-----------FFT-----------------
//--------------------------------
#ifdef FFT_PRECISION_CU
#if FFT_PRECISION_CU == 1
#define FFT_FLOAT float
#define FFT_F(x) x##f
#endif
#if FFT_PRECISION_CU == 2
#define FFT_FLOAT double
#define FFT_F(x) x
#endif
#endif
#ifndef FFT_PRECISION_CU
#define FFT_FLOAT CUDA_FLOAT
#define FFT_F(x) CUDA_F(x)
#define FFT_PRECISION_CU CUDA_PRECISION
#endif
//--------------------------------
//-----------PPPM-----------------
//--------------------------------
#ifndef PPPM_PRECISION
#define PPPM_PRECISION CUDA_PRECISION
#endif
#ifdef PPPM_PRECISION
#if PPPM_PRECISION == 1
#define PPPM_FLOAT float
#ifdef float3
#define PPPM_FLOAT3 float3
#else
struct PPPM_FLOAT3
{
PPPM_FLOAT x;
PPPM_FLOAT y;
PPPM_FLOAT z;
};
#endif
#define PPPM_F(x) x##f
#endif
#if PPPM_PRECISION == 2
#define PPPM_FLOAT double
struct PPPM_FLOAT3
{
PPPM_FLOAT x;
PPPM_FLOAT y;
PPPM_FLOAT z;
};
#define PPPM_F(x) x
#endif
#endif
//--------------------------------
//-----------FORCE-----------------
//--------------------------------
#ifdef F_PRECISION
#if F_PRECISION == 1
#define F_FLOAT float
#define F_F(x) x##f
#endif
#if F_PRECISION == 2
#define F_FLOAT double
#define F_F(x) x
#endif
#endif
#ifndef F_PRECISION
#define F_FLOAT CUDA_FLOAT
#define F_F(x) CUDA_F(x)
#define F_PRECISION CUDA_PRECISION
#endif
#if F_PRECISION == 1
#define _SQRT_ sqrtf
#define _RSQRT_ rsqrtf
#define _EXP_ expf
#else
#define _SQRT_ sqrt
#define _RSQRT_ rsqrt
#define _EXP_ exp
#endif
#if F_PRECISION == 2
struct F_FLOAT2
{
F_FLOAT x;
F_FLOAT y;
};
struct F_FLOAT3
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
};
struct F_FLOAT4
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
F_FLOAT w;
};
#else
#define F_FLOAT2 float2
#define F_FLOAT3 float3
#define F_FLOAT4 float4
#endif
//--------------------------------
//-----------ENERGY-----------------
//--------------------------------
#ifdef ENERGY_PRECISION
#if ENERGY_PRECISION == 1
#define ENERGY_FLOAT float
#define ENERGY_F(x) x##f
#endif
#if ENERGY_PRECISION == 2
#define ENERGY_FLOAT double
#define ENERGY_F(x) x
#endif
#endif
#ifndef ENERGY_PRECISION
#define ENERGY_FLOAT CUDA_FLOAT
#define ENERGY_F(x) CUDA_F(x)
#define ENERGY_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------POSITIONS------------
//--------------------------------
#ifdef X_PRECISION
#if X_PRECISION == 1
#define X_FLOAT float
#define X_F(x) x##f
#endif
#if X_PRECISION == 2
#define X_FLOAT double
#define X_F(x) x
#endif
#endif
#ifndef X_PRECISION
#define X_FLOAT CUDA_FLOAT
#define X_F(x) CUDA_F(x)
#define X_PRECISION CUDA_PRECISION
#endif
#if X_PRECISION == 2
struct X_FLOAT2
{
X_FLOAT x;
X_FLOAT y;
};
struct X_FLOAT3
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
};
struct X_FLOAT4
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
X_FLOAT w;
};
#else
#define X_FLOAT2 float2
#define X_FLOAT3 float3
#define X_FLOAT4 float4
#endif
//--------------------------------
//-----------velocities-----------
//--------------------------------
#ifdef V_PRECISION
#if V_PRECISION == 1
#define V_FLOAT float
#define V_F(x) x##f
#endif
#if V_PRECISION == 2
#define V_FLOAT double
#define V_F(x) x
#endif
#endif
#ifndef V_PRECISION
#define V_FLOAT CUDA_FLOAT
#define V_F(x) CUDA_F(x)
#define V_PRECISION CUDA_PRECISION
#endif
#if V_PRECISION == 2
struct V_FLOAT4
{
V_FLOAT x;
V_FLOAT y;
V_FLOAT z;
V_FLOAT w;
};
#else
#define V_FLOAT4 float4
#endif
#ifdef NO_PREC_TIMING
struct timespec_2
{
unsigned int tv_sec;
unsigned int tv_nsec;
};
#define timespec timespec_2
#define clock_gettime(a,b)
#endif
#endif /*CUDA_PRECISION_H_*/

View File

@ -1,380 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"
#define CUDA_MAX_DEBUG_SIZE 1000 //size of the debugdata array (room for this many doubles, or twice as many ints)
struct dev_array
{
void* dev_data; // pointer to memory address on cuda device
unsigned dim[3]; // array dimensions
};
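//Example (illustrative, not prescribed by this header): a per-atom 3-vector
//field such as x would use dim[0]=nmax and dim[1]=3, with dev_data pointing
//at the corresponding device allocation.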
struct cuda_shared_atom // relevant data from atom class
{
dev_array dx; // cumulated distance for binning settings
dev_array x; // position
dev_array v; // velocity
dev_array f; // force
dev_array tag; // global atom ID number
dev_array type; // atom type; the ghost type equals ntypes (ntypescuda=ntypes+1)
dev_array mask;
dev_array image;
dev_array q; // charges
dev_array mass; // per-type masses
dev_array rmass; // per-atom masses
dev_array radius; // per-atom radius
dev_array density;
dev_array omega;
dev_array torque;
dev_array molecule;
dev_array special;
int maxspecial;
dev_array nspecial;
int* special_flag;
int molecular;
dev_array eatom; // per-atom energy
dev_array vatom; // per-atom virial
int need_eatom;
int need_vatom;
dev_array x_type; // position + type in X_FLOAT4 struct
dev_array v_radius; // velocity + radius in V_FLOAT4 struct, currently only used for granular atom_style
dev_array omega_rmass; // omega + rmass in V_FLOAT4 struct, currently only used for granular atom_style
double* mass_host; // remember per-type host pointer to masses
//int natoms; // total # of atoms in system, could be 0
int nghost; // # of ghost atoms on this proc
int nlocal; // # of owned atoms on this proc
int nall; // total # of owned + ghost atoms on this proc
int nmax; // max # of owned+ghost in arrays on this proc
int ntypes;
int q_flag; // do we have charges?
int rmass_flag; // do we have per-atom masses?
int firstgroup;
int nfirst;
int update_nlocal;
int update_nmax;
int update_neigh;
dev_array xhold; // position at last neighboring
X_FLOAT triggerneighsq; // maximum squared displacement before reneighboring
int reneigh_flag; // is reneighboring necessary
int maxhold; // size of xhold
int dist_check; //perform distance check for reneighboring
dev_array binned_id; //id of each binned atom (not tag!!)
dev_array binned_idnew; //new id of each binned atom for sorting; effectively moves atom[binned_id[k]] to atom[binned_newid[k]]
float bin_extraspace;
int bin_dim[3];
int bin_nmax;
dev_array map_array;
};
struct cuda_shared_pair // relevant data from pair class
{
char cudable_force; // check for (cudable_force!=0)
X_FLOAT cut_global;
X_FLOAT cut_inner_global;
X_FLOAT cut_coul_global;
double** cut; // type-type cutoff
double** cutsq; // type-type cutoff squared
double** cut_inner; // type-type inner cutoff
double** cut_coul; // type-type coulomb cutoff
double** coeff1; // type-type pair parameters
double** coeff2;
double** coeff3;
double** coeff4;
double** coeff5;
double** coeff6;
double** coeff7;
double** coeff8;
double** coeff9;
double** coeff10;
double** offset;
double* special_lj;
double* special_coul;
dev_array virial; // ENERGY_FLOAT
dev_array eng_vdwl; // ENERGY_FLOAT
dev_array eng_coul; // ENERGY_FLOAT
X_FLOAT cut_coulsq_global;
F_FLOAT g_ewald,kappa;
int freeze_group_bit;
dev_array coeff1_gm;
dev_array coeff2_gm;
dev_array coeff3_gm;
dev_array coeff4_gm;
dev_array coeff5_gm;
dev_array coeff6_gm;
dev_array coeff7_gm;
dev_array coeff8_gm;
dev_array coeff9_gm;
dev_array coeff10_gm;
int lastgridsize;
int n_energy_virial;
int collect_forces_later;
int use_block_per_atom;
int override_block_per_atom;
bool neighall;
};
struct cuda_shared_domain // relevant data from domain class
{
X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
X_FLOAT subhi[3];
X_FLOAT boxlo[3];
X_FLOAT boxhi[3];
X_FLOAT prd[3];
int periodicity[3]; // xyz periodicity as array
int triclinic;
X_FLOAT xy;
X_FLOAT xz;
X_FLOAT yz;
X_FLOAT boxlo_lamda[3];
X_FLOAT boxhi_lamda[3];
X_FLOAT prd_lamda[3];
X_FLOAT h[6];
X_FLOAT h_inv[6];
V_FLOAT h_rate[6];
int update;
};
struct cuda_shared_pppm
{
char cudable_force;
#ifdef FFT_CUFFT
FFT_FLOAT* work1;
FFT_FLOAT* work2;
FFT_FLOAT* work3;
PPPM_FLOAT* greensfn;
PPPM_FLOAT* fkx;
PPPM_FLOAT* fky;
PPPM_FLOAT* fkz;
PPPM_FLOAT* vg;
#endif
int* part2grid;
PPPM_FLOAT* density_brick;
int* density_brick_int;
PPPM_FLOAT density_intScale;
PPPM_FLOAT* vdx_brick;
PPPM_FLOAT* vdy_brick;
PPPM_FLOAT* vdz_brick;
PPPM_FLOAT* density_fft;
ENERGY_FLOAT* energy;
ENERGY_FLOAT* virial;
int nxlo_in;
int nxhi_in;
int nxlo_out;
int nxhi_out;
int nylo_in;
int nyhi_in;
int nylo_out;
int nyhi_out;
int nzlo_in;
int nzhi_in;
int nzlo_out;
int nzhi_out;
int nx_pppm;
int ny_pppm;
int nz_pppm;
PPPM_FLOAT qqrd2e;
int order;
// float3 sublo;
PPPM_FLOAT* rho_coeff;
int nmax;
int nlocal;
PPPM_FLOAT* debugdata;
PPPM_FLOAT delxinv;
PPPM_FLOAT delyinv;
PPPM_FLOAT delzinv;
int nlower;
int nupper;
PPPM_FLOAT shiftone;
PPPM_FLOAT3* fH;
};
struct cuda_shared_comm
{
int maxswap;
int maxlistlength;
dev_array pbc;
dev_array slablo;
dev_array slabhi;
dev_array multilo;
dev_array multihi;
dev_array sendlist;
int grow_flag;
int comm_phase;
int nsend;
int* nsend_swap;
int* send_size;
int* recv_size;
double** buf_send;
void** buf_send_dev;
double** buf_recv;
void** buf_recv_dev;
void* buffer;
int buffer_size;
double overlap_split_ratio;
};
struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data
{
int maxlocal;
int inum; // # of I atoms neighbors are stored for
int inum_border2;
dev_array inum_border; // # of atoms which interact with border atoms
dev_array ilist; // local indices of I atoms
dev_array ilist_border;
dev_array numneigh;
dev_array numneigh_inner;
dev_array numneigh_border;
dev_array firstneigh;
dev_array neighbors;
dev_array neighbors_border;
dev_array neighbors_inner;
int maxpage;
dev_array page_pointers;
dev_array* pages;
int maxneighbors;
int neigh_lists_per_page;
double** cutneighsq;
CUDA_FLOAT* cu_cutneighsq;
int* binned_id;
int* bin_dim;
int bin_nmax;
float bin_extraspace;
double maxcut;
dev_array ex_type;
int nex_type;
dev_array ex1_bit;
dev_array ex2_bit;
int nex_group;
dev_array ex_mol_bit;
int nex_mol;
};
struct cuda_compile_settings // used to compare the compile settings (i.e. precision) of the .cu files and the .cpp files
{
int prec_glob;
int prec_x;
int prec_v;
int prec_f;
int prec_pppm;
int prec_fft;
int cufft;
int arch;
};
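//A minimal sketch of the intended consistency check (hypothetical helper, not
//part of this header): fill one instance from the .cu side and one from the
//.cpp side, then abort on mismatch, e.g.
//  if(cu_settings.prec_glob != cpp_settings.prec_glob || cu_settings.cufft != cpp_settings.cufft)
//    { fprintf(stderr, "compile settings of cu and cpp files do not match\n"); exit(1); }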
struct cuda_timings_struct
{
//Debug:
double test1;
double test2;
//transfers
double transfer_upload_tmp_constr;
double transfer_download_tmp_deconstr;
//communication
double comm_forward_total;
double comm_forward_mpi_upper;
double comm_forward_mpi_lower;
double comm_forward_kernel_pack;
double comm_forward_kernel_unpack;
double comm_forward_kernel_self;
double comm_forward_upload;
double comm_forward_download;
double comm_exchange_total;
double comm_exchange_mpi;
double comm_exchange_kernel_pack;
double comm_exchange_kernel_unpack;
double comm_exchange_kernel_fill;
double comm_exchange_cpu_pack;
double comm_exchange_upload;
double comm_exchange_download;
double comm_border_total;
double comm_border_mpi;
double comm_border_kernel_pack;
double comm_border_kernel_unpack;
double comm_border_kernel_self;
double comm_border_kernel_buildlist;
double comm_border_upload;
double comm_border_download;
//pair forces
double pair_xtype_conversion;
double pair_kernel;
double pair_virial;
double pair_force_collection;
//neighbor
double neigh_bin;
double neigh_build;
double neigh_special;
//PPPM
double pppm_particle_map;
double pppm_make_rho;
double pppm_brick2fft;
double pppm_poisson;
double pppm_fillbrick;
double pppm_fieldforce;
double pppm_compute;
};
struct cuda_shared_data // holds space for all relevant data from the different classes
{
void* buffer; //holds temporary GPU data [used within subroutines; it need not stay consistent outside of them]
int buffersize; //maxsize of buffer
int buffer_new; //should be 1 if the pointer to buffer has changed
void* flag;
void* debugdata; //array for easily collecting debug data from the device; class Cuda holds the corresponding cu_debugdata and host array
cuda_shared_atom atom;
cuda_shared_pair pair;
cuda_shared_domain domain;
cuda_shared_pppm pppm;
cuda_shared_comm comm;
cuda_compile_settings compile_settings;
cuda_timings_struct cuda_timings;
int exchange_dim;
int me; //mpi rank
unsigned int datamask;
int overlap_comm;
};
#endif // #ifndef _CUDA_SHARED_H_

View File

@ -1,317 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#include "cuda_shared.h"
#include "cuda_common.h"
#include "cuda_wrapper_cu.h"
#include "cuda_wrapper_kernel.cu"
static int CudaWrapper_total_gpu_mem=0;
static double CudaWrapper_total_upload_time=0;
static double CudaWrapper_total_download_time=0;
static double CudaWrapper_cpubuffer_upload_time=0;
static double CudaWrapper_cpubuffer_download_time=0;
static cudaStream_t* streams;
static int nstreams=0;
void CudaWrapper_Init(int argc, char** argv,int me,int ppn,int* devicelist)
{
MYDBG( printf("# CUDA: debug mode on\n"); )
#if __DEVICE_EMULATION__
printf("# CUDA: emulation mode on\n");
#else
// modified from cutil.h
static int deviceCount=0;
static bool sharedmode=false;
if(deviceCount && !sharedmode) return;
if(deviceCount && sharedmode) cudaThreadExit();
CUDA_SAFE_CALL_NO_SYNC( cudaGetDeviceCount(&deviceCount) );
if (deviceCount == 0)
{
fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
MYDBG( printf("# CUDA There are %i devices supporting CUDA in this system.\n",deviceCount);)
cudaDeviceProp deviceProp[deviceCount];
for(int i=0;i<deviceCount;i++)
CUDA_SAFE_CALL_NO_SYNC( cudaGetDeviceProperties(&(deviceProp[i]), i) );
int dev_list[deviceCount];
for(int i=0;i<deviceCount;i++) dev_list[i]=i;
for(int i=0;i<deviceCount;i++)
{
for(int j=0;j<deviceCount-1-i;j++)
if(deviceProp[dev_list[j]].multiProcessorCount<deviceProp[dev_list[j+1]].multiProcessorCount)
{
int k=dev_list[j];
dev_list[j]=dev_list[j+1];
dev_list[j+1]=k;
}
}
for(int i=0;i<deviceCount;i++)
{
if((deviceProp[dev_list[i]].computeMode==0)) sharedmode=true;
cudaSetDevice(i);
cudaSetDeviceFlags(cudaDeviceMapHost);
}
if(sharedmode)
{
if(ppn&&(me%ppn+1)>deviceCount) {printf("Asking for more GPUs per node when there are. Reduce gpu/node setting.\n"); exit(0);}
int devicea=me%ppn;
if(devicelist) devicea=devicelist[devicea];
else
devicea=dev_list[devicea];
if(devicea>=deviceCount) {printf("Asking for non-existent GPU %i. Found only %i GPUs.\n",devicea,deviceCount); exit(0);}
MYDBG(
printf(" # CUDA myid: %i take device: %i\n",me,devicea);
)
CUDA_SAFE_CALL( cudaSetDevice(devicea) );
}
else
{
CUDA_SAFE_CALL( cudaSetValidDevices(dev_list,deviceCount) );
}
cudaThreadSynchronize();
int dev;
CUDA_SAFE_CALL( cudaGetDevice(&dev));
if (deviceProp[dev].major < 1)
{
fprintf(stderr, "CUDA error: device does not support CUDA.\n");
exit(EXIT_FAILURE);
}
else
if ((deviceProp[dev].major == 1)&&(deviceProp[dev].minor != 3))
{
fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n",dev,deviceProp[dev].name,deviceProp[dev].major,deviceProp[dev].minor);
exit(EXIT_FAILURE);
}
if ((deviceProp[dev].major == 2)&&(CUDA_ARCH<20))
{
fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n",deviceProp[dev].major,deviceProp[dev].minor);
}
if ((deviceProp[dev].major == 1)&&(CUDA_ARCH>=20))
{
fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n",CUDA_ARCH);
exit(EXIT_FAILURE);
}
fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);
MYDBG( fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);)
MYDBG
(
printf("name = %s\n", deviceProp[dev].name);
printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem);
printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock);
printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock);
printf("warpSize = %i\n", deviceProp[dev].warpSize);
printf("memPitch = %i\n", deviceProp[dev].memPitch);
printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock);
printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]);
printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]);
printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem);
printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor);
printf("clockRate = %i\n", deviceProp[dev].clockRate);
printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment);
printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap);
printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount);
printf("computeMode = %i\n", deviceProp[dev].computeMode);
)
#endif
}
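/* Typical call (illustrative): one MPI rank per GPU with two ranks per node:
 *   CudaWrapper_Init(argc, argv, me, 2, NULL);
 */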
void* CudaWrapper_AllocCudaData(unsigned nbytes)
{
void* dev_data;
CUDA_SAFE_CALL( cudaMalloc((void**)&dev_data, nbytes) );
MYDBG( printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data); )
CudaWrapper_total_gpu_mem+=nbytes;
return dev_data;
}
void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
MYDBG( printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data,host_data); )
cudaThreadSynchronize();
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
CUDA_SAFE_CALL( cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice) );
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_total_upload_time+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
}
void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream)
{
MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); )
cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice,streams[stream]);
}
void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); )
cudaThreadSynchronize();
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
CUDA_SAFE_CALL( cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost) );
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_total_download_time+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
}
void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream)
{
MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); )
cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost,streams[stream]);
}
void CudaWrapper_FreeCudaData(void* dev_data,unsigned nbytes)
{
MYDBG( printf("# CUDA: freeing memory at dev%p with %i bytes (last adress: %p)\n", dev_data,nbytes,(char*)dev_data+nbytes); )
CUDA_SAFE_CALL( cudaFree(dev_data) );
CudaWrapper_total_gpu_mem-=nbytes;
}
void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes)
{
MYDBG( printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data); )
CUDA_SAFE_CALL( cudaMemset(dev_data, value, nbytes) );
}
void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes)
{
MYDBG( printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source,dev_dest); )
CUDA_SAFE_CALL( cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice) );
}
void* CudaWrapper_AllocPinnedHostData(unsigned nbytes,bool mapped,bool writeCombined)
{
void* host_data;
int flags=0;
if(mapped) flags=flags | cudaHostAllocMapped;
if(writeCombined) flags=flags | cudaHostAllocWriteCombined;
CUDA_SAFE_CALL( cudaHostAlloc((void**)&host_data, nbytes,flags) );
// CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) );
MYDBG( printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data); )
return host_data;
}
void CudaWrapper_FreePinnedHostData(void* host_data)
{
MYDBG( printf("# CUDA: freeing pinned host memory at %p \n",host_data); )
if(host_data)
CUDA_SAFE_CALL( cudaFreeHost(host_data) );
}
void cuda_check_error(char* comment)
{
printf("ERROR-CUDA %s %s\n",comment,cudaGetErrorString(cudaGetLastError()));
}
int CudaWrapper_CheckMemUseage()
{
size_t free,total;
cudaMemGetInfo(&free,&total);
return total-free; //possible with cuda 3.0 ???
//return CudaWrapper_total_gpu_mem;
}
double CudaWrapper_CheckUploadTime(bool reset)
{
if(reset) CudaWrapper_total_upload_time=0.0;
return CudaWrapper_total_upload_time;
}
double CudaWrapper_CheckDownloadTime(bool reset)
{
if(reset) CudaWrapper_total_download_time=0.0;
return CudaWrapper_total_download_time;
}
double CudaWrapper_CheckCPUBufUploadTime(bool reset)
{
if(reset) CudaWrapper_cpubuffer_upload_time=0.0;
return CudaWrapper_cpubuffer_upload_time;
}
double CudaWrapper_CheckCPUBufDownloadTime(bool reset)
{
if(reset) CudaWrapper_cpubuffer_download_time=0.0;
return CudaWrapper_cpubuffer_download_time;
}
void CudaWrapper_AddCPUBufUploadTime(double dt)
{
CudaWrapper_cpubuffer_upload_time+=dt;
}
void CudaWrapper_AddCPUBufDownloadTime(double dt)
{
CudaWrapper_cpubuffer_download_time+=dt;
}
void CudaWrapper_Sync()
{
cudaThreadSynchronize();
}
void CudaWrapper_SyncStream(int stream)
{
cudaStreamSynchronize(streams[stream]);
}
void CudaWrapper_AddStreams(int n)
{
cudaStream_t* new_streams=new cudaStream_t[nstreams+n];
for(int i=0;i<nstreams;i++) new_streams[i]=streams[i];
for(int i=nstreams;i<nstreams+n;i++) cudaStreamCreate(&new_streams[i]);
if(nstreams>0)
delete [] streams;
streams=new_streams;
nstreams+=n;
}
void* CudaWrapper_returnStreams()
{
return (void*) streams;
}
int CudaWrapper_returnNStreams()
{
return nstreams;
}
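/* Illustrative async pattern using the helpers above (assumption, not original code):
 *   CudaWrapper_AddStreams(2);
 *   CudaWrapper_UploadCudaDataAsync(host_buf, dev_buf, nbytes, 0);
 *   CudaWrapper_SyncStream(0);
 */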

View File

@ -1,52 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_DATA_WRAPPER_H_
#define _CUDA_DATA_WRAPPER_H_
extern "C" void CudaWrapper_Init(int argc, char** argv,int me=0,int ppn=2,int* devicelist=NULL);
extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes);
extern "C" void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes=0);
extern "C" void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes);
extern "C" void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes);
extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped=false, bool writeCombind=false);
extern "C" void CudaWrapper_FreePinnedHostData(void* dev_data);
extern "C" void cuda_check_error(char* comment);
extern "C" int CudaWrapper_CheckMemUseage();
extern "C" double CudaWrapper_CheckUploadTime(bool reset=false);
extern "C" double CudaWrapper_CheckDownloadTime(bool reset=false);
extern "C" double CudaWrapper_CheckCPUBufUploadTime(bool reset=false);
extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset=false);
extern "C" void CudaWrapper_AddCPUBufUploadTime(double dt);
extern "C" void CudaWrapper_AddCPUBufDownloadTime(double dt);
extern "C" void CudaWrapper_Sync();
extern "C" void CudaWrapper_SyncStream(int n);
extern "C" void CudaWrapper_AddStreams(int n);
extern "C" void* CudaWrapper_returnStreams();
extern "C" int CudaWrapper_returnNStreams();
#endif // _CUDA_DATA_WRAPPER_H_

View File

@ -1,24 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// empty file to obey common make rule

View File

@ -1,194 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX domain
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "domain_cu.h"
#include "domain_kernel.cu"
void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata,int size)
{
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_Domain Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) );
}
void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(boxhi) , sdata->domain.boxhi , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , 3*sizeof(int));
cudaMemcpyToSymbol(MY_CONST(triclinic) , & sdata->domain.triclinic , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(boxlo_lamda) , sdata->domain.boxlo_lamda , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(boxhi_lamda) , sdata->domain.boxhi_lamda , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(prd_lamda) , sdata->domain.prd_lamda , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , 6*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(h_inv) , sdata->domain.h_inv , 6*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(h_rate) , sdata->domain.h_rate , 6*sizeof(V_FLOAT));
cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata , sizeof(int*));
}
void Cuda_Domain_Init(cuda_shared_data* sdata)
{
Cuda_Domain_UpdateNmax(sdata);
Cuda_Domain_UpdateDomain(sdata);
}
void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent)
{
Cuda_Domain_UpdateNmax(sdata);
//if(sdata->domain.update)
Cuda_Domain_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int box_change=0;
if(extent) box_change=1;
int sharedmem=0;
if(box_change) sharedmem=6*sizeof(X_FLOAT);
int3 layout=getgrid(sdata->atom.nlocal,sharedmem);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
sharedmem*=threads.x;
if((box_change)&&(sdata->buffer_new or (6*sizeof(X_FLOAT)*grid.x*grid.y>sdata->buffersize)))
Cuda_Domain_UpdateBuffer(sdata,layout.x*layout.y*6*sizeof(X_FLOAT));
Domain_PBC_Kernel<<<grid, threads,sharedmem>>>(deform_remap,deform_groupbit,box_change);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed");
if(box_change)
{
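//_buffer now holds six consecutive blocks of layout.x*layout.y values each:
//the per-block min and max of x, y and z; finish the reduction on the host.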
X_FLOAT buf2[6*layout.x*layout.y];
X_FLOAT* buf=buf2;
int flag;
cudaMemcpy(buf, sdata->buffer, 6*layout.x*layout.y*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
//printf("Flag: %i\n",flag);
X_FLOAT min,max;
min=1.0*BIG;
max=-1.0*BIG;
for(int i=0;i<layout.x*layout.y;i++)
{
if(buf[i]<min) min=buf[i];
if(buf[i+layout.x*layout.y]>max) max=buf[i+layout.x*layout.y];
}
extent[0]=min;
extent[1]=max;
buf+=2*layout.x*layout.y;
min=1.0*BIG;
max=-1.0*BIG;
for(int i=0;i<layout.x*layout.y;i++)
{
if(buf[i]<min) min=buf[i];
if(buf[i+layout.x*layout.y]>max) max=buf[i+layout.x*layout.y];
}
extent[2]=min;
extent[3]=max;
buf+=2*layout.x*layout.y;
min=1.0*BIG;
max=-1.0*BIG;
for(int i=0;i<layout.x*layout.y;i++)
{
if(buf[i]<min) min=buf[i];
if(buf[i+layout.x*layout.y]>max) max=buf[i+layout.x*layout.y];
}
extent[4]=min;
extent[5]=max;
//printf("Extent: %lf %lf %lf %lf %lf %lf\n",extent[0],extent[1],extent[2],extent[3],extent[4],extent[5]);
/* int n=grid.x*grid.y;
if(n<128) threads.x=32;
else if(n<256) threads.x=64;
else threads.x=128;
sharedmem=n*sizeof(X_FLOAT);
grid.x=6;
grid.y=1;
Domain_reduceBoxExtent<<<grid, threads,sharedmem>>>(extent,n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_reduceBoxExtent: Kernel execution failed");*/
}
}
void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n)
{
Cuda_Domain_UpdateNmax(sdata);
//if(sdata->domain.update)
Cuda_Domain_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Domain_lamda2x_Kernel<<<grid, threads,0>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed");
}
void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n)
{
Cuda_Domain_UpdateNmax(sdata);
//if(sdata->domain.update)
Cuda_Domain_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Domain_x2lamda_Kernel<<<grid, threads,0>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed");
}

View File

@ -1,29 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata);
extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent=NULL);
extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n);
extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n);

View File

@ -1,269 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ X_FLOAT sharedmem[];
#define BIG 1e10
__global__ void Domain_PBC_Kernel(int deform_remap,int deform_groupbit,int box_change)
{
int idim,otherdims;
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
X_FLOAT lo[3];
X_FLOAT hi[3];
X_FLOAT* period;
if (_triclinic == 0) {
lo[0] = _boxlo[0];
lo[1] = _boxlo[1];
lo[2] = _boxlo[2];
hi[0] = _boxhi[0];
hi[1] = _boxhi[1];
hi[2] = _boxhi[2];
period = _prd;
} else {
lo[0] = _boxlo_lamda[0];
lo[1] = _boxlo_lamda[1];
lo[2] = _boxlo_lamda[2];
hi[0] = _boxhi_lamda[0];
hi[1] = _boxhi_lamda[1];
hi[2] = _boxhi_lamda[2];
period = _prd_lamda;
}
X_FLOAT tmpx=X_F(0.5)*(hi[0]+lo[0]);
X_FLOAT tmpy=X_F(0.5)*(hi[1]+lo[1]);
X_FLOAT tmpz=X_F(0.5)*(hi[2]+lo[2]);
X_FLOAT* buf=(X_FLOAT*) _buffer;
buf+=blockIdx.x*gridDim.y+blockIdx.y;
buf[0]=tmpx;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpx;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpy;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpy;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpz;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpz;
if(i<_nlocal)
{
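//Each periodic wrap below also updates the matching 10-bit image counter
//packed into _image[i] (bits 0-9: x, bits 10-19: y, bits 20-29: z), as in
//the CPU implementation of Domain::pbc().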
if (_periodicity[0]) {
if (_x[i] < lo[0]) {
_x[i] += period[0];
if (deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0];
idim = _image[i] & 1023;
otherdims = _image[i] ^ idim;
idim--;
idim &= 1023;
_image[i] = otherdims | idim;
}
if (_x[i] >= hi[0]) {
_x[i] -= period[0];
_x[i] = MAX(_x[i],lo[0]);
if (deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0];
idim = _image[i] & 1023;
otherdims = _image[i] ^ idim;
idim++;
idim &= 1023;
_image[i] = otherdims | idim;
}
}
if (_periodicity[1]) {
if (_x[i+_nmax] < lo[1]) {
_x[i+_nmax] += period[1];
if (deform_remap && _mask[i] & deform_groupbit) {
_v[i] += _h_rate[5];
_v[i+_nmax] += _h_rate[1];
}
idim = (_image[i] >> 10) & 1023;
otherdims = _image[i] ^ (idim << 10);
idim--;
idim &= 1023;
_image[i] = otherdims | (idim << 10);
}
if (_x[i+_nmax] >= hi[1]) {
_x[i+_nmax] -= period[1];
_x[i+_nmax] = MAX(_x[i+_nmax],lo[1]);
if (deform_remap && _mask[i] & deform_groupbit) {
_v[i] -= _h_rate[5];
_v[i+_nmax] -= _h_rate[1];
}
idim = (_image[i] >> 10) & 1023;
otherdims = _image[i] ^ (idim << 10);
idim++;
idim &= 1023;
_image[i] = otherdims | (idim << 10);
}
}
if (_periodicity[2]) {
if (_x[i+2*_nmax] < lo[2]) {
_x[i+2*_nmax] += period[2];
if (deform_remap && _mask[i] & deform_groupbit) {
_v[i] += _h_rate[4];
_v[i+_nmax] += _h_rate[3];
_v[i+2*_nmax] += _h_rate[2];
}
idim = _image[i] >> 20;
otherdims = _image[i] ^ (idim << 20);
idim--;
idim &= 1023;
_image[i] = otherdims | (idim << 20);
}
if (_x[i+2*_nmax] >= hi[2]) {
_x[i+2*_nmax] -= period[2];
_x[i+2*_nmax] = MAX(_x[i+2*_nmax],lo[2]);
if (deform_remap && _mask[i] & deform_groupbit) {
_v[i] -= _h_rate[4];
_v[i+_nmax] -= _h_rate[3];
_v[i+2*_nmax] -= _h_rate[2];
}
idim = _image[i] >> 20;
otherdims = _image[i] ^ (idim << 20);
idim++;
idim &= 1023;
_image[i] = otherdims | (idim << 20);
}
}
if(box_change)
{
tmpx=_x[i];
tmpy=_x[i+_nmax];
tmpz=_x[i+2*_nmax];
}
}
__syncthreads();
if(box_change)
{
X_FLOAT minx=BIG;
X_FLOAT maxx=-BIG;
X_FLOAT miny=BIG;
X_FLOAT maxy=-BIG;
X_FLOAT minz=BIG;
X_FLOAT maxz=-BIG;
if (not _periodicity[0]) {
sharedmem[threadIdx.x]=tmpx;
minOfBlock(sharedmem);
minx=sharedmem[0];
__syncthreads();
sharedmem[threadIdx.x]=tmpx;
maxOfBlock(sharedmem);
maxx=sharedmem[0];
__syncthreads();
}
else {minx=lo[0];maxx=hi[0];}
if (not _periodicity[1]) {
sharedmem[threadIdx.x]=tmpy;
minOfBlock(sharedmem);
miny=sharedmem[0];
__syncthreads();
sharedmem[threadIdx.x]=tmpy;
maxOfBlock(sharedmem);
maxy=sharedmem[0];
__syncthreads();
}
else {miny=lo[1];maxy=hi[1];}
if (not _periodicity[2]) {
sharedmem[threadIdx.x]=tmpz;
minOfBlock(sharedmem);
minz=sharedmem[0];
__syncthreads();
sharedmem[threadIdx.x]=tmpz;
maxOfBlock(sharedmem);
maxz=sharedmem[0];
__syncthreads();
}
else {minz=lo[2];maxz=hi[2];}
if(threadIdx.x==0)
{
buf=(X_FLOAT*) _buffer;
buf+=blockIdx.x*gridDim.y+blockIdx.y;
buf[0]=minx;
buf+=gridDim.x*gridDim.y;
buf[0]=maxx;
buf+=gridDim.x*gridDim.y;
buf[0]=miny;
buf+=gridDim.x*gridDim.y;
buf[0]=maxy;
buf+=gridDim.x*gridDim.y;
buf[0]=minz;
buf+=gridDim.x*gridDim.y;
buf[0]=maxz;
}
}
}
__global__ void Domain_reduceBoxExtent(double* extent,int n)
{
X_FLOAT* buf=(X_FLOAT*) _buffer;
buf+=blockIdx.x*n;
copyGlobToShared(buf,sharedmem,n);
if(blockIdx.x%2==0)
minOfData(sharedmem,n);
else
maxOfData(sharedmem,n);
extent[blockIdx.x]=sharedmem[0];
}
__global__ void Domain_lamda2x_Kernel(int n)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
X_FLOAT ytmp = _x[i+_nmax];
X_FLOAT ztmp = _x[i+2*_nmax];
_x[i] = _h[0]*_x[i] + _h[5]*ytmp + _h[4]*ztmp + _boxlo[0];
_x[i+_nmax] = _h[1]*ytmp + _h[3]*ztmp + _boxlo[1];
_x[i+2*_nmax] = _h[2]*ztmp + _boxlo[2];
}
}
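//These conversion kernels use the LAMMPS box matrix stored as a 6-vector
//h = (xx,yy,zz,yz,xz,xy): lamda2x computes x = h*lamda + boxlo, and
//Domain_x2lamda_Kernel below applies the inverse via _h_inv.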
__global__ void Domain_x2lamda_Kernel(int n)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
X_FLOAT delta[3];
if(i<n)
{
delta[0] = _x[i] - _boxlo[0];
delta[1] = _x[i+_nmax] - _boxlo[1];
delta[2] = _x[i+2*_nmax] - _boxlo[2];
_x[i] = _h_inv[0]*delta[0] + _h_inv[5]*delta[1] + _h_inv[4]*delta[2];
_x[i+_nmax] = _h_inv[1]*delta[1] + _h_inv[3]*delta[2];
_x[i+2*_nmax] = _h_inv[2]*delta[2];
}
}

View File

@ -1,103 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
//#define CUDA_PRECISION 1
#include "cuda_precision.h"
#include "cuda_common.h"
struct FFT_DATA{
FFT_FLOAT re;
FFT_FLOAT im;
};
#include "fft3d_cuda_cu.h"
#include "fft3d_cuda_kernel.cu"
#include <stdio.h>
void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow)
{
dim3 grid;
grid.x=nslow;
grid.y=nmid;
grid.z=1;
dim3 threads;
threads.x=nfast;
threads.y=1;
threads.z=1;
cudaThreadSynchronize();
initfftdata_kernel<<<grid,threads,0>>>(in,out);
cudaThreadSynchronize();
MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n",cudaGetErrorString(cudaGetLastError())));
}
void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow)
{
dim3 grid;
grid.x=nslow;
grid.y=nmid;
grid.z=1;
dim3 threads;
threads.x=nfast*2;
threads.y=1;
threads.z=1;
permute_kernel<<<grid,threads,0>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out);
cudaThreadSynchronize();
MYDBG(printf("ERROR-CUDA permute_kernel: %s\n",cudaGetErrorString(cudaGetLastError())));
}
void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow)
{
dim3 grid;
grid.x=nslow;
grid.y=nmid;
grid.z=1;
dim3 threads;
threads.x=nfast*2;
threads.y=1;
threads.z=1;
permute_kernel<<<grid,threads,0>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out);
cudaThreadSynchronize();
}
void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo)
{
dim3 grid;
grid.x=(ihi-ilo+1);
grid.y=(jhi-jlo+1);
grid.z=1;
dim3 threads;
threads.x=(khi-klo+1)*2;
threads.y=1;
threads.z=1;
permute_part_kernel<<<grid,threads,0>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out,nfast,nmid,nslow,ihi,ilo,jhi,jlo,khi,klo);
cudaThreadSynchronize();
}
void FFTsyncthreads()
{
cudaThreadSynchronize();
}

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow);
extern "C" void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow);
extern "C" void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow);
extern "C" void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo);
extern "C" void FFTsyncthreads();

View File

@ -1,44 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void initfftdata_kernel(double* in,FFT_FLOAT* out)
{
out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x];
out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)+1]=0;
}
__global__ void permute_kernel(FFT_FLOAT* in,FFT_FLOAT* out)
{
out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x];
}
__global__ void permute_scale_kernel(FFT_FLOAT* in,FFT_FLOAT* out)
{
out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]*gridDim.x*gridDim.y*blockDim.x*0.5;
}
__global__ void permute_part_kernel(FFT_FLOAT* in,FFT_FLOAT* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo)
{
{out[2*((threadIdx.x/2)*(ihi-ilo+1)*(jhi-jlo+1)+(blockIdx.x)*(jhi-jlo+1)+blockIdx.y-jlo)+threadIdx.x-2*(threadIdx.x/2)]=in[2*(blockIdx.x+ilo)*nmid*nslow+2*(blockIdx.y+jlo)*nmid+threadIdx.x+2*klo]; }
}

View File

@ -1,89 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_add_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_addforce_cuda_cu.h"
#include "fix_addforce_cuda_kernel.cu"
void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixAddForceCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixAddForceCuda_UpdateNmax(sdata);
}
void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal)
{
if(sdata->atom.update_nmax)
Cuda_FixAddForceCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixAddForceCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixAddForceCuda_PostForce_Kernel<<<grid, threads,threads.x*4*sizeof(F_FLOAT)>>> (groupbit,axvalue,ayvalue,azvalue);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=4;
grid.y=1;
threads.x=512;
reduce_foriginal<<<grid, threads,threads.x*sizeof(F_FLOAT)>>> (oldgrid,aforiginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal);

View File

@ -1,86 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue)
{
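//sharedmem collects, per thread: the potential energy of the added force
//(-xvalue*x - yvalue*y - zvalue*z) and the original force components before
//the constant force is added; per-block sums go to _buffer for reduce_foriginal.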
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
sharedmem[threadIdx.x+3*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit)
//if (iregion >= 0 &&
//match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported
{
sharedmem[threadIdx.x]=-xvalue*_x[i] - yvalue*_x[i+1*_nmax] - zvalue*_x[i+2*_nmax];
sharedmem[threadIdx.x+blockDim.x]=_f[i];
sharedmem[threadIdx.x+2*blockDim.x]=_f[i+1*_nmax];
sharedmem[threadIdx.x+3*blockDim.x]=_f[i+2*_nmax];
_f[i] += xvalue;
_f[i+1*_nmax] += yvalue;
_f[i+2*_nmax] += zvalue;
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
reduceBlock(&sharedmem[3*blockDim.x]);
F_FLOAT* buffer=(F_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
}
}
__global__ void reduce_foriginal(int n,F_FLOAT* foriginal)
{
int i=0;
sharedmem[threadIdx.x]=0;
F_FLOAT myforig=0.0;
F_FLOAT* buf=(F_FLOAT*) _buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
foriginal[blockIdx.x]=myforig;
}

View File

@ -1,104 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_ave_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_aveforce_cuda_cu.h"
#include "fix_aveforce_cuda_kernel.cu"
void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixAveForceCuda_UpdateNmax(sdata);
}
void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal)
{
if(sdata->atom.update_nmax)
Cuda_FixAveForceCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixAveForceCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixAveForceCuda_PostForce_FOrg_Kernel<<<grid, threads,threads.x*4*sizeof(F_FLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=4;
grid.y=1;
threads.x=512;
Cuda_FixAveForceCuda_reduce_foriginal<<<grid, threads,threads.x*sizeof(F_FLOAT)>>> (oldgrid,aforiginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed");
}
void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue)
{
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixAveForceCuda_PostForce_Set_Kernel<<<grid, threads,0>>> (groupbit,xflag,yflag,zflag,axvalue,ayvalue,azvalue);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed");
}
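
Every launcher in these files leans on a getgrid helper from crm_cuda_utils.cu, which this diff does not include. Its call sites fix the contract: layout.z becomes the block size, layout.x and layout.y the 2D grid, and the optional second argument is the per-thread shared memory that presumably caps the block size. A hypothetical reconstruction under those assumptions:

// Hypothetical sketch of getgrid (the real one lives in crm_cuda_utils.cu).
// The 2D grid exists because compute-capability-1.x devices limit each grid
// dimension to 65535 blocks; the kernels' i < _nlocal guard absorbs overshoot.
static int3 getgrid_sketch(int n, int threadsPerBlock = 256)
{
    int3 layout;
    layout.z = threadsPerBlock;                  // threads per block
    int blocks = (n + layout.z - 1) / layout.z;  // ceil(n / block size)
    layout.x = blocks;
    layout.y = 1;
    while (layout.x > 65535) {                   // split so layout.x stays legal,
        layout.x = (layout.x + 1) / 2;           // keeping layout.x*layout.y >= blocks
        layout.y *= 2;
    }
    return layout;
}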

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal);
extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue);

View File

@ -1,87 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
sharedmem[threadIdx.x+3*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit) {
sharedmem[threadIdx.x]=_f[i];
sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
sharedmem[threadIdx.x+3*blockDim.x]=1;
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
reduceBlock(&sharedmem[3*blockDim.x]);
F_FLOAT* buffer=(F_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
}
}
__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n,F_FLOAT* foriginal)
{
int i=0;
sharedmem[threadIdx.x]=0;
F_FLOAT myforig=0.0;
F_FLOAT* buf=(F_FLOAT*) _buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
foriginal[blockIdx.x]=myforig;
}
__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit,int xflag, int yflag, int zflag,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
if(xflag) _f[i] = xvalue;
if(yflag) _f[i+1*_nmax] = yvalue;
if(zflag) _f[i+2*_nmax] = zvalue;
}
}
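
The other assumed helper is reduceBlock, also from crm_cuda_utils.cu: it sums a shared-memory array of blockDim.x values into slot 0. A minimal stand-alone sketch of that tree reduction, using plain float where the real code uses F_FLOAT and assuming the power-of-two block sizes these launches use:

// Block-wide shared-memory sum: after the call, shared[0] holds the sum of
// shared[0..blockDim.x-1]; the other slots are clobbered.
__device__ void reduceBlock_sketch(float* shared)
{
    __syncthreads();                             // publish each thread's own store
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s)
            shared[threadIdx.x] += shared[threadIdx.x + s];
        __syncthreads();                         // finish this level before the next
    }
}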

View File

@ -1,54 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_enforce2d_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_enforce2d_cuda_cu.h"
#include "fix_enforce2d_cuda_kernel.cu"
void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit)
{
if(sdata->atom.update_nmax)
Cuda_FixEnforce2dCuda_Init(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixEnforce2dCuda_PostForce_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed");
}
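
Cuda_FixEnforce2dCuda_Init shows the binding pattern all of these files share: device pointers are copied once into __constant__ symbols (MY_CONST name-mangles them per MY_PREFIX), so kernels dereference them without taking extra arguments. A self-contained sketch of the same mechanism, with hypothetical names:

// Bind a device pointer into constant memory, then use it from a kernel.
__constant__ float* _v_sketch;                   // stands in for MY_CONST(v)

__global__ void zero_z(int nlocal, int nmax)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nlocal)
        _v_sketch[i + 2 * nmax] = 0.0f;          // z component in the SoA layout
}

int main()
{
    const int nmax = 1024, nlocal = 1000;
    float* v_dev;
    cudaMalloc(&v_dev, 3 * nmax * sizeof(float));
    cudaMemset(v_dev, 0, 3 * nmax * sizeof(float));
    // One-time bind, as Cuda_FixEnforce2dCuda_Init does for _v, _f, _mask:
    cudaMemcpyToSymbol(_v_sketch, &v_dev, sizeof(float*));
    zero_z<<<(nlocal + 255) / 256, 256>>>(nlocal, nmax);
    cudaDeviceSynchronize();
    cudaFree(v_dev);
    return 0;
}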

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit);

View File

@ -1,33 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
_v[i+2*_nmax] = V_F(0.0);
_f[i+2*_nmax] = F_F(0.0);
}
}
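
The i + dim*_nmax indexing here, as in every kernel above, is a structure-of-arrays layout: the x, y and z components each occupy a contiguous block of _nmax entries, so consecutive threads read consecutive addresses and the loads coalesce. A hypothetical helper that spells the arithmetic out:

// Component dim (0=x, 1=y, 2=z) of atom i in a 3*nmax structure-of-arrays
// array; the deleted kernels inline this expression directly.
__device__ __forceinline__ float& soa_elem(float* base, int i, int dim, int nmax)
{
    return base[i + dim * nmax];
}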

View File

@ -1,95 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_freeze_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_freeze_cuda_cu.h"
#include "fix_freeze_cuda_kernel.cu"
void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
}
void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixFreezeCuda_UpdateNmax(sdata);
}
void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal)
{
if(sdata->atom.update_nmax)
Cuda_FixFreezeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixFreezeCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixFreezeCuda_PostForce_Kernel<<<grid, threads,threads.x*3*sizeof(F_FLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force compute Kernel execution failed");
int oldgrid=grid.x;
grid.x=3;
threads.x=512;
Cuda_FixFreezeCuda_Reduce_FOriginal<<<grid, threads,threads.x*sizeof(F_FLOAT)>>> (oldgrid,foriginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force reduce Kernel execution failed");
}
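
All the *_UpdateBuffer functions implement one grow-only policy: reallocate the shared scratch buffer only when a request exceeds the current capacity, and bump buffer_new so dependent modules re-bind the device pointer. The policy in isolation, with the CudaWrapper_* calls reduced to the plain runtime API:

// Grow-only device scratch buffer, sketching the policy above.
struct ScratchBuffer {
    void* ptr;
    int   size;
    int   generation;              // plays the role of sdata->buffer_new
};

static void ensure_capacity(ScratchBuffer& b, int needed)
{
    if (b.size >= needed) return;  // still large enough: keep the allocation
    cudaFree(b.ptr);               // CudaWrapper_FreeCudaData in the original
    cudaMalloc(&b.ptr, needed);    // CudaWrapper_AllocCudaData
    b.size = needed;
    b.generation++;                // tell dependents the pointer changed
}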

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal);

View File

@ -1,82 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit) {
sharedmem[threadIdx.x]=_f[i];
sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
_f[i] = F_F(0.0);
_f[i+1*_nmax] = F_F(0.0);
_f[i+2*_nmax] = F_F(0.0);
_torque[i] = F_F(0.0);
_torque[i+1*_nmax] = F_F(0.0);
_torque[i+2*_nmax] = F_F(0.0);
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
F_FLOAT* buffer=(F_FLOAT*)_buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
}
}
__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal)
{
int i=0;
sharedmem[threadIdx.x]=0;
F_FLOAT myforig=0.0;
F_FLOAT* buf=(F_FLOAT*)_buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
foriginal[blockIdx.x]=myforig;
}

View File

@ -1,89 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_gravity_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_gravity_cuda_cu.h"
#include "fix_gravity_cuda_kernel.cu"
void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
}
void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) );
}
void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixGravityCuda_UpdateNmax(sdata);
}
void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc)
{
if(sdata->atom.update_nmax)
Cuda_FixGravityCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixGravityCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixGravityCuda_PostForce_Kernel<<<grid, threads>>> (groupbit,xacc,yacc,zacc);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixGravityCuda_PostForce: fix add_force post_force compute Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc);

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
F_FLOAT mass = _rmass_flag?_rmass[i]:_mass[_type[i]];
_f[i] += mass*xacc;
_f[i+1*_nmax] += mass*yacc;
_f[i+2*_nmax] += mass*zacc;
}
}
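
The kernel matches the host-side fix gravity post_force loop: per-atom mass (rmass, provided by granular and similar atom styles) takes precedence over the per-type mass table. A CPU restatement for reference:

// CPU reference for the kernel above (forces stored per atom here).
void gravity_post_force_cpu(int nlocal, const int* mask, int groupbit,
                            int rmass_flag, const double* rmass,
                            const double* mass, const int* type,
                            double (*f)[3],
                            double xacc, double yacc, double zacc)
{
    for (int i = 0; i < nlocal; i++)
        if (mask[i] & groupbit) {
            double m = rmass_flag ? rmass[i] : mass[type[i]];
            f[i][0] += m * xacc;
            f[i][1] += m * yacc;
            f[i][2] += m * zacc;
        }
}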

View File

@ -1,220 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_nh_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nh_cuda_cu.h"
#include "fix_nh_cuda_kernel.cu"
void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(maxhold) , & sdata->atom.maxhold, sizeof(int) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
}
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size=(unsigned)10*sizeof(int);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixNHCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
}
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
{
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(dtf) , & dtf , sizeof(V_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); //
Cuda_FixNHCuda_UpdateNmax(sdata);
}
void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
F_FLOAT3 factor = {factor_h[0],factor_h[1],factor_h[2]};
F_FLOAT3 factor2 = {0.0, 0.0, 0.0};  // zeroed so the orthogonal path passes defined values
if(p_triclinic) {factor2.x=factor_h[3]; factor2.y=factor_h[4]; factor2.z=factor_h[5];}
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nh_v_press_Kernel<<<grid, threads>>> (groupbit,factor,p_triclinic,factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
F_FLOAT3 factor = {factor_h[0],factor_h[1],factor_h[2]};
F_FLOAT3 factor2 = {0.0, 0.0, 0.0};  // zeroed so the orthogonal path passes defined values
if(p_triclinic) {factor2.x=factor_h[3]; factor2.y=factor_h[4]; factor2.z=factor_h[5];}
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press pre Kernel execution failed");
FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel<<<grid, threads>>> (groupbit,factor,p_triclinic,factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta,int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nh_v_temp_Kernel<<<grid, threads>>> (groupbit,factor_eta);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed");
}
void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit,int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nve_v_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed");
}
void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit,int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
cudaMemset(sdata->buffer,0,sizeof(int));
FixNHCuda_nve_x_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
int reneigh_flag;
cudaMemcpy((void*) (&reneigh_flag), sdata->buffer, sizeof(int),cudaMemcpyDeviceToHost);
sdata->atom.reneigh_flag+=reneigh_flag;
CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed");
}
void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata,int groupbit, double* factor_h, int mynlocal,int p_triclinic)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
F_FLOAT3 factor = {factor_h[0],factor_h[1],factor_h[2]};
F_FLOAT3 factor2 = {0.0, 0.0, 0.0};  // zeroed so the orthogonal path passes defined values
if(p_triclinic) {factor2.x=factor_h[3]; factor2.y=factor_h[4]; factor2.z=factor_h[5];}
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel<<<grid, threads>>> (groupbit,factor,p_triclinic,factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias: Kernel execution failed");
}
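
Four of the wrappers above repeat the same clock_gettime bookkeeping, accumulating the wall time of the symbol updates into cuda_timings.test1. The delta computation, factored out:

#include <time.h>

// Elapsed wall time between two CLOCK_REALTIME samples, in seconds;
// identical to the inline expression accumulated into test1 above.
static double elapsed_seconds(const timespec& t1, const timespec& t2)
{
    return (t2.tv_sec - t1.tv_sec) + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec);
}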

View File

@ -1,32 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta,int mynlocal);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit,int mynlocal);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit,int mynlocal);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata,int groupbit, double* factor_h, int mynlocal,int p_triclinic);//mynlocal can be nfirst if firstgroup==igroup see cpp

View File

@ -1,187 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
static inline __device__ void check_distance(X_FLOAT &xtmp,X_FLOAT &ytmp,X_FLOAT &ztmp,int &i,int groupbit)
{
if(_dist_check)
{
X_FLOAT d=X_F(0.0);
if(i<_nlocal)
{
X_FLOAT tmp=xtmp-_xhold[i];
d=tmp*tmp;
tmp=ytmp-_xhold[i+_maxhold];
d+=tmp*tmp;
tmp=ztmp-_xhold[i+2*_maxhold];
d+=tmp*tmp;
d=((_mask[i] & groupbit))?d:X_F(0.0);
}
if(not __all(d<=_triggerneighsq))
_reneigh_flag[0]=1;
}
}
__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
V_FLOAT* my_v = _v + i;
V_FLOAT vx=my_v[0];
V_FLOAT vy=my_v[_nmax];
V_FLOAT vz=my_v[2*_nmax];
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
if(p_triclinic) {
vx += vy*factor2.z + vz*factor2.y;
vy += vz*factor2.x;
}
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
my_v[0] = vx;
my_v[_nmax] = vy;
my_v[2*_nmax] = vz;
}
}
__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
V_FLOAT* my_v = _v + i;
my_v[0]*=factor_eta;
my_v[_nmax]*=factor_eta;
my_v[2*_nmax]*=factor_eta;
}
}
__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*= V_F(1.0) / _rmass[i];
else dtfm*= V_F(1.0) / _mass[_type[i]];
V_FLOAT vx=my_v[0];
V_FLOAT vy=my_v[_nmax];
V_FLOAT vz=my_v[2*_nmax];
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
if(p_triclinic) {
vx += vy*factor2.z + vz*factor2.y;
vy += vz*factor2.x;
}
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
my_v[0] = vx + dtfm * my_f[0];
my_v[_nmax] = vy + dtfm * my_f[_nmax];
my_v[2*_nmax] = vz + dtfm * my_f[_nmax*2];
}
}
__global__ void FixNHCuda_nve_v_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i];
else dtfm*=V_F(1.0) / _mass[_type[i]];
*my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax;
*my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax;
*my_v = (*my_v + dtfm*(*my_f));
}
}
__global__ void FixNHCuda_nve_x_Kernel(int groupbit)
{
X_FLOAT xtmp=X_F(0.0),ytmp=X_F(0.0),ztmp=X_F(0.0); // zero-init: check_distance may read these for atoms outside the group
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
V_FLOAT* my_v = _v + i;
X_FLOAT* my_x = _x + i;
xtmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax;
ytmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax;
ztmp = *my_x += _dtv * *my_v;
}
check_distance(xtmp,ytmp,ztmp,i,groupbit);
}
__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i];
else dtfm*=V_F(1.0) / _mass[_type[i]];
V_FLOAT vx = my_v[0] + dtfm*my_f[0];
V_FLOAT vy = my_v[_nmax] + dtfm*my_f[_nmax];
V_FLOAT vz = my_v[2*_nmax] + dtfm*my_f[2*_nmax];
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
if(p_triclinic) {
vx += vy*factor2.z + vz*factor2.y;
vy += vz*factor2.x;
}
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
my_v[0] = vx;
my_v[_nmax] = vy;
my_v[2*_nmax] = vz;
}
}
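
One detail worth flagging: multiplying by factor both before and after the triclinic coupling is not an accidental double scaling; it mirrors the symmetric, Trotter-split barostat update of the host-side fix nh code. A one-atom CPU restatement of the press update:

// One-atom restatement of FixNHCuda_nh_v_press_Kernel.
static void nh_v_press_one(double v[3], const double factor[3],
                           int p_triclinic, const double factor2[3])
{
    v[0] *= factor[0]; v[1] *= factor[1]; v[2] *= factor[2];
    if (p_triclinic) {                   // off-diagonal cell coupling
        v[0] += v[1] * factor2[2] + v[2] * factor2[1];
        v[1] += v[2] * factor2[0];
    }
    v[0] *= factor[0]; v[1] *= factor[1]; v[2] *= factor[2];
}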

View File

@ -1,161 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_nve_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nve_cuda_cu.h"
#include "fix_nve_cuda_kernel.cu"
void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_BINNING
cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_x) , & sdata->atom.binned_x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_rmass) , & sdata->atom.binned_rmass .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
#else
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(maxhold) , & sdata->atom.maxhold, sizeof(int) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
#endif
}
void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size=(unsigned)10*sizeof(int);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixNVECuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
}
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
{
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(dtf) , & dtf , sizeof(V_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); //
Cuda_FixNVECuda_UpdateNmax(sdata);
}
void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNVECuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNVECuda_UpdateBuffer(sdata);
#ifdef CUDA_USE_BINNING
dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
dim3 threads(sdata->domain.bin_nmax, 1, 1);
FixNVECuda_InitialIntegrate_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate (binning) Kernel execution failed");
#else
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
cudaMemset(sdata->buffer,0,sizeof(int));
FixNVECuda_InitialIntegrate_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
int reneigh_flag;
cudaMemcpy((void*) (&reneigh_flag), sdata->buffer, sizeof(int),cudaMemcpyDeviceToHost);
sdata->atom.reneigh_flag+=reneigh_flag;
CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed");
#endif
}
void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNVECuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNVECuda_UpdateBuffer(sdata);
#ifdef CUDA_USE_BINNING
dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
dim3 threads(sdata->domain.bin_nmax, 1, 1);
FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed");
#else
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed");
#endif
}
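
The two integration entry points are the standard velocity-Verlet halves: the initial step kicks v by dtf*f/m and drifts x by dtv*v, the final step kicks v once more with the freshly computed forces. A one-atom CPU sketch:

// Velocity-Verlet halves as implemented by the kernels above.
static void nve_initial_one(double x[3], double v[3], const double f[3],
                            double m, double dtv, double dtf)
{
    double dtfm = dtf / m;
    for (int d = 0; d < 3; d++) {
        v[d] += dtfm * f[d];             // first half kick
        x[d] += dtv * v[d];              // full drift
    }
}

static void nve_final_one(double v[3], const double f[3], double m, double dtf)
{
    double dtfm = dtf / m;
    for (int d = 0; d < 3; d++)
        v[d] += dtfm * f[d];             // second half kick, new forces
}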

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);

View File

@ -1,137 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
static inline __device__ void check_distance(X_FLOAT &xtmp,X_FLOAT &ytmp,X_FLOAT &ztmp,int &i,int groupbit)
{
if(_dist_check)
{
X_FLOAT d=X_F(0.0);
if(i < _nlocal)   // guard the _xhold reads, as the fix_nh variant does
{
X_FLOAT tmp=xtmp-_xhold[i];
d=tmp*tmp;
tmp=ytmp-_xhold[i+_maxhold];
d+=tmp*tmp;
tmp=ztmp-_xhold[i+2*_maxhold];
d+=tmp*tmp;
d=(_mask[i] & groupbit)?d:X_F(0.0);
}
if(!__all(d<=_triggerneighsq))
_reneigh_flag[0]=1;
}
}
__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
{
X_FLOAT xtmp=X_F(0.0),ytmp=X_F(0.0),ztmp=X_F(0.0); // zero-init: check_distance may read these for atoms outside the group
#ifdef CUDA_USE_BINNING
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(threadIdx.x < _bin_count_local[bin])
{
const int i = 3*blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit)
{
F_FLOAT* my_f = _binned_f + i;
V_FLOAT* my_v = _binned_v + i;
X_FLOAT* my_x = _binned_x + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*= V_F(1.0) / _binned_rmass[i];
else dtfm*= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
V_FLOAT v_mem;
v_mem = *my_v += dtfm * (*my_f); xtmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x;
v_mem = *my_v += dtfm * (*my_f); ytmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x;
v_mem = *my_v += dtfm * (*my_f); ztmp = *my_x += _dtv * v_mem;
}
}
#else
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
X_FLOAT* my_x = _x + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*= V_F(1.0) / _rmass[i];
else dtfm*= V_F(1.0) / _mass[_type[i]];
V_FLOAT v_mem;
v_mem = *my_v += dtfm * (*my_f); xtmp=*my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax;
v_mem = *my_v += dtfm * (*my_f); ytmp=*my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax;
v_mem = *my_v += dtfm * (*my_f); ztmp=*my_x += _dtv * v_mem;
}
#endif
check_distance(xtmp,ytmp,ztmp,i,groupbit);
}
__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
{
#ifdef CUDA_USE_BINNING
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(threadIdx.x < _bin_count_local[bin])
{
const int i = 3*blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit)
{
F_FLOAT* my_f = _binned_f + i;
V_FLOAT* my_v = _binned_v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*= V_F(1.0) / _binned_rmass[i];
else dtfm*= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
*my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x;
*my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x;
*my_v += dtfm * (*my_f);
}
}
#else
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i];
else dtfm*=V_F(1.0) / _mass[_type[i]];
*my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax;
*my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax;
*my_v += dtfm * (*my_f);
}
#endif
}
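
check_distance uses the warp vote __all so that warps whose atoms have all stayed inside the skin skip the global flag write entirely. __all was the CUDA 4.x-era spelling; CUDA 9 and later spell it __all_sync. The same trick as a stand-alone kernel in the modern form:

// Warp-voted "did any atom move too far?", mirroring check_distance above.
__global__ void any_exceeds(const float* dsq, int n, float trigger, int* flag)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    float d = (i < n) ? dsq[i] : 0.0f;            // out-of-range threads vote 0
    if (!__all_sync(0xffffffffu, d <= trigger))   // true only if every lane passes
        *flag = 1;                                // clean warps skip the store
}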

View File

@ -1,93 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_set_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_set_force_cuda_cu.h"
#include "fix_set_force_cuda_kernel.cu"
void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
}
void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixSetForceCuda_UpdateNmax(sdata);
}
void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz)
{
if(sdata->atom.update_nmax)
Cuda_FixSetForceCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixSetForceCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixSetForceCuda_PostForce_Kernel<<<grid, threads,threads.x*3*sizeof(F_FLOAT)>>> (groupbit,xvalue,yvalue,zvalue,flagx,flagy,flagz);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed");
int oldgrid=grid.x;
grid.x=3;
threads.x=512;
Cuda_FixSetForceCuda_Reduce_FOriginal<<<grid, threads,threads.x*sizeof(F_FLOAT)>>> (oldgrid,foriginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed");
}
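
cudaThreadSynchronize plus CUT_CHECK_ERROR (a macro from the old CUDA SDK cutil headers) follows every launch in these files; both spellings are long deprecated. The same guard in current API terms:

#include <cstdio>

// Modern equivalent of cudaThreadSynchronize() + CUT_CHECK_ERROR(msg).
static void sync_and_check(const char* msg)
{
    cudaError_t err = cudaGetLastError();         // launch-configuration errors
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();            // errors raised while running
    if (err != cudaSuccess)
        fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
}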

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz);

View File

@ -1,79 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,int flagx,int flagy,int flagz)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit) {
sharedmem[threadIdx.x]=_f[i];
sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
if(flagx) _f[i] = xvalue;
if(flagy) _f[i+1*_nmax] = yvalue;
if(flagz) _f[i+2*_nmax] = zvalue;
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
F_FLOAT* buffer=(F_FLOAT*)_buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
}
}
__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal)
{
int i=0;
sharedmem[threadIdx.x]=0;
F_FLOAT myforig=0.0;
F_FLOAT* buf=(F_FLOAT*)_buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
foriginal[blockIdx.x]=myforig;
}

View File

@ -1,275 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_shake_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_shake_cuda_cu.h"
#include "cuda_pair_virial_kernel_nc.cu"
#define _shake_atom MY_AP(shake_atom)
#define _shake_type MY_AP(shake_type)
#define _shake_flag MY_AP(shake_flag)
#define _xshake MY_AP(xshake)
#define _dtfsq MY_AP(dtfsq)
#define _bond_distance MY_AP(bond_distance)
#define _angle_distance MY_AP(angle_distance)
#define _max_iter MY_AP(max_iter)
#define _tolerance MY_AP(tolerance)
__device__ __constant__ int* _shake_atom;
__device__ __constant__ int* _shake_type;
__device__ __constant__ int* _shake_flag;
__device__ __constant__ X_FLOAT3* _xshake;
__device__ __constant__ F_FLOAT _dtfsq;
__device__ __constant__ X_FLOAT* _bond_distance;
__device__ __constant__ X_FLOAT* _angle_distance;
__device__ __constant__ int _max_iter;
__device__ __constant__ X_FLOAT _tolerance;
#include "fix_shake_cuda_kernel.cu"
void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(map_array), & sdata->atom.map_array .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(debugdata), & sdata->debugdata , sizeof(int*) );
}
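// nmax growth implies the per-atom device arrays were reallocated, so every
// pointer mirrored into constant memory above must be refreshed; this is why
// callers invoke UpdateNmax whenever sdata->atom.update_nmax is set.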
void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity , sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , sizeof(X_FLOAT)*3 );
cudaMemcpyToSymbol(MY_CONST(triclinic) , &sdata->domain.triclinic , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , sizeof(X_FLOAT)*6 );
}
void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata,int size)
{
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixShakeCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
}
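// Note: the shared buffer only ever grows; incrementing sdata->buffer_new
// signals the other modules that cached the _buffer symbol to re-fetch it.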
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq,
void* shake_flag,void* shake_atom,void* shake_type, void* xshake,
void* bond_distance,void* angle_distance,void* virial,
int max_iter,X_FLOAT tolerance)
{
Cuda_FixShakeCuda_UpdateNmax(sdata);
Cuda_FixShakeCuda_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_CONST(shake_atom) , & shake_atom , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(shake_type) , & shake_type , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(shake_flag) , & shake_flag , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(xshake) , & xshake , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(dtfsq) , & dtfsq , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_CONST(bond_distance) , & bond_distance , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(angle_distance) , & angle_distance , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(virial) , & virial , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(max_iter) , &max_iter , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(tolerance) , &tolerance , sizeof(X_FLOAT));
if(sdata->atom.mass_host)
cudaMemcpyToSymbol(MY_CONST(mass),& sdata->atom.mass.dev_data , sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); //
cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*));
}
void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixShakeCuda_UpdateBuffer(sdata,10*sizeof(double));
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixShakeCuda_UnconstrainedUpdate_Kernel<<<grid, threads>>> ();
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed");
}
void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->domain.update)
Cuda_FixShakeCuda_UpdateDomain(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
int3 layout=getgrid(sdata->atom.nlocal,6*sizeof(ENERGY_FLOAT),64);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->buffer_new)
Cuda_FixShakeCuda_UpdateBuffer(sdata,grid.x*grid.y*6*sizeof(ENERGY_FLOAT));
BindXTypeTexture(sdata);
FixShakeCuda_Shake_Kernel<<<grid, threads,6*threads.x*sizeof(ENERGY_FLOAT)>>> (vflag,vflag_atom,list,nlist);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed");
if(vflag)
{
int n=grid.x*grid.y;
grid.x=6;
grid.y=1;
threads.x=256;
MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed");
}
}
int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata,size);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemset( sdata->flag,0,sizeof(int));
FixShakeCuda_PackComm_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz);
cudaThreadSynchronize();
cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed");
}
return 3*n;
}
int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata,size);
static int count=-1;
count++;
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
FixShakeCuda_PackComm_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
}
return 3*n;
}
void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata,size);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
FixShakeCuda_UnpackComm_Kernel<<<grid, threads,0>>>(n,first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed");
}
}

View File

@ -1,34 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq,
void* shake_flag,void* shake_atom,void* shake_type, void* xshake,
void* bond_distance,void* angle_distance,void* virial,
int max_iter,X_FLOAT tolerance);
extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata);
extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist);
extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv);

View File

@ -1,971 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ void v_tally(int& vflag_global,int& vflag_atom,int& n, int *list, ENERGY_FLOAT total, ENERGY_FLOAT *v)
{
/*if(vflag_global)
{
ENERGY_FLOAT fraction = n/total;
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
*shared += fraction*v[0]; shared+=blockDim.x;
*shared += fraction*v[1]; shared+=blockDim.x;
*shared += fraction*v[2]; shared+=blockDim.x;
*shared += fraction*v[3]; shared+=blockDim.x;
*shared += fraction*v[4]; shared+=blockDim.x;
*shared += fraction*v[5];
}*/
if (vflag_atom) {
ENERGY_FLOAT fraction = ENERGY_F(1.0)/total;
for (int i = 0; i < n; i++) {
int m = list[i];
ENERGY_FLOAT* myvatom=&_vatom[m];
*myvatom += fraction*v[0]; myvatom+=_nmax;
*myvatom += fraction*v[1]; myvatom+=_nmax;
*myvatom += fraction*v[2]; myvatom+=_nmax;
*myvatom += fraction*v[3]; myvatom+=_nmax;
*myvatom += fraction*v[4]; myvatom+=_nmax;
*myvatom += fraction*v[5];
}
}
}
inline __device__ void minimum_image(X_FLOAT3& delta)
{
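// Branch-free nearest-image convention: each displacement component is
// shifted by one box length once it leaves [-prd/2, +prd/2]. In the
// triclinic branch z must be wrapped first, since the tilt factors h[3]
// (yz) and h[4] (xz) fold the z shift into y and x; the y wrap then folds
// h[5] (xy) into x before x itself is wrapped.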
if (_triclinic == 0) {
if (_periodicity[0]) {
delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] :
(delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0));
}
if (_periodicity[1]) {
delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] :
(delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0));
}
if (_periodicity[2]) {
delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] :
(delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0));
}
} else {
if (_periodicity[2]) {
delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] :
(delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0));
delta.y += delta.z < -X_F(0.5)*_prd[2] ? _h[3] :
(delta.z > X_F(0.5)*_prd[2] ?-_h[3] : X_F(0.0));
delta.x += delta.z < -X_F(0.5)*_prd[2] ? _h[4] :
(delta.z > X_F(0.5)*_prd[2] ?-_h[4] : X_F(0.0));
}
if (_periodicity[1]) {
delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] :
(delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0));
delta.x += delta.y < -X_F(0.5)*_prd[1] ? _h[5] :
(delta.y > X_F(0.5)*_prd[1] ?-_h[5] : X_F(0.0));
}
if (_periodicity[0]) {
delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] :
(delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0));
}
}
}
__global__ void FixShakeCuda_UnconstrainedUpdate_Kernel()
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i>=_nlocal) return;
X_FLOAT3 my_xshake = {X_F(0.0),X_F(0.0),X_F(0.0)};
if(_shake_flag[i])
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
X_FLOAT* my_x = _x + i;
V_FLOAT dtfmsq = _dtfsq;
if(_rmass_flag) dtfmsq*= V_F(1.0) / _rmass[i];
else dtfmsq*= V_F(1.0) / _mass[_type[i]];
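// unconstrained velocity-Verlet predictor: x' = x + dt*v + dtfmsq*f,
// with dtfmsq = _dtfsq/m playing the role of dt^2/(2m) (set host-side)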
my_xshake.x = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax;
my_xshake.y = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax;
my_xshake.z = *my_x + _dtv* *my_v + dtfmsq* *my_f;
}
_xshake[i]=my_xshake;
}
__device__ void FixShakeCuda_Shake2(int& vflag,int& vflag_atom,int& m)
{
int nlist,list[2];
ENERGY_FLOAT v[6];
X_FLOAT invmass0,invmass1;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m+_nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01;
X_FLOAT4 x_i0,x_i1;
x_i0=fetchXType(i0);
x_i1=fetchXType(i1);
r01.x = x_i0.x - x_i1.x;
r01.y = x_i0.y - x_i1.y;
r01.z = x_i0.z - x_i1.z;
minimum_image(r01);
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01;
X_FLOAT3 xs_i0=_xshake[i0];
X_FLOAT3 xs_i1=_xshake[i1];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
s01.z = xs_i0.z - xs_i1.z;
minimum_image(s01);
// scalar distances between atoms
X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z;
X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z;
// a,b,c = coeffs in quadratic equation for lamda
if (_rmass_flag) {
invmass0 = X_F(1.0)/_rmass[i0];
invmass1 = X_F(1.0)/_rmass[i1];
} else {
invmass0 = X_F(1.0)/_mass[static_cast <int> (x_i0.w)];
invmass1 = X_F(1.0)/_mass[static_cast <int> (x_i1.w)];
}
X_FLOAT a = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
X_FLOAT b = X_F(2.0) * (invmass0+invmass1) *
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z);
X_FLOAT c = s01sq - bond1*bond1;
// error check
X_FLOAT determ = b*b - X_F(4.0)*a*c;
if (determ < X_F(0.0)) {
_flag[0]++;
determ = X_F(0.0);
}
// exact quadratic solution for lamda
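// both roots of a*lamda^2 + b*lamda + c = 0 are formed via the sum identity
// lamda1 + lamda2 = -b/a; the root of smaller magnitude is kept, since the
// constraint multiplier should be the small physical correction rather than
// the bond-inverting one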
X_FLOAT lamda,lamda1,lamda2;
lamda1 = -b+_SQRT_(determ);
lamda2 = -lamda1 - X_F(2.0)*b;
lamda1 *= X_F(1.0) / (X_F(2.0)*a);
lamda2 *= X_F(1.0) / (X_F(2.0)*a);
lamda = (fabs(lamda1) <= fabs(lamda2))? lamda1 : lamda2;
// update forces if atom is owned by this processor
lamda*= X_F(1.0)/_dtfsq;
//attention: is the shake cluster <-> atom mapping unique?
nlist = 0;
if (i0 < _nlocal) {
_f[i0] += lamda*r01.x;
_f[i0+_nmax] += lamda*r01.y;
_f[i0+2*_nmax] += lamda*r01.z;
list[nlist++] = i0;
}
if (i1 < _nlocal) {
_f[i1] -= lamda*r01.x;
_f[i1+_nmax] -= lamda*r01.y;
_f[i1+2*_nmax] -= lamda*r01.z;
list[nlist++] = i1;
}
if (vflag||vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor=nlist;
v[0] = lamda*r01.x*r01.x; *shared = factor*v[0]; shared+=blockDim.x; //factor = 2*nlist/natoms (= nlist here): the 2 cancels the 0.5 the shared virial reducer applies, leaving the CPU code's n/total fraction
v[1] = lamda*r01.y*r01.y; *shared = factor*v[1]; shared+=blockDim.x;
v[2] = lamda*r01.z*r01.z; *shared = factor*v[2]; shared+=blockDim.x;
v[3] = lamda*r01.x*r01.y; *shared = factor*v[3]; shared+=blockDim.x;
v[4] = lamda*r01.x*r01.z; *shared = factor*v[4]; shared+=blockDim.x;
v[5] = lamda*r01.y*r01.z; *shared = factor*v[5]; shared+=blockDim.x;
v_tally(vflag,vflag_atom,nlist,list,2.0,v);
}
}
__device__ void FixShakeCuda_Shake3(int& vflag,int& vflag_atom,int& m)
{
int nlist,list[3];
ENERGY_FLOAT v[6];
X_FLOAT invmass0,invmass1,invmass2;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m+_nmax]];
int i2 = _map_array[_shake_atom[m+2*_nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01,r02;
X_FLOAT4 x_i0,x_i1,x_i2;
x_i0=fetchXType(i0);
x_i1=fetchXType(i1);
x_i2=fetchXType(i2);
r01.x = x_i0.x - x_i1.x;
r01.y = x_i0.y - x_i1.y;
r01.z = x_i0.z - x_i1.z;
minimum_image(r01);
r02.x = x_i0.x - x_i2.x;
r02.y = x_i0.y - x_i2.y;
r02.z = x_i0.z - x_i2.z;
minimum_image(r02);
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01,s02;
X_FLOAT3 xs_i0=_xshake[i0];
X_FLOAT3 xs_i1=_xshake[i1];
X_FLOAT3 xs_i2=_xshake[i2];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
s01.z = xs_i0.z - xs_i1.z;
minimum_image(s01);
s02.x = xs_i0.x - xs_i2.x;
s02.y = xs_i0.y - xs_i2.y;
s02.z = xs_i0.z - xs_i2.z;
minimum_image(s02);
// scalar distances between atoms
X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z;
X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z;
X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z;
X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z;
// a,b,c = coeffs in quadratic equation for lamda
if (_rmass_flag) {
invmass0 = X_F(1.0)/_rmass[i0];
invmass1 = X_F(1.0)/_rmass[i1];
invmass2 = X_F(1.0)/_rmass[i2];
} else {
invmass0 = X_F(1.0)/_mass[static_cast <int> (x_i0.w)];
invmass1 = X_F(1.0)/_mass[static_cast <int> (x_i1.w)];
invmass2 = X_F(1.0)/_mass[static_cast <int> (x_i2.w)];
}
X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) *
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z);
X_FLOAT a12 = X_F(2.0) * invmass0 *
(s01.x*r02.x + s01.y*r02.y + s01.z*r02.z);
X_FLOAT a21 = X_F(2.0) * invmass0 *
(s02.x*r01.x + s02.y*r01.y + s02.z*r01.z);
X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) *
(s02.x*r02.x + s02.y*r02.y + s02.z*r02.z);
// error check
X_FLOAT determ = a11*a22 - a12*a21;
if (determ == X_F(0.0)) _flag[0]++;
X_FLOAT determinv = X_F(1.0)/determ;
X_FLOAT a11inv = a22*determinv;
X_FLOAT a12inv = -a12*determinv;
X_FLOAT a21inv = -a21*determinv;
X_FLOAT a22inv = a11*determinv;
// quadratic correction coeffs
X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z);
X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq;
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102;
X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq;
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102;
// iterate until converged
X_FLOAT lamda01 = X_F(0.0);
X_FLOAT lamda02 = X_F(0.0);
int niter = 0;
int done = 0;
X_FLOAT quad1,quad2,b1,b2,lamda01_new,lamda02_new;
//maybe all running full loop?
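// __any() is a warp-wide vote: the loop keeps iterating until every lane of
// the warp has converged, so the warp stays in lockstep. A lane applies its
// last update on its first converged sweep (done==1) and freezes its
// multipliers once done reaches 2.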
while (__any(!done) && niter < _max_iter) {
quad1 = quad1_0101 * lamda01*lamda01 + quad1_0202 * lamda02*lamda02 +
quad1_0102 * lamda01*lamda02;
quad2 = quad2_0101 * lamda01*lamda01 + quad2_0202 * lamda02*lamda02 +
quad2_0102 * lamda01*lamda02;
b1 = bond1*bond1 - s01sq - quad1;
b2 = bond2*bond2 - s02sq - quad2;
lamda01_new = a11inv*b1 + a12inv*b2;
lamda02_new = a21inv*b1 + a22inv*b2;
done++;
done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done;
done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done;
lamda01 = done<2?lamda01_new:lamda01;
lamda02 = done<2?lamda02_new:lamda02;
niter++;
}
// update forces if atom is owned by this processor
lamda01 *= X_F(1.0)/_dtfsq;
lamda02 *= X_F(1.0)/_dtfsq;
//attention: is the shake cluster <-> atom mapping unique?
nlist = 0;
if (i0 < _nlocal) {
_f[i0] += lamda01*r01.x + lamda02*r02.x;
_f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y;
_f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z;
list[nlist++] = i0;
}
if (i1 < _nlocal) {
_f[i1] -= lamda01*r01.x;
_f[i1+_nmax] -= lamda01*r01.y;
_f[i1+2*_nmax] -= lamda01*r01.z;
list[nlist++] = i1;
}
if (i2 < _nlocal) {
_f[i2] -= lamda02*r02.x;
_f[i2+_nmax] -= lamda02*r02.y;
_f[i2+2*_nmax] -= lamda02*r02.z;
list[nlist++] = i2;
}
if (vflag||vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist;
v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x; *shared = factor*v[0]; shared+=blockDim.x; //factor = 2*nlist/natoms: the 2 cancels the 0.5 the shared virial reducer applies, leaving the CPU code's n/total fraction
v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y; *shared = factor*v[1]; shared+=blockDim.x;
v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z; *shared = factor*v[2]; shared+=blockDim.x;
v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y; *shared = factor*v[3]; shared+=blockDim.x;
v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z; *shared = factor*v[4]; shared+=blockDim.x;
v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z; *shared = factor*v[5]; shared+=blockDim.x;
v_tally(vflag,vflag_atom,nlist,list,3.0,v);
}
}
__device__ void FixShakeCuda_Shake4(int& vflag,int& vflag_atom,int& m)
{
int nlist,list[4];
ENERGY_FLOAT v[6];
X_FLOAT invmass0,invmass1,invmass2,invmass3;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m+_nmax]];
int i2 = _map_array[_shake_atom[m+2*_nmax]];
int i3 = _map_array[_shake_atom[m+3*_nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]];
X_FLOAT bond3 = _bond_distance[_shake_type[m+2*_nmax]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01,r02,r03;
X_FLOAT4 x_i0,x_i1,x_i2,x_i3;
x_i0=fetchXType(i0);
x_i1=fetchXType(i1);
x_i2=fetchXType(i2);
x_i3=fetchXType(i3);
r01.x = x_i0.x - x_i1.x;
r01.y = x_i0.y - x_i1.y;
r01.z = x_i0.z - x_i1.z;
minimum_image(r01);
r02.x = x_i0.x - x_i2.x;
r02.y = x_i0.y - x_i2.y;
r02.z = x_i0.z - x_i2.z;
minimum_image(r02);
r03.x = x_i0.x - x_i3.x;
r03.y = x_i0.y - x_i3.y;
r03.z = x_i0.z - x_i3.z;
minimum_image(r03);
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01,s02,s03;
X_FLOAT3 xs_i0=_xshake[i0];
X_FLOAT3 xs_i1=_xshake[i1];
X_FLOAT3 xs_i2=_xshake[i2];
X_FLOAT3 xs_i3=_xshake[i3];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
s01.z = xs_i0.z - xs_i1.z;
minimum_image(s01);
s02.x = xs_i0.x - xs_i2.x;
s02.y = xs_i0.y - xs_i2.y;
s02.z = xs_i0.z - xs_i2.z;
minimum_image(s02);
s03.x = xs_i0.x - xs_i3.x;
s03.y = xs_i0.y - xs_i3.y;
s03.z = xs_i0.z - xs_i3.z;
minimum_image(s03);
// scalar distances between atoms
X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z;
X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z;
X_FLOAT r03sq = r03.x*r03.x + r03.y*r03.y + r03.z*r03.z;
X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z;
X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z;
X_FLOAT s03sq = s03.x*s03.x + s03.y*s03.y + s03.z*s03.z;
// a,b,c = coeffs in quadratic equation for lamda
if (_rmass_flag) {
invmass0 = X_F(1.0)/_rmass[i0];
invmass1 = X_F(1.0)/_rmass[i1];
invmass2 = X_F(1.0)/_rmass[i2];
invmass3 = X_F(1.0)/_rmass[i3];
} else {
invmass0 = X_F(1.0)/_mass[static_cast <int> (x_i0.w)];
invmass1 = X_F(1.0)/_mass[static_cast <int> (x_i1.w)];
invmass2 = X_F(1.0)/_mass[static_cast <int> (x_i2.w)];
invmass3 = X_F(1.0)/_mass[static_cast <int> (x_i3.w)];
}
X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) *
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z);
X_FLOAT a12 = X_F(2.0) * invmass0 *
(s01.x*r02.x + s01.y*r02.y + s01.z*r02.z);
X_FLOAT a13 = X_F(2.0) * invmass0 *
(s01.x*r03.x + s01.y*r03.y + s01.z*r03.z);
X_FLOAT a21 = X_F(2.0) * invmass0 *
(s02.x*r01.x + s02.y*r01.y + s02.z*r01.z);
X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) *
(s02.x*r02.x + s02.y*r02.y + s02.z*r02.z);
X_FLOAT a23 = X_F(2.0) * (invmass0) *
(s02.x*r03.x + s02.y*r03.y + s02.z*r03.z);
X_FLOAT a31 = X_F(2.0) * (invmass0) *
(s03.x*r01.x + s03.y*r01.y + s03.z*r01.z);
X_FLOAT a32 = X_F(2.0) * (invmass0) *
(s03.x*r02.x + s03.y*r02.y + s03.z*r02.z);
X_FLOAT a33 = X_F(2.0) * (invmass0+invmass3) *
(s03.x*r03.x + s03.y*r03.y + s03.z*r03.z);
// error check
X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 -
a11*a23*a32 - a12*a21*a33 - a13*a22*a31;
if (determ == X_F(0.0)) _flag[0]++;
X_FLOAT determinv = X_F(1.0)/determ;
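// closed-form 3x3 inverse via cofactors (adjugate over the determinant):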
X_FLOAT a11inv = determinv * (a22*a33 - a23*a32);
X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32);
X_FLOAT a13inv = determinv * (a12*a23 - a13*a22);
X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31);
X_FLOAT a22inv = determinv * (a11*a33 - a13*a31);
X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21);
X_FLOAT a31inv = determinv * (a21*a32 - a22*a31);
X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31);
X_FLOAT a33inv = determinv * (a11*a22 - a12*a21);
// quadratic correction coeffs
X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z);
X_FLOAT r0103 = (r01.x*r03.x + r01.y*r03.y + r01.z*r03.z);
X_FLOAT r0203 = (r02.x*r03.x + r02.y*r03.y + r02.z*r03.z);
X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq;
X_FLOAT quad1_0303 = invmass0*invmass0 * r03sq;
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102;
X_FLOAT quad1_0103 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0103;
X_FLOAT quad1_0203 = X_F(2.0) * invmass0*invmass0 * r0203;
X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq;
X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
X_FLOAT quad2_0303 = invmass0*invmass0 * r03sq;
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102;
X_FLOAT quad2_0103 = X_F(2.0) * invmass0*invmass0 * r0103;
X_FLOAT quad2_0203 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0203;
X_FLOAT quad3_0101 = invmass0*invmass0 * r01sq;
X_FLOAT quad3_0202 = invmass0*invmass0 * r02sq;
X_FLOAT quad3_0303 = (invmass0+invmass3)*(invmass0+invmass3) * r03sq;
X_FLOAT quad3_0102 = X_F(2.0) * invmass0*invmass0 * r0102;
X_FLOAT quad3_0103 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0103;
X_FLOAT quad3_0203 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0203;
// iterate until converged
X_FLOAT lamda01 = X_F(0.0);
X_FLOAT lamda02 = X_F(0.0);
X_FLOAT lamda03 = X_F(0.0);
int niter = 0;
int done = 0;
X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda03_new;
//maybe all running full loop?
while (__any(!done) && niter < _max_iter) {
quad1 = quad1_0101 * lamda01*lamda01 +
quad1_0202 * lamda02*lamda02 +
quad1_0303 * lamda03*lamda03 +
quad1_0102 * lamda01*lamda02 +
quad1_0103 * lamda01*lamda03 +
quad1_0203 * lamda02*lamda03;
quad2 = quad2_0101 * lamda01*lamda01 +
quad2_0202 * lamda02*lamda02 +
quad2_0303 * lamda03*lamda03 +
quad2_0102 * lamda01*lamda02 +
quad2_0103 * lamda01*lamda03 +
quad2_0203 * lamda02*lamda03;
quad3 = quad3_0101 * lamda01*lamda01 +
quad3_0202 * lamda02*lamda02 +
quad3_0303 * lamda03*lamda03 +
quad3_0102 * lamda01*lamda02 +
quad3_0103 * lamda01*lamda03 +
quad3_0203 * lamda02*lamda03;
b1 = bond1*bond1 - s01sq - quad1;
b2 = bond2*bond2 - s02sq - quad2;
b3 = bond3*bond3 - s03sq - quad3;
lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3;
lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3;
lamda03_new = a31inv*b1 + a32inv*b2 + a33inv*b3;
done++;
done = (fabs(lamda01_new-lamda01) > _tolerance)? 0:done;
done = (fabs(lamda02_new-lamda02) > _tolerance)? 0:done;
done = (fabs(lamda03_new-lamda03) > _tolerance)? 0:done;
lamda01 = done<2?lamda01_new:lamda01;
lamda02 = done<2?lamda02_new:lamda02;
lamda03 = done<2?lamda03_new:lamda03;
niter++;
}
// update forces if atom is owned by this processor
lamda01 *= X_F(1.0)/_dtfsq;
lamda02 *= X_F(1.0)/_dtfsq;
lamda03 *= X_F(1.0)/_dtfsq;
//attention: is the shake cluster <-> atom mapping unique?
nlist = 0;
if (i0 < _nlocal) {
_f[i0] += lamda01*r01.x + lamda02*r02.x + lamda03*r03.x;
_f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y + lamda03*r03.y;
_f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z + lamda03*r03.z;
list[nlist++] = i0;
}
if (i1 < _nlocal) {
_f[i1] -= lamda01*r01.x;
_f[i1+_nmax] -= lamda01*r01.y;
_f[i1+2*_nmax] -= lamda01*r01.z;
list[nlist++] = i1;
}
if (i2 < _nlocal) {
_f[i2] -= lamda02*r02.x;
_f[i2+_nmax] -= lamda02*r02.y;
_f[i2+2*_nmax] -= lamda02*r02.z;
list[nlist++] = i2;
}
if (i3 < _nlocal) {
_f[i3] -= lamda03*r03.x;
_f[i3+_nmax] -= lamda03*r03.y;
_f[i3+2*_nmax] -= lamda03*r03.z;
list[nlist++] = i3;
}
if (vflag||vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor=X_F(2.0)/X_F(4.0)*nlist;
v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda03*r03.x*r03.x; *shared = factor*v[0]; shared+=blockDim.x; //factor = 2*nlist/natoms: the 2 cancels the 0.5 the shared virial reducer applies, leaving the CPU code's n/total fraction
v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda03*r03.y*r03.y; *shared = factor*v[1]; shared+=blockDim.x;
v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda03*r03.z*r03.z; *shared = factor*v[2]; shared+=blockDim.x;
v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda03*r03.x*r03.y; *shared = factor*v[3]; shared+=blockDim.x;
v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda03*r03.x*r03.z; *shared = factor*v[4]; shared+=blockDim.x;
v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda03*r03.y*r03.z; *shared = factor*v[5]; shared+=blockDim.x;
v_tally(vflag,vflag_atom,nlist,list,4.0,v);
}
}
__device__ void FixShakeCuda_Shake3Angle(int& vflag,int& vflag_atom,int& m)
{
int nlist,list[3];
ENERGY_FLOAT v[6];
X_FLOAT invmass0,invmass1,invmass2;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m+_nmax]];
int i2 = _map_array[_shake_atom[m+2*_nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]];
X_FLOAT bond12 = _angle_distance[_shake_type[m+2*_nmax]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01,r02,r12;
X_FLOAT4 x_i0,x_i1,x_i2;
x_i0=fetchXType(i0);
x_i1=fetchXType(i1);
x_i2=fetchXType(i2);
r01.x = x_i0.x - x_i1.x;
r01.y = x_i0.y - x_i1.y;
r01.z = x_i0.z - x_i1.z;
minimum_image(r01);
r02.x = x_i0.x - x_i2.x;
r02.y = x_i0.y - x_i2.y;
r02.z = x_i0.z - x_i2.z;
minimum_image(r02);
r12.x = x_i1.x - x_i2.x;
r12.y = x_i1.y - x_i2.y;
r12.z = x_i1.z - x_i2.z;
minimum_image(r12);
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01,s02,s12;
X_FLOAT3 xs_i0=_xshake[i0];
X_FLOAT3 xs_i1=_xshake[i1];
X_FLOAT3 xs_i2=_xshake[i2];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
s01.z = xs_i0.z - xs_i1.z;
minimum_image(s01);
s02.x = xs_i0.x - xs_i2.x;
s02.y = xs_i0.y - xs_i2.y;
s02.z = xs_i0.z - xs_i2.z;
minimum_image(s02);
s12.x = xs_i1.x - xs_i2.x;
s12.y = xs_i1.y - xs_i2.y;
s12.z = xs_i1.z - xs_i2.z;
minimum_image(s12);
// scalar distances between atoms
X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z;
X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z;
X_FLOAT r12sq = r12.x*r12.x + r12.y*r12.y + r12.z*r12.z;
X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z;
X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z;
X_FLOAT s12sq = s12.x*s12.x + s12.y*s12.y + s12.z*s12.z;
// a,b,c = coeffs in quadratic equation for lamda
if (_rmass_flag) {
invmass0 = X_F(1.0)/_rmass[i0];
invmass1 = X_F(1.0)/_rmass[i1];
invmass2 = X_F(1.0)/_rmass[i2];
} else {
invmass0 = X_F(1.0)/_mass[static_cast <int> (x_i0.w)];
invmass1 = X_F(1.0)/_mass[static_cast <int> (x_i1.w)];
invmass2 = X_F(1.0)/_mass[static_cast <int> (x_i2.w)];
}
X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) *
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z);
X_FLOAT a12 = X_F(2.0) * invmass0 *
(s01.x*r02.x + s01.y*r02.y + s01.z*r02.z);
X_FLOAT a13 = - X_F(2.0) * invmass1 *
(s01.x*r12.x + s01.y*r12.y + s01.z*r12.z);
X_FLOAT a21 = X_F(2.0) * invmass0 *
(s02.x*r01.x + s02.y*r01.y + s02.z*r01.z);
X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) *
(s02.x*r02.x + s02.y*r02.y + s02.z*r02.z);
X_FLOAT a23 = X_F(2.0) * invmass2 *
(s02.x*r12.x + s02.y*r12.y + s02.z*r12.z);
X_FLOAT a31 = - X_F(2.0) * invmass1 *
(s12.x*r01.x + s12.y*r01.y + s12.z*r01.z);
X_FLOAT a32 = X_F(2.0) * invmass2 *
(s12.x*r02.x + s12.y*r02.y + s12.z*r02.z);
X_FLOAT a33 = X_F(2.0) * (invmass1+invmass2) *
(s12.x*r12.x + s12.y*r12.y + s12.z*r12.z);
// inverse of matrix
X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 -
a11*a23*a32 - a12*a21*a33 - a13*a22*a31;
if (determ == X_F(0.0)) _flag[0]++;
X_FLOAT determinv = X_F(1.0)/determ;
X_FLOAT a11inv = determinv * (a22*a33 - a23*a32);
X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32);
X_FLOAT a13inv = determinv * (a12*a23 - a13*a22);
X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31);
X_FLOAT a22inv = determinv * (a11*a33 - a13*a31);
X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21);
X_FLOAT a31inv = determinv * (a21*a32 - a22*a31);
X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31);
X_FLOAT a33inv = determinv * (a11*a22 - a12*a21);
// quadratic correction coeffs
X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z);
X_FLOAT r0112 = (r01.x*r12.x + r01.y*r12.y + r01.z*r12.z);
X_FLOAT r0212 = (r02.x*r12.x + r02.y*r12.y + r02.z*r12.z);
X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq;
X_FLOAT quad1_1212 = invmass1*invmass1 * r12sq;
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102;
X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0+invmass1)*invmass1 * r0112;
X_FLOAT quad1_0212 = - X_F(2.0) * invmass0*invmass1 * r0212;
X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq;
X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
X_FLOAT quad2_1212 = invmass2*invmass2 * r12sq;
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102;
X_FLOAT quad2_0112 = X_F(2.0) * invmass0*invmass2 * r0112;
X_FLOAT quad2_0212 = X_F(2.0) * (invmass0+invmass2)*invmass2 * r0212;
X_FLOAT quad3_0101 = invmass1*invmass1 * r01sq;
X_FLOAT quad3_0202 = invmass2*invmass2 * r02sq;
X_FLOAT quad3_1212 = (invmass1+invmass2)*(invmass1+invmass2) * r12sq;
X_FLOAT quad3_0102 = - X_F(2.0) * invmass1*invmass2 * r0102;
X_FLOAT quad3_0112 = - X_F(2.0) * (invmass1+invmass2)*invmass1 * r0112;
X_FLOAT quad3_0212 = X_F(2.0) * (invmass1+invmass2)*invmass2 * r0212;
// iterate until converged
X_FLOAT lamda01 = X_F(0.0);
X_FLOAT lamda02 = X_F(0.0);
X_FLOAT lamda12 = X_F(0.0);
int niter = 0;
int done = 0;
X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new;
//maybe all running full loop?
while (__any(!done) && niter < _max_iter) {
quad1 = quad1_0101 * lamda01*lamda01 +
quad1_0202 * lamda02*lamda02 +
quad1_1212 * lamda12*lamda12 +
quad1_0102 * lamda01*lamda02 +
quad1_0112 * lamda01*lamda12 +
quad1_0212 * lamda02*lamda12;
quad2 = quad2_0101 * lamda01*lamda01 +
quad2_0202 * lamda02*lamda02 +
quad2_1212 * lamda12*lamda12 +
quad2_0102 * lamda01*lamda02 +
quad2_0112 * lamda01*lamda12 +
quad2_0212 * lamda02*lamda12;
quad3 = quad3_0101 * lamda01*lamda01 +
quad3_0202 * lamda02*lamda02 +
quad3_1212 * lamda12*lamda12 +
quad3_0102 * lamda01*lamda02 +
quad3_0112 * lamda01*lamda12 +
quad3_0212 * lamda02*lamda12;
b1 = bond1*bond1 - s01sq - quad1;
b2 = bond2*bond2 - s02sq - quad2;
b3 = bond12*bond12 - s12sq - quad3;
lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3;
lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3;
lamda12_new = a31inv*b1 + a32inv*b2 + a33inv*b3;
done++;
done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done;
done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done;
done = (fabs(lamda12_new-lamda12) > _tolerance)?0: done;
lamda01 = done<2?lamda01_new:lamda01;
lamda02 = done<2?lamda02_new:lamda02;
lamda12 = done<2?lamda12_new:lamda12;
niter++;
}
// update forces if atom is owned by this processor
lamda01 *= X_F(1.0)/_dtfsq;
lamda02 *= X_F(1.0)/_dtfsq;
lamda12 *= X_F(1.0)/_dtfsq;
//attention: is the shake cluster <-> atom mapping unique?
nlist = 0;
if (i0 < _nlocal) {
_f[i0] += lamda01*r01.x + lamda02*r02.x;
_f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y;
_f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z;
list[nlist++] = i0;
}
if (i1 < _nlocal) {
_f[i1] -= lamda01*r01.x - lamda12*r12.x;
_f[i1+_nmax] -= lamda01*r01.y - lamda12*r12.y;
_f[i1+2*_nmax] -= lamda01*r01.z - lamda12*r12.z;
list[nlist++] = i1;
}
if (i2 < _nlocal) {
_f[i2] -= lamda02*r02.x + lamda12*r12.x;
_f[i2+_nmax] -= lamda02*r02.y + lamda12*r12.y;
_f[i2+2*_nmax] -= lamda02*r02.z + lamda12*r12.z;
list[nlist++] = i2;
}
if (vflag||vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist;
v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda12*r12.x*r12.x; *shared = factor*v[0]; shared+=blockDim.x; //factor = 2*nlist/natoms: the 2 cancels the 0.5 the shared virial reducer applies, leaving the CPU code's n/total fraction
v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda12*r12.y*r12.y; *shared = factor*v[1]; shared+=blockDim.x;
v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda12*r12.z*r12.z; *shared = factor*v[2]; shared+=blockDim.x;
v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda12*r12.x*r12.y; *shared = factor*v[3]; shared+=blockDim.x;
v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda12*r12.x*r12.z; *shared = factor*v[4]; shared+=blockDim.x;
v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda12*r12.y*r12.z; *shared = factor*v[5]; shared+=blockDim.x;
v_tally(vflag,vflag_atom,nlist,list,3.0,v);
}
}
__global__ void FixShakeCuda_Shake_Kernel(int vflag,int vflag_atom,int* list,int nlist)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<nlist)
{
int m = list[i];
int sflag = _shake_flag[m];
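// _shake_flag encodes the cluster type: 2, 3 and 4 are bond clusters with
// that many atoms; the remaining nonzero value (1 in fix shake) marks a
// 3-atom cluster whose 1-3 distance is constrained through an angle.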
if (sflag == 2) FixShakeCuda_Shake2(vflag,vflag_atom,m);
else if(sflag == 3) FixShakeCuda_Shake3(vflag,vflag_atom,m);
else if(sflag == 4) FixShakeCuda_Shake4(vflag,vflag_atom,m);
else FixShakeCuda_Shake3Angle(vflag,vflag_atom,m);
}
else
{
ENERGY_FLOAT* shared=&sharedmem[threadIdx.x];
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0);
}
if(vflag)
{
__syncthreads();
int eflag=0;
PairVirialCompute_A_Kernel(eflag,vflag);
}
}
__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>_nmax) _flag[0]=1;
X_FLOAT3 xs=_xshake[j];
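// the comm buffer is struct-of-arrays (all x, then all y, then all z), so
// consecutive threads write contiguous, coalesced words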
((X_FLOAT*) _buffer)[i]=xs.x + dx;
((X_FLOAT*) _buffer)[i+1*n] = xs.y + dy;
((X_FLOAT*) _buffer)[i+2*n] = xs.z + dz;
}
}
__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>_nmax) _flag[0]=1;
X_FLOAT3 xs=_xshake[j];
xs.x += dx;
xs.y += dy;
xs.z += dz;
_xshake[i+first]=xs;
}
}
__global__ void FixShakeCuda_UnpackComm_Kernel(int n,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
X_FLOAT3 xs;
xs.x=((X_FLOAT*) _buffer)[i];
xs.y=((X_FLOAT*) _buffer)[i+1*n];
xs.z=((X_FLOAT*) _buffer)[i+2*n];
_xshake[i+first]=xs;
}
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_berendsen_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_berendsen_cuda_cu.h"
#include "fix_temp_berendsen_cuda_kernel.cu"
void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
}
void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
}
void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor)
{
V_FLOAT factor=afactor;
if(sdata->atom.update_nmax)
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixTempBerendsenCuda_EndOfStep_Kernel<<<grid, threads,0>>> (groupbit,factor);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixTempBerendsenCuda_PostForce: fix add_force post_force compute Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor);

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor)
{
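// factor is precomputed on the host; for Berendsen weak coupling it is the
// usual lambda = sqrt(1 + (dt/t_period)*(T_target/T_current - 1))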
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
_v[i]*=factor;
_v[i+_nmax]*=factor;
_v[i+2*_nmax]*=factor;
}
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_rescale_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_rescale_cuda_cu.h"
#include "fix_temp_rescale_cuda_kernel.cu"
void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
}
void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
}
void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor)
{
V_FLOAT factor=afactor;
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
//cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixTempRescaleCuda_EndOfStep_Kernel<<<grid, threads,0>>> (groupbit,factor);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleCuda_PostForce: fix add_force post_force compute Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor);

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
_v[i]*=factor;
_v[i+_nmax]*=factor;
_v[i+2*_nmax]*=factor;
}
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_rescale_limit_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_rescale_limit_cuda_cu.h"
#include "fix_temp_rescale_limit_cuda_kernel.cu"
void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
}
void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
}
void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit)
{
V_FLOAT factor=afactor;
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
//cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel<<<grid, threads,0>>> (groupbit,factor,limit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleLimitCuda_PostForce: fix add_force post_force compute Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit);

View File

@ -1,43 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor,V_FLOAT limit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
V_FLOAT vx = _v[i];
V_FLOAT vy = _v[i+_nmax];
V_FLOAT vz = _v[i+2*_nmax];
vx*=factor;
vy*=factor;
vz*=factor;
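// clamp each rescaled component to [-limit, +limit], preserving its sign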
_v[i]=vx>0?min(vx,limit):max(vx,-limit);
_v[i+_nmax]=vy>0?min(vy,limit):max(vy,-limit);
_v[i+2*_nmax]=vz>0?min(vz,limit):max(vz,-limit);
}
}

View File

@ -1,66 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_viscous_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_viscous_cuda_cu.h"
#include "fix_viscous_cuda_kernel.cu"
void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
}
void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixViscousCuda_UpdateNmax(sdata);
}
void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma)
{
if(sdata->atom.update_nmax)
Cuda_FixViscousCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal,0);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixViscousCuda_PostForce_Kernel<<<grid, threads,0>>> (groupbit,(F_FLOAT*) gamma);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma);

View File

@ -1,35 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit,F_FLOAT* gamma)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
F_FLOAT drag = gamma[_type[i]];
_f[i] -= drag*_v[i];
_f[i+1*_nmax] -= drag*_v[i+1*_nmax];
_f[i+2*_nmax] -= drag*_v[i+2*_nmax];
}
}


@@ -1,367 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#include <time.h>
#define MY_PREFIX neighbor
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "cuda_wrapper_cu.h"
#define _cutneighsq MY_AP(cutneighsq)
#define _ex_type MY_AP(ex_type)
#define _nex_type MY_AP(nex_type)
#define _ex1_bit MY_AP(ex1_bit)
#define _ex2_bit MY_AP(ex2_bit)
#define _nex_group MY_AP(nex_group)
#define _ex_mol_bit MY_AP(ex_mol_bit)
#define _nex_mol MY_AP(nex_mol)
__device__ __constant__ CUDA_FLOAT* _cutneighsq;
__device__ __constant__ int* _ex_type;
__device__ __constant__ int _nex_type;
__device__ __constant__ int* _ex1_bit;
__device__ __constant__ int* _ex2_bit;
__device__ __constant__ int _nex_group;
__device__ __constant__ int* _ex_mol_bit;
__device__ __constant__ int _nex_mol;
#include "neighbor_cu.h"
#include "neighbor_kernel.cu"
void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed");
int size=(unsigned)(sizeof(int)*20+sneighlist->bin_dim[0]*sneighlist->bin_dim[1]*sneighlist->bin_dim[2]*(sizeof(int)+sneighlist->bin_nmax*3*sizeof(CUDA_FLOAT)));
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
if(sdata->buffer!=NULL) CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer=CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
CUT_CHECK_ERROR("Cuda_PairLJCutCuda: updateBuffer failed");
}
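// Sort all atoms (local + ghost) into a regular grid of bins. rez_bin_size
// holds the reciprocal bin edge lengths; the -4.0 terms and the +2 offsets in
// the kernel leave a border of two bins on each side of the subdomain for
// ghost atoms. Returns nonzero if a bin overflowed bin_nmax (bin_extraspace
// is then increased so the caller can rebuild with larger bins).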
int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
if(sdata->buffer_new)
Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
// initialize only on first call
CUDA_FLOAT rez_bin_size[3] =
{
(1.0 * sneighlist->bin_dim[0]-4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
(1.0 * sneighlist->bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
(1.0 * sneighlist->bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
};
/*static*/ short init = 0; // not static: the symbols below can change between calls
if(! init)
{
init = 1;
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3);
}
int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
timespec starttime,endtime;
clock_gettime(CLOCK_REALTIME,&starttime);
cudaMemset((int*) (sdata->buffer),0,sizeof(int)*(20+(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2]))+3*sizeof(CUDA_FLOAT)*(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2])*(sneighlist->bin_nmax));
Binning_Kernel<<<grid, threads>>> (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],sneighlist->bin_dim[2],rez_bin_size[0],rez_bin_size[1],rez_bin_size[2]);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&endtime);
sdata->cuda_timings.neigh_bin+=
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
int binning_error;
cudaMemcpy((void*) &binning_error,(void*) sdata->buffer,1*sizeof(int),cudaMemcpyDeviceToHost);
if(binning_error)
{
sneighlist->bin_extraspace+=0.05;
}
else
{
MYDBG(printf("CUDA: binning successful\n");)
}
CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
return binning_error;
}
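// Build a full neighbor list (both i,j and j,i stored) from the binned atoms.
// If all type-pair cutoffs are equal, the common squared cutoff is passed to
// the kernel as globcutoff so the per-pair cutoff table lookup can be skipped;
// globcutoff stays negative otherwise.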
int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
//Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
CUDA_FLOAT globcutoff=-1.0;
/*static*/ short init=0; // not static: globcutoff must be recomputed on every call
if(! init)
{
init = 1;
// !! LAMMPS indexes atom types starting with 1 !!
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
//printf("Allocate: %i\n",nx);
sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx);
if(sneighlist->cutneighsq)
{
int cutoffsdiffer=0;
double cutoff0 = sneighlist->cutneighsq[1][1];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]);
if((sneighlist->cutneighsq[i][j]-cutoff0)*(sneighlist->cutneighsq[i][j]-cutoff0)>1e-6) cutoffsdiffer++;
}
}
if(!cutoffsdiffer) globcutoff=(CUDA_FLOAT) cutoff0;
}
else
{
MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); )
return 0;
}
int size = 100;
if(sdata->buffersize < size)
{
MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); )
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;
MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); )
}
CudaWrapper_UploadCudaData(acutneighsq,sneighlist->cu_cutneighsq,nx);
free(acutneighsq); // host staging copy no longer needed
cudaMemcpyToSymbol(MY_CONST(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(special_flag) , sdata->atom.special_flag , 4*sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(molecular) , & sdata->atom.molecular , sizeof(int) );
}
cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
//cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(special) , & sdata->atom.special .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(maxspecial) , & sdata->atom.maxspecial , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ex_type) , & sneighlist->ex_type.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ex1_bit) , & sneighlist->ex1_bit.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ex2_bit) , & sneighlist->ex2_bit.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ex_mol_bit) , & sneighlist->ex_mol_bit.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(nex_type) , & sneighlist->nex_type, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nex_group) , & sneighlist->nex_group, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nex_mol) , & sneighlist->nex_mol, sizeof(int) );
if(sdata->overlap_comm)
{
cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) );
}
//dim3 threads(sneighlist->bin_nmax,1,1);
dim3 threads(MIN(128,sneighlist->bin_nmax),1,1);
dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1],sneighlist->bin_dim[2],1);
//printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax);
int buffer[20];
buffer[0]=1;
buffer[1]=0;
CudaWrapper_UploadCudaData( buffer, sdata->buffer, 2*sizeof(int));
CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
//cudaMemset(sdata->debugdata,0,100*sizeof(int));
unsigned int shared_size=(sizeof(int)+3*sizeof(CUDA_FLOAT))*threads.x;
MYDBG(printf("Configuration: %i %i %i %u %i\n",grid.x,grid.y,threads.x,shared_size,sneighlist->bin_nmax);)
//shared_size=2056;
timespec starttime,endtime;
clock_gettime(CLOCK_REALTIME,&starttime);
//for(int i=0;i<100;i++)
{
if(sdata->overlap_comm)
NeighborBuildFullBin_OverlapComm_Kernel<<<grid,threads,shared_size>>>
(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom);
else
{
int exclude=sneighlist->nex_mol|sneighlist->nex_group|sneighlist->nex_type;
if(exclude)
NeighborBuildFullBin_Kernel<1><<<grid,threads,shared_size>>>
(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom,sdata->pair.neighall);
else
NeighborBuildFullBin_Kernel<0><<<grid,threads,shared_size>>>
(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom,sdata->pair.neighall);
}
//NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_FLOAT))*threads.x+sizeof(int)>>>
// (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
clock_gettime(CLOCK_REALTIME,&endtime);
sdata->cuda_timings.neigh_build+=
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
//dim3 threads,grid;
CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int));
if(buffer[0]>=0&&true&&sdata->atom.molecular)
{
//printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall);
clock_gettime(CLOCK_REALTIME,&starttime);
int3 layout=getgrid(sdata->atom.nlocal,0,512);
threads.x = layout.z; threads.y = 1; threads.z = 1;
grid.x = layout.x; grid.y = layout.y; grid.z = 1;
FindSpecial<<<grid,threads>>>(sdata->pair.use_block_per_atom);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed");
clock_gettime(CLOCK_REALTIME,&endtime);
sdata->cuda_timings.neigh_special+=
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
}
}
//printf("Neightime: %lf\n",sdata->cuda_timings.test1);
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
//CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int));
MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");)
return buffer[0];
}
int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");)
// initialize only on first call
/*static*/ short init=0;
if(! init)
{
init = 1;
// !! LAMMPS indexes atom types starting with 1 !!
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2)
printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u "
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
if(sneighlist->cutneighsq)
{
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]);
//printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
}
}
}
else
{
MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); )
return 0;
}
int size = 100;
if(sdata->buffersize < size)
{
MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); )
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;
MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); )
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer , sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) );
// _cutneighsq is a device pointer symbol, so upload the table and store its address
sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx);
CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
cudaMemcpyToSymbol(MY_CONST(cutneighsq)   , & sneighlist->cu_cutneighsq     , sizeof(CUDA_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) );
free(acutneighsq);
}
int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int return_value = 1;
CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int));
CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed");
NeighborBuildFullNsq_Kernel<<<grid, threads>>> ();
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
int buffer[20];
CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int)*20);
MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");)
return buffer[0];
}


@@ -1,32 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef NEIGHBOR_CU_H_
#define NEIGHBOR_CU_H_
#include "cuda_shared.h"
extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
#endif /*NEIGHBOR_CU_H_*/


@@ -1,626 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define SBBITS 30
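// Neighbor indices carry special-bond information in their two topmost bits;
// SBBITS is the shift used by FindSpecial below to encode the 1-2/1-3/1-4
// level into a neighbor index (j ^ (which << SBBITS)).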
__global__ void Binning_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,int bin_dim_z,
CUDA_FLOAT rez_bin_size_x,CUDA_FLOAT rez_bin_size_y,CUDA_FLOAT rez_bin_size_z)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
/*int* bin_count=(int*) _buffer;
bin_count=bin_count+20;
CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer;
binned_x = &binned_x[2];
int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax];
if(i < _nall)
{
// copy atom position from global device memory to local register
// in these three steps to get as much coalesced access as possible
X_FLOAT* my_x = _x + i;
CUDA_FLOAT x_i = *my_x; my_x += _nmax;
CUDA_FLOAT y_i = *my_x; my_x += _nmax;
CUDA_FLOAT z_i = *my_x;
// calculate flat bin index
int bx=__float2int_rd(rez_bin_size_x * (x_i - _sublo[0]))+2;
int by=__float2int_rd(rez_bin_size_y * (y_i - _sublo[1]))+2;
int bz=__float2int_rd(rez_bin_size_z * (z_i - _sublo[2]))+2;
bx-=bx*negativCUDA(1.0f*bx);
bx-=(bx-bin_dim_x+1)*negativCUDA(1.0f*bin_dim_x-1.0f-1.0f*bx);
by-=by*negativCUDA(1.0f*by);
by-=(by-bin_dim_y+1)*negativCUDA(1.0f*bin_dim_y-1.0f-1.0f*by);
bz-=bz*negativCUDA(1.0f*bz);
bz-=(bz-bin_dim_z+1)*negativCUDA(1.0f*bin_dim_z-1.0f-1.0f*bz);
const unsigned j = bin_dim_z * ( bin_dim_y *bx+by)+bz;
// add new atom to bin, get bin-array position
const unsigned k = atomicAdd(& bin_count[j], 1);
if(k < bin_nmax)
{
binned_id [bin_nmax * j + k] = i;
binned_x [3 * bin_nmax * j + k] = x_i;
binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i;
binned_x [3 * bin_nmax * j + k + 2*bin_nmax] = z_i;
}
else
{ // normally, this should not happen:
atomicAdd((int*) _buffer, 1);
MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j); )
}
}
}
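// Returns 1 if pair (i,j) is excluded from the neighbor list by a type, group,
// or molecule exclusion rule; the device-side analogue of the exclusion test
// in LAMMPS' CPU Neighbor class.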
__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
{
int m;
if (_nex_type)
if( _ex_type[itype * _cuda_ntypes + jtype]) return 1;
if (_nex_group) {
for (m = 0; m < _nex_group; m++) {
if (_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1;
if (_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1;
}
}
if (_nex_mol) {
if(_molecule[i] == _molecule[j])
for (m = 0; m < _nex_mol; m++)
if (_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m] ) return 1;
}
return 0;
}
extern __shared__ CUDA_FLOAT shared[];
__device__ inline int find_special(int3 &n, int* list,int & tag,int3 flag)
{
int k=n.z;
for (int l = 0; l < n.z; l++) k = ((list[l] == tag)?l:k);
return k<n.x ? flag.x : (k<n.y? flag.y : (k<n.z?flag.z:0));
}
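// Full-bin neighbor build, one thread block per bin: atoms of the home bin are
// processed in tiles of blockDim.x threads, and candidate neighbors from the
// home bin and the 26 surrounding bins are staged through shared memory
// (other_x / other_id) one tile at a time.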
template <const unsigned int exclude>
__global__ void NeighborBuildFullBin_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style, bool neighall)
{
int natoms = neighall?_nall:_nlocal;
//const bool domol=false;
int bin_dim_z=gridDim.y;
CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer;
binned_x = &binned_x[2];
int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax];
int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y;
int bin_x = blockIdx.x/bin_dim_y;
int bin_y = blockIdx.x-bin_x*bin_dim_y;
int bin_z = blockIdx.y;
int bin_c = bin_count[bin];
CUDA_FLOAT cut;
if(globcutoff>0)
cut = globcutoff;
int i=_nall;
CUDA_FLOAT* my_x;
CUDA_FLOAT x_i,y_i,z_i;
for(int actOffset=0; actOffset<bin_c; actOffset+=blockDim.x){
int actIdx=threadIdx.x+actOffset;
CUDA_FLOAT* other_x=shared;
int* other_id=(int*) &other_x[3*blockDim.x];
if(actIdx < bin_c)
{
i = binned_id[__mul24(bin,bin_nmax)+actIdx];
my_x = binned_x + __mul24(__mul24(bin,3),bin_nmax)+actIdx;
x_i = *my_x; my_x += bin_nmax;
y_i = *my_x; my_x += bin_nmax;
z_i = *my_x;
}
else
i=2*_nall;
__syncthreads();
int jnum=0;
int itype;
if(i<natoms)
{
jnum = 0;
_ilist[i]=i;
itype = _type[i];
}
//__syncthreads();
for(int otherActOffset=0; otherActOffset<bin_c; otherActOffset+=blockDim.x){
int otherActIdx=threadIdx.x+otherActOffset;
if(otherActIdx<bin_c)
{
if(otherActOffset==actOffset)
{
other_id[threadIdx.x]=i;
other_x[threadIdx.x] = x_i;
other_x[threadIdx.x+blockDim.x] = y_i;
other_x[threadIdx.x+2*blockDim.x] = z_i;
}
else
{
other_id[threadIdx.x] = binned_id[__mul24(bin,bin_nmax)+otherActIdx];
my_x = binned_x + __mul24(__mul24(bin,3),bin_nmax)+otherActIdx;
other_x[threadIdx.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+blockDim.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+__mul24(2,blockDim.x)] = *my_x;
}
}
__syncthreads();
int kk=threadIdx.x;
for(int k = 0; k < MIN(bin_c-otherActOffset,blockDim.x); ++k)
{
if(i<natoms)
{
kk++;
kk=kk<MIN(bin_c-otherActOffset,blockDim.x)?kk:0;
int j = other_id[kk];
if(exclude && exclusion(i,j,itype,_type[j])) continue;
if(globcutoff<0)
{
int jtype = _type[j];
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[kk];
CUDA_FLOAT dely = y_i - other_x[kk+blockDim.x];
CUDA_FLOAT delz = z_i - other_x[kk+2*blockDim.x];
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq <= cut && i != j)
{
if(jnum<_maxneighbors){
if(block_style)
_neighbors[i*_maxneighbors+jnum]= j;
else
_neighbors[i+jnum*natoms]= j;
}
++jnum;
}
}
}
__syncthreads();
}
for(int obin_x=bin_x-1;obin_x<bin_x+2;obin_x++)
for(int obin_y=bin_y-1;obin_y<bin_y+2;obin_y++)
for(int obin_z=bin_z-1;obin_z<bin_z+2;obin_z++)
{
if(obin_x<0||obin_y<0||obin_z<0) continue;
if(obin_x>=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue;
int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z;
if(other_bin==bin) continue;
int obin_c=bin_count[other_bin];
for(int otherActOffset=0; otherActOffset<obin_c; otherActOffset+=blockDim.x){
int otherActIdx=otherActOffset+threadIdx.x;
if(threadIdx.x < MIN(blockDim.x,obin_c-otherActOffset))
{
other_id[threadIdx.x] = binned_id[__mul24(other_bin,bin_nmax)+otherActIdx];
my_x = binned_x + __mul24(__mul24(other_bin,3),bin_nmax)+otherActIdx;
other_x[threadIdx.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+blockDim.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+2*blockDim.x] = *my_x;
}
__syncthreads();
for(int k = 0; k < MIN(blockDim.x,obin_c-otherActOffset); ++k)
{
if(i<natoms)
{
int j = other_id[k];
if(exclude && exclusion(i,j,itype,_type[j])) continue;
if(globcutoff<0)
{
int jtype = _type[j];
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[k];
CUDA_FLOAT dely = y_i - other_x[k+blockDim.x];
CUDA_FLOAT delz = z_i - other_x[k+2*blockDim.x];
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq <= cut && i != j)
{
if(jnum<_maxneighbors)
{
if(block_style)
_neighbors[i*_maxneighbors+jnum]= j;
else
_neighbors[i+jnum*natoms]= j;
}
++jnum;
}
}
}
__syncthreads();
}
}
if(jnum > _maxneighbors) ((int*)_buffer)[0] = -jnum;
if(i<natoms)
_numneigh[i] = jnum;
}
}
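// For molecular systems: scan each atom's neighbors for 1-2/1-3/1-4 special
// partners and, depending on _special_flag, either drop the neighbor (weight
// 0.0), keep it untouched (weight 1.0), or encode the special-bond level into
// the SBBITS bits of the stored neighbor index.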
__global__ void FindSpecial(int block_style)
{
int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int which;
int tag_mask=0;
int3 spec_flag;
int3 mynspecial = {0,0,1};
if(ii>=_nlocal) return;
int special_id[CUDA_MAX_NSPECIAL];
int i = _ilist[ii];
if(i>=_nlocal) return;
int jnum = _numneigh[i];
if (_special_flag[1] == 0) spec_flag.x = -1;
else if (_special_flag[1] == 1) spec_flag.x = 0;
else spec_flag.x = 1;
if (_special_flag[2] == 0) spec_flag.y = -1;
else if (_special_flag[2] == 1) spec_flag.y = 0;
else spec_flag.y = 2;
if (_special_flag[3] == 0) spec_flag.z = -1;
else if (_special_flag[3] == 1) spec_flag.z = 0;
else spec_flag.z = 3;
mynspecial.x=_nspecial[i];
mynspecial.y=_nspecial[i+_nmax];
mynspecial.z=_nspecial[i+2*_nmax];
if(i<_nlocal)
{
int* list = &_special[i];
for(int k=0;k<mynspecial.z;k++)
{
special_id[k]=list[k*_nmax];
tag_mask = tag_mask|special_id[k];
}
}
for(int k=0;k<MIN(jnum,_maxneighbors);k++)
{
int j;
if(block_style)
j = _neighbors[i*_maxneighbors+k];
else
j = _neighbors[i+k*_nlocal];
int tag_j=_tag[j];
which=0;
if((tag_mask&tag_j)==tag_j)
{
which = find_special(mynspecial,special_id,tag_j,spec_flag);
if(which>0)
{
if(block_style)
_neighbors[i*_maxneighbors+k]=j ^ (which << SBBITS);
else
_neighbors[i+k*_nlocal]=j ^ (which << SBBITS);
}
else if(which<0)
{
if(block_style)
_neighbors[i*_maxneighbors+k]=_neighbors[i*_maxneighbors+jnum-1];
else
_neighbors[i+k*_nlocal]=_neighbors[i+(jnum-1)*_nlocal];
jnum--;
k--;
}
}
}
_numneigh[i]=jnum;
}
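// Variant of the full-bin build used when sdata->overlap_comm is set: besides
// the full list it splits neighbors into an "inner" list (both atoms local)
// and per-border-atom "border" lists (ghost partners), which lets force
// kernels run on inner pairs while ghost communication is still in flight.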
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style)
{
int bin_dim_z=gridDim.y;
CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer;
binned_x = &binned_x[2];
int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax];
int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y;
int bin_x = blockIdx.x/bin_dim_y;
int bin_y = blockIdx.x-bin_x*bin_dim_y;
int bin_z = blockIdx.y;
int bin_c = bin_count[bin];
CUDA_FLOAT cut;
if(globcutoff>0)
cut = globcutoff;
int i=_nall;
CUDA_FLOAT* my_x;
CUDA_FLOAT x_i,y_i,z_i;
for(int actOffset=0; actOffset<bin_c; actOffset+=blockDim.x){
int actIdx=threadIdx.x+actOffset;
CUDA_FLOAT* other_x=shared;
int* other_id=(int*) &other_x[3*blockDim.x];
if(actIdx < bin_c)
{
i = binned_id[__mul24(bin,bin_nmax)+actIdx];
my_x = binned_x + __mul24(__mul24(bin,3),bin_nmax)+actIdx;
x_i = *my_x; my_x += bin_nmax;
y_i = *my_x; my_x += bin_nmax;
z_i = *my_x;
}
else
i=2*_nall;
__syncthreads();
int jnum=0;
int jnum_border=0;
int jnum_inner=0;
int i_border=-1;
int itype;
if(i<_nlocal)
{
jnum = 0;
_ilist[i]=i;
itype = _type[i];
}
__syncthreads();
for(int otherActOffset=0; otherActOffset<bin_c; otherActOffset+=blockDim.x){
int otherActIdx=threadIdx.x+otherActOffset;
if(otherActIdx<bin_c)
{
if(otherActOffset==actOffset)
{
other_id[threadIdx.x]=i;
other_x[threadIdx.x] = x_i;
other_x[threadIdx.x+blockDim.x] = y_i;
other_x[threadIdx.x+2*blockDim.x] = z_i;
}
else
{
other_id[threadIdx.x] = binned_id[__mul24(bin,bin_nmax)+otherActIdx];
my_x = binned_x + __mul24(__mul24(bin,3),bin_nmax)+otherActIdx;
other_x[threadIdx.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+blockDim.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+__mul24(2,blockDim.x)] = *my_x;
}
}
__syncthreads();
int kk=threadIdx.x;
for(int k = 0; k < MIN(bin_c-otherActOffset,blockDim.x); ++k)
{
if(i<_nlocal)
{
kk++;
kk=kk<MIN(bin_c-otherActOffset,blockDim.x)?kk:0;
int j = other_id[kk];
if(globcutoff<0)
{
int jtype = _type[j];
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[kk];
CUDA_FLOAT dely = y_i - other_x[kk+blockDim.x];
CUDA_FLOAT delz = z_i - other_x[kk+2*blockDim.x];
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq <= cut && i != j)
{
if((j>=_nlocal)&&(i_border<0))
i_border=atomicAdd(_inum_border,1);
if(jnum<_maxneighbors)
{
if(block_style)
{
_neighbors[i*_maxneighbors+jnum]= j;
if(j>=_nlocal)
{_neighbors_border[i_border*_maxneighbors+jnum_border]=j;}
else
{_neighbors_inner[i*_maxneighbors+jnum_inner]=j;}
}
else
{
_neighbors[i+jnum*_nlocal]=j;
if(j>=_nlocal)
{_neighbors_border[i_border+jnum_border*_nlocal]=j;}
else
{_neighbors_inner[i+jnum_inner*_nlocal]=j;}
}
}
++jnum;
if(j>=_nlocal)
jnum_border++;
else
jnum_inner++;
}
}
}
__syncthreads();
}
for(int obin_x=bin_x-1;obin_x<bin_x+2;obin_x++)
for(int obin_y=bin_y-1;obin_y<bin_y+2;obin_y++)
for(int obin_z=bin_z-1;obin_z<bin_z+2;obin_z++)
{
if(obin_x<0||obin_y<0||obin_z<0) continue;
if(obin_x>=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue;
int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z;
if(other_bin==bin) continue;
int obin_c=bin_count[other_bin];
for(int otherActOffset=0; otherActOffset<obin_c; otherActOffset+=blockDim.x){
int otherActIdx=otherActOffset+threadIdx.x;
if(threadIdx.x < MIN(blockDim.x,obin_c-otherActOffset))
{
other_id[threadIdx.x] = binned_id[__mul24(other_bin,bin_nmax)+otherActIdx];
my_x = binned_x + __mul24(__mul24(other_bin,3),bin_nmax)+otherActIdx;
other_x[threadIdx.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+blockDim.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+2*blockDim.x] = *my_x;
}
__syncthreads();
for(int k = 0; k < MIN(blockDim.x,obin_c-otherActOffset); ++k)
{
if(i<_nlocal)
{
int j = other_id[k];
if(globcutoff<0)
{
int jtype = _type[j];
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[k];
CUDA_FLOAT dely = y_i - other_x[k+blockDim.x];
CUDA_FLOAT delz = z_i - other_x[k+2*blockDim.x];
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq <= cut && i != j)
{
if((j>=_nlocal)&&(i_border<0))
i_border=atomicAdd(_inum_border,1);
if(jnum<_maxneighbors)
{
if(block_style)
{
_neighbors[i*_maxneighbors+jnum]= j;
if(j>=_nlocal)
{_neighbors_border[i_border*_maxneighbors+jnum_border]=j;}
else
{_neighbors_inner[i*_maxneighbors+jnum_inner]=j;}
}
else
{
_neighbors[i+jnum*_nlocal]=j;
if(j>=_nlocal)
{_neighbors_border[i_border+jnum_border*_nlocal]=j;}
else
{_neighbors_inner[i+jnum_inner*_nlocal]=j;}
}
}
++jnum;
if(j>=_nlocal)
jnum_border++;
else
jnum_inner++;
}
}
}
__syncthreads();
}
}
if(jnum > _maxneighbors) ((int*)_buffer)[0] = -jnum;
if(i<_nlocal)
{
_numneigh[i] = jnum;
_numneigh_inner[i] = jnum_inner;
if(i_border>=0) _numneigh_border[i_border] = jnum_border;
if(i_border>=0) _ilist_border[i_border] = i;
}
}
}
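// O(N^2) fallback build: each local atom tests every atom in the box against
// the per-type-pair squared cutoff; no binning required.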
__global__ void NeighborBuildFullNsq_Kernel()
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* buffer = (int*) _buffer;
if(i < _nlocal)
{
X_FLOAT* my_x = _x + i;
CUDA_FLOAT x_i = *my_x; my_x += _nmax;
CUDA_FLOAT y_i = *my_x; my_x += _nmax;
CUDA_FLOAT z_i = *my_x;
int jnum = 0;
int* jlist = _firstneigh[i];
_ilist[i]=i;
int itype = _type[i];
__syncthreads();
for(int j = 0; j < _nall; ++j)
{
my_x = _x + j;
CUDA_FLOAT x_j = *my_x; my_x += _nmax;
CUDA_FLOAT y_j = *my_x; my_x += _nmax;
CUDA_FLOAT z_j = *my_x;
CUDA_FLOAT delx = x_i - x_j;
CUDA_FLOAT dely = y_i - y_j;
CUDA_FLOAT delz = z_i - z_j;
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
int jtype = _type[j];
if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j)
{
if(jnum<_maxneighbors)
jlist[jnum] = j;
++jnum;
}
__syncthreads();
}
if(jnum > _maxneighbors) buffer[0] = 0;
_numneigh[i] = jnum;
}
}


@@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _sigma MY_AP(coeff2)
#define _a MY_AP(coeff3)
#define _c MY_AP(coeff4)
#define _d MY_AP(coeff5)
#include "pair_born_coul_long_cuda_cu.h"
#include "pair_born_coul_long_cuda_kernel_nc.cu"
#include <time.h>
void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata)
{
Cuda_Pair_Init_AllStyles(sdata, 5,true);
}
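// Launch the born/coul/long pair kernel, either one thread per atom (TpA) or
// one block per atom (BpA) depending on sdata->pair.use_block_per_atom.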
void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
{
static short init=0;
if(! init)
{
init = 1;
Cuda_PairBornCoulLongCuda_Init(sdata);
}
dim3 grid,threads;
int sharedperproc;
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BORN,COUL_LONG,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
else
Pair_Kernel_TpA<PAIR_BORN,COUL_LONG,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _sigma
#undef _a
#undef _c
#undef _d


@@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom);
#endif


@@ -1,34 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
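// Born-Mayer-Huggins pair evaluation:
//   E(r) = A*exp((sigma - r)/rho) - C/r^6 + D/r^8
// with _rhoinv, _sigma, _a, _c, _d supplied by the including pair style.
// Returns factor_lj * F(r)/r (i.e. forceborn * r2inv), so the caller can
// scale the displacement vector directly.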
__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl)
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT r6inv = r2inv*r2inv*r2inv;
const F_FLOAT rexp = _EXP_((_sigma[ij_type]-r)*_rhoinv[ij_type]);
const F_FLOAT forceborn = _a[ij_type]*_rhoinv[ij_type]*r*rexp -
F_F(6.0)*_c[ij_type]*r6inv + F_F(8.0)*_d[ij_type]*r2inv*r6inv;
if(eflag) evdwl += factor_lj*(_a[ij_type]*rexp - _c[ij_type]*r6inv
+_d[ij_type]*r2inv*r6inv-_offset[ij_type]);
return factor_lj*forceborn*r2inv;
}


@@ -1,74 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_coul_cut_cuda_cu.h"
#include <time.h>
void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata)
{
Cuda_Pair_Init_AllStyles(sdata, 5,true);
}
void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
{
static short init=0;
if(! init)
{
init = 1;
Cuda_PairBuckCoulCutCuda_Init(sdata);
}
dim3 grid,threads;
int sharedperproc;
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BUCK,COUL_CUT,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
else
Pair_Kernel_TpA<PAIR_BUCK,COUL_CUT,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c


@@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom);
#endif


@@ -1,77 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_coul_long_cuda_cu.h"
#include <time.h>
void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata)
{
Cuda_Pair_Init_AllStyles(sdata, 5,true);
}
void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
{
static short init=0;
if(! init)
{
init = 1;
Cuda_PairBuckCoulLongCuda_Init(sdata);
}
dim3 grid,threads;
int sharedperproc;
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BUCK,COUL_LONG,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
else
Pair_Kernel_TpA<PAIR_BUCK,COUL_LONG,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c

Some files were not shown because too many files have changed in this diff.