git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa
commit 5f799182b3
parent 2be078632d
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

-CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
 CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON

 BIN_DIR = ./
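The recurring change across all of these machine makefiles is the addition of -DUCL_NO_EXIT. The intent implied by the rest of the commit is that, with the flag set, Geryon's error checks report the failure instead of terminating the process, which is what allows init_atomic() below to return an error code rather than a bool. A minimal sketch of that compile-time switch, assuming this convention — the macro and flag name here are illustrative, not the actual Geryon source:

```cpp
// Hypothetical "exit vs. report" error macro, sketching the UCL_NO_EXIT
// convention this commit enables; not copied from Geryon.
#include <cstdio>
#include <cstdlib>

#ifdef UCL_NO_EXIT
// Builds with UCL_NO_EXIT record the failure so the caller can propagate
// an error code (e.g. init_atomic() returning -3 on out-of-memory).
static int ucl_error_flag = 0;
#define UCL_CHECK(cond, msg)                                              \
  do { if (!(cond)) { fprintf(stderr, "UCL error: %s\n", (msg));          \
                      ucl_error_flag = 1; } } while (0)
#else
// Default builds abort immediately on a device error.
#define UCL_CHECK(cond, msg)                                              \
  do { if (!(cond)) { fprintf(stderr, "UCL error: %s\n", (msg));          \
                      exit(EXIT_FAILURE); } } while (0)
#endif
```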
@@ -17,16 +17,16 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */

-CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/
+CUDA_HOME = /sw/analysis-x64/cuda/3.2/centos5.5_binary/
 NVCC = nvcc

 CUDA_ARCH = -arch=sm_13
-CUDA_PRECISION = -D_SINGLE_SINGLE
+CUDA_PRECISION = -D_SINGLE_DOUBLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

-CUDR_CPP = mpic++ -DMPI_GERYON -openmp
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp
 CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias

 BIN_DIR = ./
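The _SINGLE_SINGLE/_SINGLE_DOUBLE macros select the numeric/accumulator precision pair used throughout the library; the sources below instantiate templates on PRECISION and ACC_PRECISION. A plausible sketch of how a precision header could map the macros to those names — an assumed layout, not the actual contents of pair_gpu_precision.h:

```cpp
// Hypothetical precision-selection header, assuming the CUDA_PRECISION
// macros choose the numtyp/acctyp pair; the real pair_gpu_precision.h
// may differ in detail.
#if defined(_SINGLE_SINGLE)
typedef float  PRECISION;      // numtyp: positions/forces on the device
typedef float  ACC_PRECISION;  // acctyp: energy/virial accumulation
#elif defined(_SINGLE_DOUBLE)
typedef float  PRECISION;      // single-precision arithmetic...
typedef double ACC_PRECISION;  // ...with double-precision accumulation
#else
typedef double PRECISION;
typedef double ACC_PRECISION;
#endif
```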
@@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

-CUDR_CPP = mpic++ -DMPI_GERYON
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT
 CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops

 BIN_DIR = ./
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

-CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias

 BIN_DIR = ./
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */

-OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 OCL_LINK = -lOpenCL
 OCL_PREC = -D_SINGLE_SINGLE
@@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

-CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias

 BIN_DIR = ./
@@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11
 CUDA_PRECISION = -D_SINGLE_SINGLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib
-CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
+CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v --use_fast_math -m32

 CUDR_CPP = mpic++
 CUDR_OPTS = -O2 -m32 -g
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */

-OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
+OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT
 OCL_LINK = -framework OpenCL
 OCL_PREC = -D_SINGLE_SINGLE
@@ -13,7 +13,8 @@
 #
 # /* ----------------------------------------------------------------------
 # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 # Peng Wang (Nvidia), penwang@nvidia.com
+# Inderaj Bains (NVIDIA), ibains@nvidia.com
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */

@@ -28,10 +29,11 @@ GPU_LIB = $(LIB_DIR)/libgpu.a
 # Headers for Geryon
 UCL_H = $(wildcard ./geryon/ucl*.h)
 NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H)
-NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H)
+NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) nv_kernel_def.h
 # Headers for Pair Stuff
-PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
-         pair_gpu_device.h pair_gpu_balance.h
+PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+         pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+         pair_gpu_balance.h pppm_gpu_memory.h

 ALL_H = $(NVD_H) $(PAIR_H)

@@ -39,28 +41,37 @@ EXECS = $(BIN_DIR)/nvc_get_devices
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
         $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
         $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
-       $(OBJ_DIR)/charge_gpu_memory.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
+       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
+       $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
+       $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
        $(CUDPP)
-PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
+PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
+       $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
        $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
        $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
+       $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
+       $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
        $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
        $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
        $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
+       $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
        $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
-       $(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \
+       $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h \
+       $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
        $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h

@@ -93,6 +104,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu

@@ -105,11 +119,20 @@ $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
 $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h

-$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
+$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
-	$(CUDR) -o $@ -c pair_gpu_device.cpp
+$(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu
+	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu
+
+$(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h
+
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h
+	$(CUDR) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(CUDR) -o $@ -c atomic_gpu_memory.cpp

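These rules follow one pattern throughout: compile each .cu kernel to PTX, then run geryon/file_to_cstr.sh to embed the PTX as a C string header, so the library can JIT-load its kernels at run time instead of shipping separate kernel files. (The OpenCL makefile below applies the same script directly to the .cu source, which OpenCL compiles at run time.) A hypothetical illustration of what such a generated header might look like — the symbol name and PTX lines are made up for the example:

```cpp
// Illustrative sketch of a file_to_cstr.sh-style generated header; the
// actual symbol name and formatting are assumptions, not the script's
// real output.
const char *pair_gpu_dev_kernel =
  ".version 2.1\n"
  ".target sm_13\n"
  "// ...remaining PTX assembly, one escaped string per source line...\n";
```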
@@ -117,6 +140,24 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp

+$(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu
+
+$(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h
+
+$(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu
+
+$(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h
+
+$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h
+	$(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
+	$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu

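Note that the same pppm_gpu_kernel.cu is compiled twice, once with -Dgrdtyp=float and once with -Dgrdtyp=double, producing single- and double-precision grid kernels from one source. A self-contained sketch of that compile-time specialization — the function body is a made-up stand-in, not the real pppm kernel:

```cpp
// Illustrative only: one source specialized via -Dgrdtyp=float or
// -Dgrdtyp=double, as in the two pppm rules above.
#ifndef grdtyp
#define grdtyp float  // default when no -Dgrdtyp=... is passed
#endif

void zero_grid(grdtyp *brick, const int npts) {
  // The CUDA build would do this per-thread inside a __global__ kernel;
  // a serial loop keeps the sketch self-contained.
  for (int i = 0; i < npts; ++i)
    brick[i] = static_cast<grdtyp>(0.0);
}
```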
@@ -144,7 +185,7 @@ $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
+$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -156,7 +197,7 @@ $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_c
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
+$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -168,9 +209,21 @@ $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
+$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu
+
+$(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h
+
+$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
+	$(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu

@@ -180,7 +233,7 @@ $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ke
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
+$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -192,9 +245,21 @@ $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj9
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
+$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu
+
+$(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h
+
+$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
+	$(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu

@@ -204,7 +269,7 @@ $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_c
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
+$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
@@ -216,7 +281,7 @@ $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/c
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
+$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)

 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
@@ -14,6 +14,7 @@
 # /* ----------------------------------------------------------------------
 # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 # Peng Wang (Nvidia), penwang@nvidia.com
+# Inderaj Bains (NVIDIA), ibains@nvidia.com
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */

@@ -23,30 +24,37 @@ OCL_LIB = $(LIB_DIR)/libgpu.a
 UCL_H = $(wildcard ./geryon/ucl*.h)
 OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
-         pair_gpu_device.h pair_gpu_balance.h
+PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+         pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+         pair_gpu_balance.h pppm_gpu_memory.h

 ALL_H = $(OCL_H) $(PAIR_H)

 EXECS = $(BIN_DIR)/ocl_get_devices
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
-       $(OBJ_DIR)/charge_gpu_memory.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
+       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
        $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
        $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
-KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
+KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \
+       $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \
        $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
        $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
-       $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
-       $(OBJ_DIR)/crml_gpu_cl.h \
-       $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
+       $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \
+       $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \
+       $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \
+       $(OBJ_DIR)/cmmc_long_gpu_cl.h

+OCL_EXECS = $(BIN_DIR)/ocl_get_devices
+
 all: $(OCL_LIB) $(EXECS)
@@ -57,14 +65,23 @@ $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
 	$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(OCL_H)
+	$(OCL) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h

-$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
+$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
+	$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
 	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
-	$(OCL) -o $@ -c pair_gpu_device.cpp
+$(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h
+
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h
+	$(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(OCL) -o $@ -c atomic_gpu_memory.cpp

@@ -72,6 +89,15 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp

+$(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;
+
+$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h $(OBJ_DIR)/pppm_gpu_cl.h
+	$(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
+	$(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
 	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h

@@ -93,7 +119,7 @@ $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
+$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
@@ -102,7 +128,7 @@ $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
+$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
@@ -111,16 +137,25 @@ $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
+$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h;
+
+$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
+	$(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;

 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
+$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
|
@ -129,16 +164,25 @@ $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
|
|||
$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
|
||||
$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
|
||||
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
|
||||
$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu
|
||||
$(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h;
|
||||
|
||||
$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
|
||||
$(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
|
||||
$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
|
||||
$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
|
||||
|
||||
$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
|
||||
$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
|
||||
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
|
||||
$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
|
||||
|
@@ -147,7 +191,7 @@ $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
+$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)

 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
@@ -14,6 +14,7 @@
 /* ----------------------------------------------------------------------
 Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 Peng Wang (Nvidia), penwang@nvidia.com
+Inderaj Bains (NVIDIA), ibains@nvidia.com
 Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */

@@ -23,23 +23,28 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor();
 }

 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::~AtomicGPUMemory() {
+  delete ans;
+  delete nbor;
 }

 template <class numtyp, class acctyp>
 int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }

 template <class numtyp, class acctyp>
-bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
-                                   const int max_nbors, const int maxspecial,
-                                   const double cell_size,
-                                   const double gpu_split, FILE *_screen,
-                                   const char *pair_program) {
+int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
+                                  const int max_nbors, const int maxspecial,
+                                  const double cell_size,
+                                  const double gpu_split, FILE *_screen,
+                                  const char *pair_program) {
   nbor_time_avail=false;
   screen=_screen;

@@ -48,24 +53,30 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
     gpu_nbor=true;

   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;

-  if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  _threads_per_atom=device->threads_per_atom();
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;

-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
   compile_kernels(*ucl_device,pair_program);

   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);

   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
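Before initializing the device, init_atomic() asks the load balancer how many particles stay on the host for a fixed split (first_host_count). A sketch of the fixed-split arithmetic this implies — assumed behavior, not the actual PairGPUBalance source, which also handles dynamic balancing:

```cpp
// Hypothetical sketch, assuming gpu_split in (0,1] is the fraction of
// particles handled by the device (per the init_atomic() docs below);
// not the real PairGPUBalance implementation.
inline int first_host_count(const int nlocal, const double gpu_split,
                            const bool /*gpu_nbor*/) {
  if (gpu_split >= 1.0)
    return 0;  // everything is handled on the device
  // the remainder of the local atoms is computed on the host CPU
  return nlocal - static_cast<int>(gpu_split * nlocal);
}
```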
@@ -73,9 +84,14 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,

   pos_tex.bind_float(atom->dev_x,4);

-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

-  return true;
+  return 0;
 }

 template <class numtyp, class acctyp>
+void AtomicGPUMemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
+}
+
+template <class numtyp, class acctyp>
@@ -83,7 +99,10 @@ void AtomicGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);

   if (_compiled) {
     k_pair_fast.clear();
@@ -107,8 +126,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
   success=true;

   nbor_time_avail=true;
-
-  int mn=nbor->max_nbor_loop(inum,numj);
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
@@ -116,7 +134,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,

   nbor->get_host(inum,ilist,numj,firstneigh,block_size());

-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;

@@ -130,8 +148,8 @@ template <class numtyp, class acctyp>
 inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
                                               const int host_inum,
                                               const int nall, double **host_x,
-                                              int *host_type, double *boxlo,
-                                              double *boxhi, int *tag,
+                                              int *host_type, double *sublo,
+                                              double *subhi, int *tag,
                                               int **nspecial, int **special,
                                               bool &success) {
   nbor_time_avail=true;
@@ -144,10 +162,10 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
   atom->cast_copy_x(host_x,host_type);

   int mn;
-  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
+  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
                         nspecial, special, success, mn);

-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
@@ -156,24 +174,25 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
-                               const int inum_full, const int nall,
-                               double **host_x, int *host_type,
-                               int *ilist, int *numj, int **firstneigh,
-                               const bool eflag, const bool vflag,
-                               const bool eatom, const bool vatom,
-                               int &host_start, const double cpu_time,
-                               bool &success) {
+void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
+                               const int nall, double **host_x, int *host_type,
+                               int *ilist, int *numj, int **firstneigh,
+                               const bool eflag, const bool vflag,
+                               const bool eatom, const bool vatom,
+                               int &host_start, const double cpu_time,
+                               bool &success) {
   acc_timers();
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
     resize_atom(0,nall,success);
     zero_timers();
     return;
   }

   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                               nbor->gpu_nbor());
-  atom->inum(inum);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
   host_start=inum;

   if (ago==0) {
@@ -187,7 +206,8 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
   atom->add_x_data(host_x,host_type);

   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }

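The copy_answers/add_ans_object pair visible here is the heart of the refactor: each pair style now owns a PairGPUAns "answer" object for its forces/energies/virials and registers it with the shared device after its kernel loop. A hedged sketch of the registration pattern — the names come from the diff, but the bodies are illustrative assumptions, not the real pair_gpu_device.h:

```cpp
// Illustrative sketch of the add_ans_object() pattern; bodies assumed.
#include <vector>

template <class PairGPUAnsT>
class DeviceSketch {
 public:
  // Each style queues its answer object after computing on the device...
  void add_ans_object(PairGPUAnsT *ans) { _ans_queue.push_back(ans); }
  // ...so a later per-timestep step can drain the queue and accumulate
  // every style's forces/energies into the host arrays in one pass.
 private:
  std::vector<PairGPUAnsT *> _ans_queue;
};
```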
@@ -195,29 +215,32 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
 // Reneighbor on GPU if necessary and then compute forces, virials, energies
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
-                                const int inum_full, const int nall,
-                                double **host_x, int *host_type, double *boxlo,
-                                double *boxhi, int *tag, int **nspecial,
-                                int **special, const bool eflag,
-                                const bool vflag, const bool eatom,
-                                const bool vatom, int &host_start,
-                                const double cpu_time, bool &success) {
+int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
+                                 const int nall, double **host_x, int *host_type,
+                                 double *sublo, double *subhi, int *tag,
+                                 int **nspecial, int **special, const bool eflag,
+                                 const bool vflag, const bool eatom,
+                                 const bool vatom, int &host_start,
+                                 int **ilist, int **jnum,
+                                 const double cpu_time, bool &success) {
   acc_timers();
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
     resize_atom(0,nall,success);
     zero_timers();
     return NULL;
   }

-  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
-  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  atom->inum(inum);
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
   host_start=inum;

   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    boxlo, boxhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     hd_balancer.start_timer();
@@ -226,19 +249,21 @@ int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
     hd_balancer.start_timer();
     atom->add_x_data(host_x,host_type);
   }
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();

   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();

-  return device->nbor.host_nbor.begin();
+  return nbor->host_jlist.begin()-host_start;
 }

 template <class numtyp, class acctyp>
 double AtomicGPUMemoryT::host_memory_usage_atomic() const {
-  return device->atom.host_memory_usage()+
-         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
-         sizeof(AtomicGPUMemory<numtyp,acctyp>);
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(AtomicGPUMemory<numtyp,acctyp>);
 }

 template <class numtyp, class acctyp>
@@ -18,8 +18,6 @@
 #ifndef ATOMIC_GPU_MEMORY_H
 #define ATOMIC_GPU_MEMORY_H

-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
@@ -39,17 +37,28 @@ class AtomicGPUMemory {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
-                   const int maxspecial, const double cell_size,
-                   const double gpu_split, FILE *screen,
-                   const char *pair_program);
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const char *pair_program);
+
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();

   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(inum, nall, success))
+    if (atom->resize(nall, success))
       pos_tex.bind_float(atom->dev_x,4);
+    ans->resize(inum,success);
   }

   /// Check if there is enough storage for neighbors and realloc if not
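With init_atomic() now returning an int instead of a bool, a caller can map each documented code to a specific diagnostic before aborting the run. A hedged sketch of caller-side handling — the helper function and message wording are assumptions, not the actual LAMMPS fix/pair wrapper code:

```cpp
// Illustrative mapping of the init_atomic() return codes documented above;
// the function name and messages are made up for the example.
const char *init_error_string(const int err) {
  switch (err) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on device";
    case -4: return "GPU library not compiled for this accelerator";
    case -5: return "double precision is not supported on this card";
    default: return "unknown GPU library error";
  }
}
```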
@@ -85,13 +94,16 @@ class AtomicGPUMemory {

   /// Accumulate timers
   inline void acc_timers() {
-    if (nbor_time_avail) {
-      nbor->time_nbor.add_to_total();
-      nbor->time_kernel.add_to_total();
-      nbor_time_avail=false;
+    if (device->time_device()) {
+      if (nbor_time_avail) {
+        nbor->time_nbor.add_to_total();
+        nbor->time_kernel.add_to_total();
+        nbor_time_avail=false;
+      }
+      time_pair.add_to_total();
+      atom->acc_timers();
+      ans->acc_timers();
     }
-    time_pair.add_to_total();
-    atom->acc_timers();
   }

   /// Zero timers
@@ -99,6 +111,7 @@ class AtomicGPUMemory {
     nbor_time_avail=false;
     time_pair.zero();
     atom->zero_timers();
+    ans->zero_timers();
   }

   /// Copy neighbor list from host
@@ -108,24 +121,32 @@ class AtomicGPUMemory {
   /// Build neighbor list on device
   void build_nbor_list(const int inum, const int host_inum,
                        const int nall, double **host_x, int *host_type,
-                       double *boxlo, double *boxhi, int *tag, int **nspecial,
+                       double *sublo, double *subhi, int *tag, int **nspecial,
                        int **special, bool &success);

   /// Pair loop with host neighboring
-  void compute(const int timestep, const int f_ago, const int inum_full,
+  void compute(const int f_ago, const int inum_full,
               const int nall, double **host_x, int *host_type,
               int *ilist, int *numj, int **firstneigh, const bool eflag,
               const bool vflag, const bool eatom, const bool vatom,
               int &host_start, const double cpu_time, bool &success);

   /// Pair loop with device neighboring
-  int * compute(const int timestep, const int ago, const int inum_full,
-                const int nall, double **host_x, int *host_type, double *boxlo,
-                double *boxhi, int *tag, int **nspecial,
+  int * compute(const int ago, const int inum_full,
+                const int nall, double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
                 int **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
                 const double cpu_time, bool &success);

+  /// Pair loop with device neighboring
+  int ** compute(const int ago, const int inum_full,
+                 const int nall, double **host_x, int *host_type, double *sublo,
+                 double *subhi, int *tag, int **nspecial,
+                 int **special, const bool eflag, const bool vflag,
+                 const bool eatom, const bool vatom, int &host_start,
+                 int **ilist, int **numj, const double cpu_time, bool &success);
+
   // -------------------------- DEVICE DATA -------------------------

   /// Device Properties and Atom and Neighbor storage
@@ -148,6 +169,9 @@ class AtomicGPUMemory {
   /// Atom Data
   PairGPUAtom<numtyp,acctyp> *atom;

+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;

   // --------------------------- NBOR DATA ----------------------------

@@ -167,8 +191,10 @@ class AtomicGPUMemory {

  protected:
   bool _compiled;
-  int _block_size;
+  int _block_size, _threads_per_atom;
   double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;

   void compile_kernels(UCL_Device &dev, const char *pair_string);

@@ -23,23 +23,28 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor();
 }

 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::~ChargeGPUMemory() {
+  delete ans;
+  delete nbor;
 }

 template <class numtyp, class acctyp>
 int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }

 template <class numtyp, class acctyp>
-bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
-                                   const int max_nbors, const int maxspecial,
-                                   const double cell_size,
-                                   const double gpu_split, FILE *_screen,
-                                   const char *pair_program) {
+int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
+                                  const int max_nbors, const int maxspecial,
+                                  const double cell_size,
+                                  const double gpu_split, FILE *_screen,
+                                  const char *pair_program) {
   nbor_time_avail=false;
   screen=_screen;

@@ -48,24 +53,31 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
     gpu_nbor=true;

   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;

-  if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  _threads_per_atom=device->threads_per_charge();
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;

-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
+  _block_bio_size=device->block_bio_pair();
   compile_kernels(*ucl_device,pair_program);

   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);

   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
@@ -74,9 +86,14 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
   pos_tex.bind_float(atom->dev_x,4);
   q_tex.bind_float(atom->dev_q,1);

-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

-  return true;
+  return success;
 }

 template <class numtyp, class acctyp>
+void ChargeGPUMemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
+}
+
+template <class numtyp, class acctyp>
@@ -84,7 +101,10 @@ void ChargeGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);

   if (_compiled) {
     k_pair_fast.clear();
@@ -109,7 +129,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,

   nbor_time_avail=true;

-  int mn=nbor->max_nbor_loop(inum,numj);
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
@@ -117,7 +137,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,

   nbor->get_host(inum,ilist,numj,firstneigh,block_size());

-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;

@@ -131,8 +151,8 @@ template <class numtyp, class acctyp>
 inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
                                               const int host_inum,
                                               const int nall, double **host_x,
-                                              int *host_type, double *boxlo,
-                                              double *boxhi, int *tag,
+                                              int *host_type, double *sublo,
+                                              double *subhi, int *tag,
                                               int **nspecial, int **special,
                                               bool &success) {
   nbor_time_avail=true;
@@ -145,10 +165,10 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
   atom->cast_copy_x(host_x,host_type);

   int mn;
-  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
+  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
                         nspecial, special, success, mn);

-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
@@ -157,24 +177,26 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
-                               const int inum_full, const int nall,
-                               double **host_x, int *host_type,
-                               int *ilist, int *numj, int **firstneigh,
-                               const bool eflag, const bool vflag,
-                               const bool eatom, const bool vatom,
-                               int &host_start, const double cpu_time,
-                               bool &success, double *host_q) {
+void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
+                               const int nall, double **host_x, int *host_type,
+                               int *ilist, int *numj, int **firstneigh,
+                               const bool eflag, const bool vflag,
+                               const bool eatom, const bool vatom,
+                               int &host_start, const double cpu_time,
+                               bool &success, double *host_q,
+                               const int nlocal, double *boxlo, double *prd) {
   acc_timers();
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
     resize_atom(0,nall,success);
     zero_timers();
     return;
   }

   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                               nbor->gpu_nbor());
-  atom->inum(inum);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
   host_start=inum;

   if (ago==0) {
@@ -187,10 +209,14 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
   atom->cast_q_data(host_q);
   hd_balancer.start_timer();
   atom->add_x_data(host_x,host_type);
-  atom->add_other_data();
+  atom->add_q_data();
+
+  device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);

   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }

@@ -198,30 +224,33 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
 // Reneighbor on GPU if necessary and then compute forces, virials, energies
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
-                                const int inum_full, const int nall,
-                                double **host_x, int *host_type, double *boxlo,
-                                double *boxhi, int *tag, int **nspecial,
-                                int **special, const bool eflag,
+int** ChargeGPUMemoryT::compute(const int ago, const int inum_full,
+                                const int nall, double **host_x, int *host_type,
+                                double *sublo, double *subhi, int *tag,
+                                int **nspecial, int **special, const bool eflag,
                                 const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
+                                int **ilist, int **jnum,
                                 const double cpu_time, bool &success,
-                                double *host_q) {
+                                double *host_q, double *boxlo, double *prd) {
   acc_timers();
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
     resize_atom(0,nall,success);
     zero_timers();
     return NULL;
   }

-  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
-  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  atom->inum(inum);
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
   host_start=inum;

   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    boxlo, boxhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     atom->cast_q_data(host_q);
@@ -232,20 +261,25 @@ int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
     hd_balancer.start_timer();
     atom->add_x_data(host_x,host_type);
   }
-  atom->add_other_data();
+  atom->add_q_data();
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();

+  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);

   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();

-  return device->nbor.host_nbor.begin();
+  return nbor->host_jlist.begin()-host_start;
 }

 template <class numtyp, class acctyp>
 double ChargeGPUMemoryT::host_memory_usage_atomic() const {
-  return device->atom.host_memory_usage()+
-         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
-         sizeof(ChargeGPUMemory<numtyp,acctyp>);
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(ChargeGPUMemory<numtyp,acctyp>);
 }

 template <class numtyp, class acctyp>
@@ -18,8 +18,6 @@
 #ifndef CHARGE_GPU_MEMORY_H
 #define CHARGE_GPU_MEMORY_H

-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
@ -39,19 +37,30 @@ class ChargeGPUMemory {
|
|||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device **/
|
||||
bool init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const char *pair_program);
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const char *pair_program);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead();
|
||||
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
if (atom->resize(inum, nall, success)) {
|
||||
if (atom->resize(nall, success)) {
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
q_tex.bind_float(atom->dev_q,1);
|
||||
}
|
||||
ans->resize(inum,success);
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
|
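Since init_atomic() now reports an int instead of a bool, a caller can turn the documented codes into messages. A caller-side C++ sketch (the message strings are illustrative, not from this header):

#include <cstdio>

// Sketch only: map the init_atomic()/init() return codes listed above.
inline void report_gpu_init_sketch(FILE *screen, const int err) {
  switch (err) {
    case  0: break;                                                // success
    case -1: fprintf(screen,"fix gpu not found\n"); break;
    case -3: fprintf(screen,"out of memory on the device\n"); break;
    case -4: fprintf(screen,"GPU library not compiled for this GPU\n"); break;
    case -5: fprintf(screen,"double precision unsupported on card\n"); break;
    default: fprintf(screen,"unknown GPU init error %d\n",err);
  }
}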
@@ -87,13 +96,16 @@ class ChargeGPUMemory {

  /// Accumulate timers
  inline void acc_timers() {
    if (nbor_time_avail) {
      nbor->time_nbor.add_to_total();
      nbor->time_kernel.add_to_total();
      nbor_time_avail=false;
    if (device->time_device()) {
      if (nbor_time_avail) {
        nbor->time_nbor.add_to_total();
        nbor->time_kernel.add_to_total();
        nbor_time_avail=false;
      }
      time_pair.add_to_total();
      atom->acc_timers();
      ans->acc_timers();
    }
    time_pair.add_to_total();
    atom->acc_timers();
  }

  /// Zero timers

@@ -101,6 +113,7 @@ class ChargeGPUMemory {
    nbor_time_avail=false;
    time_pair.zero();
    atom->zero_timers();
    ans->zero_timers();
  }

  /// Copy neighbor list from host
@@ -110,24 +123,25 @@ class ChargeGPUMemory {
  /// Build neighbor list on device
  void build_nbor_list(const int inum, const int host_inum,
                       const int nall, double **host_x, int *host_type,
                       double *boxlo, double *boxhi, int *tag, int **nspecial,
                       double *sublo, double *subhi, int *tag, int **nspecial,
                       int **special, bool &success);

  /// Pair loop with host neighboring
  void compute(const int timestep, const int f_ago, const int inum_full,
               const int nall, double **host_x, int *host_type,
               int *ilist, int *numj, int **firstneigh, const bool eflag,
               const bool vflag, const bool eatom, const bool vatom,
               int &host_start, const double cpu_time, bool &success,
               double *charge);
  void compute(const int f_ago, const int inum_full, const int nall,
               double **host_x, int *host_type, int *ilist, int *numj,
               int **firstneigh, const bool eflag, const bool vflag,
               const bool eatom, const bool vatom, int &host_start,
               const double cpu_time, bool &success, double *charge,
               const int nlocal, double *boxlo, double *prd);

  /// Pair loop with device neighboring
  int * compute(const int timestep, const int ago, const int inum_full,
                const int nall, double **host_x, int *host_type, double *boxlo,
                double *boxhi, int *tag, int **nspecial,
  int** compute(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, double *sublo,
                double *subhi, int *tag, int **nspecial,
                int **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                const double cpu_time, bool &success, double *charge);
                int **ilist, int **numj, const double cpu_time, bool &success,
                double *charge, double *boxlo, double *prd);

  // -------------------------- DEVICE DATA -------------------------

@@ -152,6 +166,10 @@ class ChargeGPUMemory {
  PairGPUAtom<numtyp,acctyp> *atom;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  PairGPUAns<numtyp,acctyp> *ans;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data

@@ -171,8 +189,10 @@ class ChargeGPUMemory {

 protected:
  bool _compiled;
  int _block_size;
  int _block_size, _block_bio_size, _threads_per_atom;
  double _max_bytes, _max_an_bytes;
  double _gpu_overhead, _driver_overhead;
  UCL_D_Vec<int> *_nbor_data;

  void compile_kernels(UCL_Device &dev, const char *pair_string);
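The new _nbor_data pointer lets loop() hand the kernels either the pitched neighbor matrix (dev_nbor) or a densely packed list (dev_packed); the kernels below tell the two apart by comparing the pointers they receive. A reduced C++ sketch of that convention (names are illustrative):

// Sketch only: one kernel-argument slot, two layouts. In the pitched
// layout, column ii of a matrix with row stride nbor_pitch holds
// [i, numj, neighbors...]; in the packed layout, the third row holds an
// offset into a separate dense array instead.
inline bool is_pitched_sketch(const int *dev_nbor, const int *dev_packed) {
  return dev_nbor == dev_packed;
}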
@@ -28,12 +28,12 @@ static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int inum, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode,
                  FILE *screen) {
int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                 double **host_lj1, double **host_lj2, double **host_lj3,
                 double **host_lj4, double **offset, double *special_lj,
                 const int inum, const int nall, const int max_nbors,
                 const int maxspecial, const double cell_size, int &gpu_mode,
                 FILE *screen) {
  CMMMF.clear();
  gpu_mode=CMMMF.device->gpu_mode();
  double gpu_split=CMMMF.device->particle_split();

@@ -54,13 +54,11 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
                            host_lj4, offset, special_lj, inum, nall, 300,
                            maxspecial, cell_size, gpu_split, screen);
    if (!init_ok)
      return false;
  }
  int init_ok=0;
  if (world_me==0)
    init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
                       host_lj4, offset, special_lj, inum, nall, 300,
                       maxspecial, cell_size, gpu_split, screen);

  CMMMF.device->world_barrier();
  if (message)

@@ -75,45 +73,45 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
              last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
                              host_lj4, offset, special_lj, inum, nall, 300,
                              maxspecial, cell_size, gpu_split,
                              screen);
      if (!init_ok)
        return false;
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
                         host_lj4, offset, special_lj, inum, nall, 300,
                         maxspecial, cell_size, gpu_split, screen);

    CMMMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;

  if (init_ok==0)
    CMMMF.estimate_gpu_overhead();
  return init_ok;
}

void cmm_gpu_clear() {
  CMMMF.clear();
}

int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** cmm_gpu_compute_n(const int ago, const int inum_full,
                        const int nall, double **host_x, int *host_type,
                        double *boxlo, double *boxhi, int *tag, int **nspecial,
                        double *sublo, double *subhi, int *tag, int **nspecial,
                        int **special, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        const double cpu_time, bool &success) {
  return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, cpu_time, success);
                        int **ilist, int **jnum, const double cpu_time,
                        bool &success) {
  return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, ilist, jnum, cpu_time, success);
}

void cmm_gpu_compute(const int timestep, const int ago, const int inum_full,
                     const int nall, double **host_x, int *host_type,
                     int *ilist, int *numj, int **firstneigh,
                     const bool eflag, const bool vflag, const bool eatom,
                     const bool vatom, int &host_start, const double cpu_time,
                     bool &success) {
  CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
                     const double cpu_time, bool &success) {
  CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
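The staged initialization above keeps kernel compilation and screen output ordered: world rank 0 initializes first, then the remaining ranks take turns, one per GPU. A self-contained sketch of the same barrier pattern in raw MPI (the function and communicator names are illustrative; the library wraps these in world_barrier()/gpu_barrier()):

#include <mpi.h>

// Sketch only: rank 0 initializes first; then one rank per GPU at a time.
void staged_init_sketch(MPI_Comm world, MPI_Comm per_gpu, int procs_per_gpu,
                        int (*do_init)()) {
  int world_me, gpu_rank, init_ok=0;
  MPI_Comm_rank(world,&world_me);
  MPI_Comm_rank(per_gpu,&gpu_rank);
  if (world_me==0)
    init_ok=do_init();
  MPI_Barrier(world);              // everyone waits for the first init
  for (int i=0; i<procs_per_gpu; i++) {
    if (gpu_rank==i && world_me!=0)
      init_ok=do_init();
    MPI_Barrier(per_gpu);
  }
  (void)init_ok;  // real code returns this code to the caller, 0 = success
}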
@@ -18,8 +18,6 @@
#ifndef CMM_GPU_KERNEL
#define CMM_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2

@@ -46,7 +44,7 @@

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;

#ifdef _DOUBLE_DOUBLE

@@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define __inline inline

#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8

#endif
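The BLOCK_PAIR and MAX_SHARED_TYPES fallbacks now live next to the fetch_pos macro, so non-NVIDIA builds see the same names the kernels use. The fetch_pos dual path in miniature (sketch only, with hypothetical names; the real definitions are in this header and nv_kernel_def.h):

#ifdef NV_KERNEL
// CUDA path: positions are read through a texture fetch.
texture<float4> pos_tex_sketch;
__inline float4 fetch_pos_sketch(const int& i, const float4 *pos)
  { return tex1Dfetch(pos_tex_sketch,i); }
#else
// Generic path: the macro degrades to a plain global-memory load.
#define fetch_pos_sketch(i,y) y[i]
#endif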
@@ -82,40 +82,56 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum, const int nall,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  if (ii<inum) {
    acctyp energy=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;
  if (ii<inum) {

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
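Each atom's neighbor loop is now shared by t_per_atom threads: lane `offset` within a group starts at the offset-th neighbor and strides by t_per_atom entries (or by t_per_atom rows of the pitched matrix). The indexing in isolation, as a runnable CUDA sketch over a packed list (array names are illustrative):

__global__ void per_atom_group_sketch(const int *packed, const int *start,
                                      const int *numj, const int inum,
                                      const int t_per_atom) {
  int tid=threadIdx.x;
  int ii=blockIdx.x*(blockDim.x/t_per_atom)+tid/t_per_atom;  // my atom
  int offset=tid%t_per_atom;                                 // my lane
  if (ii<inum) {
    const int *nbor=packed+start[ii]+offset;
    const int *list_end=packed+start[ii]+numj[ii];
    for ( ; nbor<list_end; nbor+=t_per_atom) {
      // accumulate a partial force on atom ii from neighbor *nbor
    }
  }
}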
@@ -164,8 +180,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
      }

    } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

    // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
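The partial sums left by the cooperating threads are folded with a power-of-two tree: halving the stride each step moves every group's total onto its first lane, which is why only offset==0 stores answers afterwards. The same reduction in isolation as a runnable CUDA sketch (it assumes, like the kernel above, that t_per_atom is a power of two no larger than a warp, so a group's lanes run in lockstep without an explicit barrier):

__global__ void group_reduce_sketch(float *out, const float *in,
                                    const int t_per_atom) {
  __shared__ float red_acc[64];          // one slot per thread (block of 64)
  int tid=threadIdx.x;
  int gid=blockIdx.x*blockDim.x+tid;
  int offset=tid%t_per_atom;
  red_acc[tid]=in[gid];
  for (unsigned int s=t_per_atom/2; s>0; s>>=1)
    if (offset<s)
      red_acc[tid]+=red_acc[tid+s];
  if (offset==0)
    out[gid/t_per_atom]=red_acc[tid];    // group total sits on the first lane
}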
@@ -183,49 +238,64 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in,__global int *dev_nbor,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
                               __global numtyp* sp_lj_in,__global int *dev_nbor,
                               __global int *dev_packed, __global acctyp4 *ans,
                               __global acctyp *engv, const int eflag,
                               const int vflag, const int inum, const int nall,
                               const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (ii<4)
    sp_lj[ii]=sp_lj_in[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    if (eflag>0)
      lj3[ii]=lj3_in[ii];
      lj3[tid]=lj3_in[tid];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __syncthreads();

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];

@@ -273,8 +343,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
      }

    } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

    // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
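kernel_pair_fast earns its name by staging the per-type coefficient tables in shared memory once per block, before the pair loop; the switch from ii to tid in the loads above matters because ii now numbers atoms, not threads. The caching idea in isolation as a runnable CUDA sketch (names and the 8-type size are illustrative):

#define SHARED_TYPES_SK 8
// Sketch only: each thread cooperatively loads one table entry, then all
// pair evaluations read the table from shared memory.
__global__ void coeff_cache_sketch(const float4 *lj1_in, float4 *out) {
  __shared__ float4 lj1[SHARED_TYPES_SK*SHARED_TYPES_SK];
  int tid=threadIdx.x;
  if (tid<SHARED_TYPES_SK*SHARED_TYPES_SK)
    lj1[tid]=lj1_in[tid];
  __syncthreads();
  // ...a pair loop would read lj1[itype*SHARED_TYPES_SK+jtype] here...
  out[blockIdx.x*blockDim.x+tid]=lj1[tid%(SHARED_TYPES_SK*SHARED_TYPES_SK)];
}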
@@ -42,22 +42,26 @@ int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}

template <class numtyp, class acctyp>
bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                           int **host_cg_type, double **host_lj1,
                           double **host_lj2, double **host_lj3,
                           double **host_lj4, double **host_offset,
                           double *host_special_lj, const int nlocal,
                           const int nall, const int max_nbors,
                           const int maxspecial, const double cell_size,
                           const double gpu_split, FILE *_screen) {
  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                    _screen,cmm_cut_gpu_kernel);
int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                          int **host_cg_type, double **host_lj1,
                          double **host_lj2, double **host_lj3,
                          double **host_lj4, double **host_offset,
                          double *host_special_lj, const int nlocal,
                          const int nall, const int max_nbors,
                          const int maxspecial, const double cell_size,
                          const double gpu_split, FILE *_screen) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,cmm_cut_gpu_kernel);
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int cmm_types=ntypes;
  shared_types=false;
  if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
    cmm_types=MAX_SHARED_TYPES;
  int max_shared_types=this->device->max_shared_types();
  if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) {
    cmm_types=max_shared_types;
    shared_types=true;
  }
  _cmm_types=cmm_types;

@@ -84,7 +88,7 @@ bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
  return true;
  return 0;
}

template <class numtyp, class acctyp>

@@ -122,9 +126,10 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->atom->inum();
  int ainum=this->ans->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();

@@ -133,16 +138,18 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                          &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch);
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch,
                          &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch);
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
}
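A worked instance of the new grid-size expression: with BX=64 threads per block and _threads_per_atom=4, a block covers 64/4=16 atoms, so inum=10000 atoms need GX=ceil(10000/16)=625 blocks, against ceil(10000/64)=157 before threads-per-atom sharing.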
@@ -29,13 +29,20 @@ class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
  int init(const int ntypes, double **host_cutsq, int **host_cg_type,
           double **host_lj1, double **host_lj2, double **host_lj3,
           double **host_lj4, double **host_offset, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
@@ -28,14 +28,14 @@ static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                   double **host_lj1, double **host_lj2, double **host_lj3,
                   double **host_lj4, double **offset, double *special_lj,
                   const int inum, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size, int &gpu_mode,
                   FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
                   double *host_special_coul, const double qqrd2e,
                   const double g_ewald) {
int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int inum, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode,
                  FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e,
                  const double g_ewald) {
  CMMLMF.clear();
  gpu_mode=CMMLMF.device->gpu_mode();
  double gpu_split=CMMLMF.device->particle_split();

@@ -56,15 +56,12 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
                             host_lj3, host_lj4, offset, special_lj, inum,
                             nall, 300, maxspecial, cell_size, gpu_split,
                             screen, host_cut_ljsq, host_cut_coulsq,
                             host_special_coul, qqrd2e,g_ewald);
    if (!init_ok)
      return false;
  }
  int init_ok=0;
  if (world_me==0)
    init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
                        host_lj4, offset, special_lj, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
                        host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);

  CMMLMF.device->world_barrier();
  if (message)

@@ -79,48 +76,51 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
              last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
                               host_lj3, host_lj4, offset, special_lj, inum,
                               nall, 300, maxspecial, cell_size, gpu_split,
                               screen, host_cut_ljsq, host_cut_coulsq,
                               host_special_coul, qqrd2e, g_ewald);
      if (!init_ok)
        return false;
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
                          host_lj4, offset, special_lj, inum, nall, 300,
                          maxspecial, cell_size, gpu_split, screen,
                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
                          qqrd2e, g_ewald);
    CMMLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;

  if (init_ok==0)
    CMMLMF.estimate_gpu_overhead();
  return init_ok;
}

void cmml_gpu_clear() {
  CMMLMF.clear();
}

int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** cmml_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *boxlo, double *boxhi, int *tag, int **nspecial,
                         double *sublo, double *subhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         const double cpu_time, bool &success, double *host_q) {
  return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, cpu_time, success, host_q);
                         int **ilist, int **jnum, const double cpu_time,
                         bool &success, double *host_q, double *boxlo,
                         double *prd) {
  return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success,
                        host_q,boxlo,prd);
}

void cmml_gpu_compute(const int timestep, const int ago, const int inum_full,
                      const int nall, double **host_x, int *host_type,
                      int *ilist, int *numj, int **firstneigh,
                      const bool eflag, const bool vflag, const bool eatom,
                      const bool vatom, int &host_start, const double cpu_time,
                      bool &success, double *host_q) {
  CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success, double *host_q,
                      const int nlocal, double *boxlo, double *prd) {
  CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                 host_q);
                 host_q,nlocal,boxlo,prd);
}

double cmml_gpu_bytes() {
@@ -18,8 +18,6 @@
#ifndef CMML_GPU_KERNEL
#define CMML_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2

@@ -54,7 +52,7 @@

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;

@@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)

#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8

#endif
@@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nall, const int nbor_pitch,
                          __global numtyp *q_ , const numtyp cut_coulsq,
                          const numtyp qqrd2e, const numtyp g_ewald) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum, const int nall,
                          const int nbor_pitch, __global numtyp *q_ ,
                          const numtyp cut_coulsq, const numtyp qqrd2e,
                          const numtyp g_ewald, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp sp_lj[8];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];

@@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    acctyp energy=(acctyp)0;
    acctyp e_coul=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int itype=ix.w;

    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      numtyp factor_lj, factor_coul;
@@ -213,8 +229,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
      }

    } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;
    red_acc[4][tid]=e_coul;

    // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<5; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];
    e_coul=red_acc[4][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
@@ -234,51 +291,67 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
                               __global numtyp* sp_lj_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch,
                               __global numtyp *q_ , const numtyp cut_coulsq,
                               const numtyp qqrd2e, const numtyp g_ewald) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
                               const numtyp qqrd2e, const numtyp g_ewald,
                               const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];
  if (ii<8)
    sp_lj[ii]=sp_lj_in[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
    lj3[ii]=lj3_in[ii];
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    lj3[tid]=lj3_in[tid];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __syncthreads();

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp e_coul=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      numtyp factor_lj, factor_coul;

@@ -351,8 +424,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
      }

    } // for nbor
  } // if ii

  // Store answers
  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;
    red_acc[4][tid]=e_coul;

    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<5; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];
    e_coul=red_acc[4][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
@@ -43,26 +43,30 @@ int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}

template <class numtyp, class acctyp>
bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                            int **host_cg_type, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,
                            double *host_special_lj, const int nlocal,
                            const int nall, const int max_nbors,
                            const int maxspecial, const double cell_size,
                            const double gpu_split, FILE *_screen,
                            double **host_cut_ljsq,
                            const double host_cut_coulsq,
                            double *host_special_coul, const double qqrd2e,
                            const double g_ewald) {
  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                    _screen,cmmc_long_gpu_kernel);
int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                           int **host_cg_type, double **host_lj1,
                           double **host_lj2, double **host_lj3,
                           double **host_lj4, double **host_offset,
                           double *host_special_lj, const int nlocal,
                           const int nall, const int max_nbors,
                           const int maxspecial, const double cell_size,
                           const double gpu_split, FILE *_screen,
                           double **host_cut_ljsq,
                           const double host_cut_coulsq,
                           double *host_special_coul, const double qqrd2e,
                           const double g_ewald) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,cmmc_long_gpu_kernel);
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

@@ -95,7 +99,7 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
  return true;
  return 0;
}

template <class numtyp, class acctyp>

@@ -133,9 +137,10 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->atom->inum();
  int ainum=this->ans->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();

@@ -144,19 +149,21 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                          &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch,
                          &this->atom->dev_q.begin(), &_cut_coulsq,
                          &_qqrd2e, &_g_ewald);
                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                     &_cut_coulsq, &_qqrd2e, &_g_ewald);
                     &_cut_coulsq, &_qqrd2e, &_g_ewald,
                     &this->_threads_per_atom);
  }
  this->time_pair.stop();
}
@@ -29,15 +29,22 @@ class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double **host_cutsq, int ** cg_type,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const double qqrd2e, const double g_ewald);
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
  int init(const int ntypes, double **host_cutsq, int ** cg_type,
           double **host_lj1, double **host_lj2, double **host_lj3,
           double **host_lj4, double **host_offset, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, double **host_cut_ljsq,
           const double host_cut_coulsq, double *host_special_coul,
           const double qqrd2e, const double g_ewald);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
@@ -28,16 +28,16 @@ static CRML_GPU_Memory<PRECISION,ACC_PRECISION> CRMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   double host_cut_ljsq, double host_cut_coulsq,
                   double *host_special_coul, const double qqrd2e,
                   const double g_ewald, const double cut_lj_innersq,
                   const double denom_lj, double **epsilon,
                   double **sigma, const bool mix_arithmetic) {
int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
                  double **offset, double *special_lj, const int inum,
                  const int nall, const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen,
                  double host_cut_ljsq, double host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e,
                  const double g_ewald, const double cut_lj_innersq,
                  const double denom_lj, double **epsilon,
                  double **sigma, const bool mix_arithmetic) {
  CRMLMF.clear();
  gpu_mode=CRMLMF.device->gpu_mode();
  double gpu_split=CRMLMF.device->particle_split();

@@ -58,16 +58,13 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
                             host_lj4, offset, special_lj, inum, nall, 300,
                             maxspecial, cell_size, gpu_split, screen,
                             host_cut_ljsq, host_cut_coulsq, host_special_coul,
                             qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
                             epsilon,sigma,mix_arithmetic);
    if (!init_ok)
      return false;
  }
  int init_ok=0;
  if (world_me==0)
    init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
                        offset, special_lj, inum, nall, 300, maxspecial, cell_size,
                        gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
                        host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
                        epsilon,sigma,mix_arithmetic);

  CRMLMF.device->world_barrier();
  if (message)

@@ -82,50 +79,54 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
              last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
                               host_lj4, offset, special_lj, inum, nall, 300,
                               maxspecial, cell_size, gpu_split,
                               screen, host_cut_ljsq, host_cut_coulsq,
                               host_special_coul, qqrd2e, g_ewald,
                               cut_lj_innersq, denom_lj, epsilon, sigma,
                               mix_arithmetic);
      if (!init_ok)
        return false;
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
                          host_lj4, offset, special_lj, inum, nall, 300,
                          maxspecial, cell_size, gpu_split, screen,
                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
                          qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
                          sigma, mix_arithmetic);

    CRMLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;

  if (init_ok==0)
    CRMLMF.estimate_gpu_overhead();
  return init_ok;
}

void crml_gpu_clear() {
  CRMLMF.clear();
}

int * crml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** crml_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *boxlo, double *boxhi, int *tag, int **nspecial,
                         double *sublo, double *subhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         const double cpu_time, bool &success, double *host_q) {
  return CRMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, cpu_time, success, host_q);
                         int **ilist, int **jnum, const double cpu_time,
                         bool &success, double *host_q, double *boxlo,
                         double *prd) {
  return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success,
                        host_q, boxlo, prd);
}

void crml_gpu_compute(const int timestep, const int ago, const int inum_full,
                      const int nall, double **host_x, int *host_type,
                      int *ilist, int *numj, int **firstneigh,
                      const bool eflag, const bool vflag, const bool eatom,
                      const bool vatom, int &host_start, const double cpu_time,
                      bool &success, double *host_q) {
  CRMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                 host_q);
void crml_gpu_compute(const int ago, const int inum_full,
                      const int nall, double **host_x, int *host_type,
                      int *ilist, int *numj, int **firstneigh,
                      const bool eflag, const bool vflag, const bool eatom,
                      const bool vatom, int &host_start, const double cpu_time,
                      bool &success, double *host_q, const int nlocal,
                      double *boxlo, double *prd) {
  CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
                 eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
                 nlocal,boxlo,prd);
}

double crml_gpu_bytes() {
@@ -54,7 +54,7 @@

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;

@@ -90,6 +90,7 @@ __inline float fetch_q(const int& i, const float *q)

#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_BIO_PAIR 64

#endif
@@ -98,18 +99,22 @@ __inline float fetch_q(const int& i, const float *q)
__inline int sbmask(int j) { return j >> SBBITS & 3; }

__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          const int lj_types, __global numtyp *sp_lj_in,
                          __global int *dev_nbor, __global int *dev_packed,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nall, const int nbor_pitch,
                          __global numtyp *q_, const numtyp cut_coulsq,
                          const numtyp qqrd2e, const numtyp g_ewald,
                          const numtyp denom_lj, const numtyp cut_bothsq,
                          const numtyp cut_ljsq, const numtyp cut_lj_innersq) {
                          const numtyp cut_ljsq, const numtyp cut_lj_innersq,
                          const int t_per_atom) {

  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;
  __local numtyp sp_lj[8];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];

@@ -120,29 +125,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

  if (ii<inum) {
    acctyp energy=(acctyp)0;
    acctyp e_coul=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;
  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int itype=ix.w;

    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      numtyp factor_lj, factor_coul;
@@ -219,8 +236,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
      }

    } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_BIO_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;
    red_acc[4][tid]=e_coul;

    // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<5; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];
    e_coul=red_acc[4][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
@ -240,50 +298,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch,
                               __global numtyp *q_, const numtyp cut_coulsq,
                               const numtyp qqrd2e, const numtyp g_ewald,
                               const numtyp denom_lj, const numtyp cut_bothsq,
                               const numtyp cut_ljsq,
                               const numtyp cut_lj_innersq) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
                               __global int *dev_packed, __global acctyp4 *ans,
                               __global acctyp *engv, const int eflag,
                               const int vflag, const int inum, const int nall,
                               const int nbor_pitch, __global numtyp *q_,
                               const numtyp cut_coulsq, const numtyp qqrd2e,
                               const numtyp g_ewald, const numtyp denom_lj,
                               const numtyp cut_bothsq, const numtyp cut_ljsq,
                               const numtyp cut_lj_innersq,
                               const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
  __local numtyp sp_lj[8];
  if (ii<8)
    sp_lj[ii]=sp_lj_in[ii];
  ljd[ii]=ljd_in[ii];
  ljd[ii+64]=ljd_in[ii+64];

  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  ljd[tid]=ljd_in[tid];
  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __syncthreads();

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp e_coul=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int itype=ix.w;

    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      numtyp factor_lj, factor_coul;

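The new indexing splits each block among atoms: tid/t_per_atom picks the atom ii within the block, tid%t_per_atom is the thread's slot in that atom's team, and the team then walks the neighbor list with stride n_stride (nbor_pitch-strided for the unpacked list, t_per_atom for the packed one). A small C++ sketch of the mapping, with hypothetical block dimensions:

// Sketch of the thread-per-atom mapping used by the kernels above.
// Each block of BLOCK_SIZE_X threads covers BLOCK_SIZE_X/t_per_atom atoms;
// a team of t_per_atom threads splits one atom's neighbor list between them.
#include <cstdio>

int main() {
  const int BLOCK_SIZE_X = 8, t_per_atom = 4, block_id = 2;  // hypothetical
  for (int tid = 0; tid < BLOCK_SIZE_X; tid++) {
    int ii = block_id * (BLOCK_SIZE_X / t_per_atom) + tid / t_per_atom;
    int offset = tid % t_per_atom;  // first neighbor slot; loop strides by t_per_atom
    printf("tid %d -> atom ii=%d, first neighbor offset=%d\n", tid, ii, offset);
  }
  return 0;
}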
@ -366,8 +439,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
      }

    } // for nbor
  } // if ii

  // Store answers
  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_BIO_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;
    red_acc[4][tid]=e_coul;

    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<5; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];
    e_coul=red_acc[4][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
@ -43,7 +43,7 @@ int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}

template <class numtyp, class acctyp>
bool CRML_GPU_MemoryT::init(const int ntypes,
int CRML_GPU_MemoryT::init(const int ntypes,
                            double host_cut_bothsq, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,

@ -56,20 +56,24 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
                            const double g_ewald, const double cut_lj_innersq,
                            const double denom_lj, double **epsilon,
                            double **sigma, const bool mix_arithmetic) {
  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                    _screen,crml_gpu_kernel);
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,crml_gpu_kernel);
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (this->_block_size>=64 && mix_arithmetic)
  if (this->_block_bio_size>=64 && mix_arithmetic)
    shared_types=true;
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  int h_size=lj_types*lj_types;
  if (h_size<MAX_BIO_SHARED_TYPES)
    h_size=MAX_BIO_SHARED_TYPES;
  int max_bio_shared_types=this->device->max_bio_shared_types();
  if (h_size<max_bio_shared_types)
    h_size=max_bio_shared_types;
  UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
                               UCL_WRITE_OPTIMIZED);
  for (int i=0; i<h_size*32; i++)

@ -79,7 +83,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                         host_lj3,host_lj4);

  ljd.alloc(MAX_BIO_SHARED_TYPES,*(this->ucl_device),UCL_READ_ONLY);
  ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);

  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);

@ -99,7 +103,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
  return true;
  return 0;
}

template <class numtyp, class acctyp>

@ -125,7 +129,7 @@ double CRML_GPU_MemoryT::host_memory_usage() const {
template <class numtyp, class acctyp>
void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  const int BX=this->_block_bio_size;
  int eflag, vflag;
  if (_eflag)
    eflag=1;

@ -137,9 +141,10 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->atom->inum();
  int ainum=this->ans->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();

@ -147,21 +152,24 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
                          &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch,
                          &this->atom->dev_q.begin(), &_cut_coulsq,
                          &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
                          &_cut_ljsq, &_cut_lj_innersq);
                          &_cut_ljsq, &_cut_lj_innersq,
                          &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                     &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
                     &this->_threads_per_atom);
  }
  this->time_pair.stop();
}
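With _threads_per_atom threads cooperating per atom, each block of BX threads now covers only BX/_threads_per_atom atoms, so the grid dimension GX scales up by the same factor. A quick standalone C++ sketch of the arithmetic (all values hypothetical):

// Sketch: grid size when t_per_atom threads cooperate on each atom.
#include <cmath>
#include <cstdio>

int main() {
  const int inum = 1000;     // atoms assigned to the GPU
  const int BX = 64;         // threads per block
  const int t_per_atom = 4;  // threads per atom

  // Each block covers BX/t_per_atom atoms, so the grid grows by t_per_atom
  // compared to one thread per atom.
  int GX = static_cast<int>(ceil(static_cast<double>(inum) /
                                 (BX / t_per_atom)));
  printf("GX=%d blocks (vs %d with one thread per atom)\n",
         GX, static_cast<int>(ceil(static_cast<double>(inum) / BX)));
  return 0;
}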
@ -29,17 +29,24 @@ class CRML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double host_cut_bothsq,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const double qqrd2e, const double g_ewald,
            const double cut_lj_innersq, const double denom_lj,
            double **epsilon, double **sigma, const bool mix_arithmetic);
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * - 0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
  int init(const int ntypes, double host_cut_bothsq,
           double **host_lj1, double **host_lj2, double **host_lj3,
           double **host_lj4, double **host_offset, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, double host_cut_ljsq,
           const double host_cut_coulsq, double *host_special_coul,
           const double qqrd2e, const double g_ewald,
           const double cut_lj_innersq, const double denom_lj,
           double **epsilon, double **sigma, const bool mix_arithmetic);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
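Since init() now returns an int status rather than a bool, callers can report the specific failure. A hedged sketch of mapping the documented codes to messages; the report_init() helper is hypothetical, not part of the library:

// Sketch: turning the documented init() return codes into diagnostics.
// The case labels mirror the doc comment above; report_init() is hypothetical.
#include <cstdio>

void report_init(int code) {
  switch (code) {
    case  0: break;  // success, nothing to report
    case -1: fprintf(stderr, "fix gpu not found\n"); break;
    case -3: fprintf(stderr, "out of memory on the accelerator\n"); break;
    case -4: fprintf(stderr, "GPU library was not compiled for GPU\n"); break;
    case -5: fprintf(stderr, "double precision not supported on card\n"); break;
    default: fprintf(stderr, "unknown init error %d\n", code); break;
  }
}

int main() { report_init(-3); return 0; }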
@ -49,14 +49,14 @@ void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool gb_gpu_init(const int ntypes, const double gamma,
                 const double upsilon, const double mu, double **shape,
                 double **well, double **cutsq, double **sigma,
                 double **epsilon, double *host_lshape, int **form,
                 double **host_lj1, double **host_lj2, double **host_lj3,
                 double **host_lj4, double **offset, double *special_lj,
                 const int inum, const int nall, const int max_nbors,
                 const double cell_size, int &gpu_mode, FILE *screen) {
int gb_gpu_init(const int ntypes, const double gamma,
                const double upsilon, const double mu, double **shape,
                double **well, double **cutsq, double **sigma,
                double **epsilon, double *host_lshape, int **form,
                double **host_lj1, double **host_lj2, double **host_lj3,
                double **host_lj4, double **offset, double *special_lj,
                const int inum, const int nall, const int max_nbors,
                const double cell_size, int &gpu_mode, FILE *screen) {
  GBMF.clear();
  gpu_mode=GBMF.device->gpu_mode();
  double gpu_split=GBMF.device->particle_split();

@ -77,14 +77,12 @@ bool gb_gpu_init(const int ntypes, const double gamma,
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
                           sigma, epsilon, host_lshape, form, host_lj1,
                           host_lj2, host_lj3, host_lj4, offset, special_lj,
                           inum, nall, max_nbors, cell_size, gpu_split, screen);
    if (!init_ok)
      return false;
  }
  int init_ok=0;
  if (world_me==0)
    init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
                      sigma, epsilon, host_lshape, form, host_lj1,
                      host_lj2, host_lj3, host_lj4, offset, special_lj,
                      inum, nall, max_nbors, cell_size, gpu_split, screen);

  GBMF.device->world_barrier();
  if (message)

@ -99,22 +97,22 @@ bool gb_gpu_init(const int ntypes, const double gamma,
                last_gpu,i);
        fflush(screen);
      }
      if (gpu_rank==i && world_me!=0) {
        bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
                               sigma, epsilon, host_lshape, form, host_lj1,
                               host_lj2, host_lj3, host_lj4, offset, special_lj,
                               inum, nall, max_nbors, cell_size, gpu_split,
                               screen);
        if (!init_ok)
          return false;
      }
      if (gpu_rank==i && world_me!=0)
        init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
                          epsilon, host_lshape, form, host_lj1, host_lj2,
                          host_lj3, host_lj4, offset, special_lj, inum, nall,
                          max_nbors, cell_size, gpu_split, screen);

      GBMF.device->gpu_barrier();
      if (message)
        fprintf(screen,"Done.\n");
    }
  if (message)
    fprintf(screen,"\n");
  return true;

  if (init_ok==0)
    GBMF.estimate_gpu_overhead();
  return init_ok;
}

// ---------------------------------------------------------------------------
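gb_gpu_init staggers device setup: the first world rank initializes (and so compiles and caches the kernels) before a world barrier releases the remaining ranks, which then initialize between per-GPU barriers. A minimal MPI sketch of that ordering under stated assumptions; init_resources() is a stand-in for GBMF.init:

// Sketch: rank-0-first initialization with barriers, as in gb_gpu_init above.
#include <mpi.h>
#include <cstdio>

int init_resources(int rank) {  // hypothetical stand-in for GBMF.init(...)
  printf("rank %d initializing\n", rank);
  return 0;                     // 0 on success, matching the new convention
}

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  int init_ok = 0;
  if (rank == 0)                 // first rank builds/caches the kernels
    init_ok = init_resources(rank);
  MPI_Barrier(MPI_COMM_WORLD);   // world_barrier() in the real code
  if (rank != 0)                 // remaining ranks follow
    init_ok = init_resources(rank);
  MPI_Barrier(MPI_COMM_WORLD);   // gpu_barrier() in the real code

  MPI_Finalize();
  return init_ok;
}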
@ -131,8 +129,8 @@ template <class gbmtyp>
inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
                                    const int host_inum, const int nall,
                                    double **host_x, double **host_quat,
                                    int *host_type, double *boxlo,
                                    double *boxhi, bool &success) {
                                    int *host_type, double *sublo,
                                    double *subhi, bool &success) {
  gbm.nbor_time_avail=true;

  success=true;

@ -144,7 +142,7 @@ inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
  gbm.atom->cast_copy_x(host_x,host_type);
  int mn;
  gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
                            boxlo, boxhi, NULL, NULL, NULL, success, mn);
                            sublo, subhi, NULL, NULL, NULL, success, mn);
  gbm.nbor->copy_unpacked(inum,mn);
  gbm.last_ellipse=inum;
  gbm.max_last_ellipse=inum;

@ -163,7 +161,7 @@ void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,

  gbm.nbor_time_avail=true;

  int mn=gbm.nbor->max_nbor_loop(inum,numj);
  int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist);
  gbm.resize_atom(inum,nall,success);
  gbm.resize_local(inum,0,mn,osize,success);
  if (!success)

@ -216,9 +214,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
  int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/
                               (BX/gbm._threads_per_atom)));
  int stride=gbm.nbor->nbor_pitch();
  int ainum=gbm.atom->inum();
  int ainum=gbm.ans->inum();
  int anall=gbm.atom->nall();

  if (gbm.multiple_forms) {

@ -226,7 +225,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
    if (gbm.last_ellipse>0) {
      // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
      GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
                               static_cast<double>(BX)));
                               (BX/gbm._threads_per_atom)));
      gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
                        ELLIPSE_ELLIPSE);
      gbm.time_kernel.stop();

@ -237,11 +236,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
        &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
        &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
        &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
        &stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
        &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
        &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
        &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall,
        &gbm._threads_per_atom);
      gbm.time_gayberne.stop();

      if (gbm.last_ellipse==gbm.atom->inum()) {
      if (gbm.last_ellipse==gbm.ans->inum()) {
        gbm.time_kernel2.start();
        gbm.time_kernel2.stop();
        gbm.time_gayberne2.start();

@ -254,9 +254,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
      // ------------ SPHERE_ELLIPSE ---------------

      gbm.time_kernel2.start();
      GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
                               gbm.last_ellipse)/BX));
      gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
      GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
                               gbm.last_ellipse)/
                               (BX/gbm._threads_per_atom)));
      gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
                        SPHERE_ELLIPSE,SPHERE_ELLIPSE);
      gbm.time_kernel2.stop();

@ -266,13 +267,14 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
        &gbm.shape.begin(), &gbm.well.begin(),
        &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
        &gbm._lj_types, &gbm.lshape.begin(),
        &gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
        &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
        &vflag, &gbm.last_ellipse, &ainum, &anall);
        &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
        &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
        &vflag, &gbm.last_ellipse, &ainum, &anall,
        &gbm._threads_per_atom);
      gbm.time_gayberne2.stop();
    } else {
      gbm.atom->dev_ans.zero();
      gbm.atom->dev_engv.zero();
      gbm.ans->dev_ans.zero();
      gbm.ans->dev_engv.zero();
      gbm.time_kernel.stop();
      gbm.time_gayberne.start();
      gbm.time_gayberne.stop();

@ -284,29 +286,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {

    // ------------ LJ ---------------
    gbm.time_pair.start();
    if (gbm.last_ellipse<gbm.atom->inum()) {
    if (gbm.last_ellipse<gbm.ans->inum()) {
      if (gbm.shared_types) {
        GBMF.k_lj_fast.set_size(GX,BX);
        GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
                           &gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
                           &stride, &gbm.nbor->dev_packed.begin(),
                           &gbm.atom->dev_ans.begin(),
                           &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
                           &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
                           &gbm.ans->dev_ans.begin(),
                           &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
                           &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
                           &gbm._threads_per_atom);
      } else {
        GBMF.k_lj.set_size(GX,BX);
        GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
                      &gbm.lj3.begin(), &gbm._lj_types,
                      &gbm.gamma_upsilon_mu.begin(), &stride,
                      &gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
                      &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
                      &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
                      &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
                      &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
                      &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
                      &gbm._threads_per_atom);
      }
    }
    gbm.time_pair.stop();
  } else {
    gbm.time_kernel.start();
    gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
    gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
                      ELLIPSE_ELLIPSE);
    gbm.time_kernel.stop();
    gbm.time_gayberne.start();

@ -315,9 +319,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
      &gbm.shape.begin(), &gbm.well.begin(),
      &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
      &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
      &stride, &gbm.atom->dev_ans.begin(), &ainum,
      &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
      &eflag, &vflag, &ainum, &anall);
      &stride, &gbm.ans->dev_ans.begin(), &ainum,
      &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
      &eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom);
    gbm.time_gayberne.stop();
  }
}

@ -326,30 +330,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// Reneighbor on GPU if necessary and then compute forces, torques, energies
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
                               const int inum_full, const int nall,
                               double **host_x, int *host_type,
                               double *boxlo, double *boxhi, const bool eflag,
                               const bool vflag, const bool eatom,
inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
                               const int inum_full, const int nall,
                               double **host_x, int *host_type,
                               double *sublo, double *subhi, const bool eflag,
                               const bool vflag, const bool eatom,
                               const bool vatom, int &host_start,
                               const double cpu_time, bool &success,
                               double **host_quat) {
                               int **ilist, int **jnum, const double cpu_time,
                               bool &success, double **host_quat) {
  gbm.acc_timers();
  if (inum_full==0) {
    host_start=0;
    gbm.zero_timers();
    return NULL;
  }

  gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
  int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full);
  gbm.atom->inum(inum);
  gbm.hd_balancer.balance(cpu_time);
  int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
  gbm.ans->inum(inum);
  gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
  host_start=inum;

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    _gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
                            host_quat, host_type, boxlo, boxhi, success);
                            host_quat, host_type, sublo, subhi, success);
    if (!success)
      return NULL;
    gbm.atom->cast_quat_data(host_quat[0]);

@ -361,47 +366,49 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
    gbm.atom->add_x_data(host_x,host_type);
  }

  gbm.atom->add_other_data();
  gbm.atom->add_quat_data();
  *ilist=gbm.nbor->host_ilist.begin();
  *jnum=gbm.nbor->host_acc.begin();

  _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
  gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
  gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
  gbm.device->add_ans_object(gbm.ans);
  gbm.hd_balancer.stop_timer();
  return gbm.device->nbor.host_nbor.begin();
  return gbm.nbor->host_jlist.begin()-host_start;
}

int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full,
                       const int nall, double **host_x, int *host_type,
                       double *boxlo, double *boxhi, const bool eflag,
                       const bool vflag, const bool eatom, const bool vatom,
                       int &host_start, const double cpu_time, bool &success,
                       double **host_quat) {
  return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x,
                           host_type, boxlo, boxhi, eflag, vflag, eatom, vatom,
                           host_start, cpu_time, success, host_quat);
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, double *sublo,
                       double *subhi, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
                       int **ilist, int **jnum, const double cpu_time,
                       bool &success, double **host_quat) {
  return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo,
                           subhi, eflag, vflag, eatom, vatom, host_start, ilist,
                           jnum, cpu_time, success, host_quat);
}

// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, torques,..
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
                             const int inum_full,const int nall,double **host_x,
                             int *host_type, int *ilist, int *numj,
                             int **firstneigh, const bool eflag,
                             const bool vflag, const bool eatom,
                             const bool vatom, int &host_start,
                             const double cpu_time, bool &success,
                             double **host_quat) {
inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
                             const int nall,double **host_x, int *host_type,
                             int *ilist, int *numj, int **firstneigh,
                             const bool eflag, const bool vflag,
                             const bool eatom, const bool vatom,
                             int &host_start, const double cpu_time,
                             bool &success, double **host_quat) {
  gbm.acc_timers();
  if (inum_full==0) {
    host_start=0;
    gbm.zero_timers();
    return NULL;
  }

  int ago=gbm.hd_balancer.ago_first(f_ago);
  int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time,
                                   gbm.nbor->gpu_nbor());
  gbm.atom->inum(inum);
  int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time);
  gbm.ans->inum(inum);
  gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
  host_start=inum;

@ -421,21 +428,21 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
  gbm.atom->cast_quat_data(host_quat[0]);
  gbm.hd_balancer.start_timer();
  gbm.atom->add_x_data(host_x,host_type);
  gbm.atom->add_other_data();
  gbm.atom->add_quat_data();

  _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
  gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
  gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
  gbm.device->add_ans_object(gbm.ans);
  gbm.hd_balancer.stop_timer();
  return list;
}

int * gb_gpu_compute(const int timestep, const int ago, const int inum_full,
                     const int nall, double **host_x, int *host_type,
                     int *ilist, int *numj, int **firstneigh,
                     const bool eflag, const bool vflag, const bool eatom,
                     const bool vatom, int &host_start, const double cpu_time,
                     bool &success, double **host_quat) {
  return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x,
int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
                     const double cpu_time, bool &success, double **host_quat) {
  return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x,
                         host_type, ilist, numj, firstneigh, eflag, vflag,
                         eatom, vatom, host_start, cpu_time, success,
                         host_quat);
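The split between the two entry points mirrors LAMMPS's neighbor cycle: on a reneighbor step (ago==0) gb_gpu_compute_n builds the list on the device and hands back the ilist/jnum pointers, while gb_gpu_compute reuses a host-built list on the other steps. A hedged host-side sketch of that dispatch, with stub functions standing in for the library calls:

// Sketch: choosing between the two compute entry points by neighbor-list age.
// The *_stub functions are hypothetical stand-ins for the calls above.
#include <cstdio>

int** gb_compute_n_stub() { printf("rebuild nbor list on GPU\n"); return nullptr; }
int*  gb_compute_stub()   { printf("reuse host nbor list\n");     return nullptr; }

void step(int ago) {
  if (ago == 0)        // reneighbor step: the device builds the list
    gb_compute_n_stub();
  else                 // otherwise copy/reuse the host list
    gb_compute_stub();
}

int main() {
  for (int ts = 0; ts < 4; ts++)
    step(ts % 2);      // hypothetical reneighboring every other step
  return 0;
}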
@ -18,7 +18,6 @@
#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H

#define MAX_SHARED_TYPES 8
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

#ifdef _DOUBLE_DOUBLE

@ -47,7 +46,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"

#else

@ -58,6 +57,8 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8

#endif

@ -97,17 +97,17 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
                              __global acctyp4 *ans, const int astride,
                              __global acctyp *engv, __global int *err_flag,
                              const int eflag, const int vflag, const int inum,
                              const int nall) {
                              const int nall, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
  if (ii<4)
    sp_lj[ii]=gum[ii+3];
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
  __syncthreads();

  if (ii<inum) {
  sp_lj[0]=gum[3];
  sp_lj[1]=gum[4];
  sp_lj[2]=gum[5];
  sp_lj[3]=gum[6];

  acctyp energy=(acctyp)0;
  acctyp4 f;

@ -121,262 +121,309 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=stride;
    int numj=*nbor;
    nbor+=stride;
    __global int *nbor_end=nbor+mul24(stride,numj);
    nbor+=mul24(offset,stride);
    int n_stride=mul24(t_per_atom,stride);

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=stride;
    int numj=*nbor;
    nbor+=stride;
    __global int *nbor_end=nbor+mul24(stride,numj);

    numtyp4 ix=x_[i];
    int itype=ix.w;
    numtyp a1[9], b1[9], g1[9];
    numtyp4 ishape=shape[itype];
    {
      numtyp t[9];
      gpu_quat_to_mat_trans(q,i,a1);
      gpu_times3(ishape,a1,t);
      gpu_transpose_times3(a1,t,g1);
      gpu_times3(well[itype],a1,t);
      gpu_transpose_times3(a1,t,b1);
    }

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx=x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp r12[3];
      r12[0] = jx.x-ix.x;
      r12[1] = jx.y-ix.y;
      r12[2] = jx.z-ix.z;
      numtyp ir = gpu_dot3(r12,r12);

      ir = rsqrt(ir);
      numtyp r = (numtyp)1.0/ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(q,j,a2);

      numtyp u_r, dUr[3], tUr[3], eta, teta[3];
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
    numtyp4 ix=x_[i];
    int itype=ix.w;
    numtyp a1[9], b1[9], g1[9];
    numtyp4 ishape=shape[itype];
    {
        numtyp g2[9];
        {
          gpu_times3(shape[jtype],a2,g12);
          gpu_transpose_times3(a2,g12,g2);
          gpu_plus3(g1,g2,g12);
      numtyp t[9];
      gpu_quat_to_mat_trans(q,i,a1);
      gpu_times3(ishape,a1,t);
      gpu_transpose_times3(a1,t,g1);
      gpu_times3(well[itype],a1,t);
      gpu_transpose_times3(a1,t,b1);
    }

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx=x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp r12[3];
      r12[0] = jx.x-ix.x;
      r12[1] = jx.y-ix.y;
      r12[2] = jx.z-ix.z;
      numtyp ir = gpu_dot3(r12,r12);

      ir = rsqrt(ir);
      numtyp r = (numtyp)1.0/ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(q,j,a2);

      numtyp u_r, dUr[3], tUr[3], eta, teta[3];
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          numtyp g2[9];
          {
            gpu_times3(shape[jtype],a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            gpu_plus3(g1,g2,g12);
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- replace r12 with r12 hat
            r12[0]*=ir;
            r12[1]*=ir;
            r12[2]*=ir;

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              int mtype=mul24(ntypes,itype)+jtype;
              numtyp sigma = sig_eps[mtype].x;
              numtyp epsilon = sig_eps[mtype].y;
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
              dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
              dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
            }

            // torque for particle 1
            {
              numtyp tempv[3], tempv2[3];
              tempv[0] = -uslj_rsq*kappa[0];
              tempv[1] = -uslj_rsq*kappa[1];
              tempv[2] = -uslj_rsq*kappa[2];
              gpu_row_times3(kappa,g1,tempv2);
              gpu_cross3(tempv,tempv2,tUr);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
        }

        // Compute teta
        numtyp temp[9], tempv[3], tempv2[3];
        compute_eta_torque(g12,a1,ishape,temp);
        numtyp temp1 = -eta*gum[1];

        tempv[0] = temp1*temp[0];
        tempv[1] = temp1*temp[1];
        tempv[2] = temp1*temp[2];
        gpu_cross3(a1,tempv,tempv2);
        teta[0] = tempv2[0];
        teta[1] = tempv2[1];
        teta[2] = tempv2[2];

        tempv[0] = temp1*temp[3];
        tempv[1] = temp1*temp[4];
        tempv[2] = temp1*temp[5];
        gpu_cross3(a1+3,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];

        tempv[0] = temp1*temp[6];
        tempv[1] = temp1*temp[7];
        tempv[2] = temp1*temp[8];
        gpu_cross3(a1+6,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];
      }

      { // Compute U_r and dUr

        // Compute kappa
        numtyp kappa[3];
        gpu_mldivide3(g12,r12,kappa,err_flag);
      numtyp chi, dchi[3], tchi[3];
      { // Compute chi and dchi

        // -- replace r12 with r12 hat
        // Compute b12
        numtyp b2[9], b12[9];
        {
          gpu_times3(well[jtype],a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          gpu_plus3(b1,b2,b12);
        }

        // compute chi_12
        r12[0]*=r;
        r12[1]*=r;
        r12[2]*=r;
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        r12[0]*=ir;
        r12[1]*=ir;
        r12[2]*=ir;
        chi = gpu_dot3(r12,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);

        // -- kappa is now / r
        kappa[0]*=ir;
        kappa[1]*=ir;
        kappa[2]*=ir;
        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
                                                     gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12[2]);

        // compute t_chi
        numtyp tempv[3];
        gpu_row_times3(iota,b1,tempv);
        gpu_cross3(tempv,iota,tchi);
        temp1 = (numtyp)-4.0*ir*ir;
        tchi[0] *= temp1;
        tchi[1] *= temp1;
        tchi[2] *= temp1;
      }

      numtyp temp2 = factor_lj*eta*chi;
      if (eflag>0)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag>0) {
        r12[0]*=-r;
        r12[1]*=-r;
        r12[2]*=-r;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        f.x+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        f.y+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        f.z+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        f.x+=temp1*dchi[0]-temp2*dUr[0];
        f.y+=temp1*dchi[1]-temp2*dUr[1];
        f.z+=temp1*dchi[2]-temp2*dUr[2];
      }

      // Torque on 1
      temp1 = -u_r*eta*factor_lj;
      temp2 = -u_r*chi*factor_lj;
      numtyp temp3 = -chi*eta*factor_lj;
      tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
      tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
      tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];

    } // for nbor
  } // if ii

    // energy

    // compute u_r and dUr
    numtyp uslj_rsq;
    {
      // Compute distance of closest approach
      numtyp h12, sigma12;
      sigma12 = gpu_dot3(r12,kappa);
      sigma12 = rsqrt((numtyp)0.5*sigma12);
      h12 = r-sigma12;
  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[7][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=tor.x;
    red_acc[4][tid]=tor.y;
    red_acc[5][tid]=tor.z;

      // -- kappa is now ok
      kappa[0]*=r;
      kappa[1]*=r;
      kappa[2]*=r;

      int mtype=mul24(ntypes,itype)+jtype;
      numtyp sigma = sig_eps[mtype].x;
      numtyp epsilon = sig_eps[mtype].y;
      numtyp varrho = sigma/(h12+gum[0]*sigma);
      numtyp varrho6 = varrho*varrho*varrho;
      varrho6*=varrho6;
      numtyp varrho12 = varrho6*varrho6;
      u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

      numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
      temp1 = temp1*(numtyp)24.0*epsilon;
      uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
      numtyp temp2 = gpu_dot3(kappa,r12);
      uslj_rsq = uslj_rsq*ir*ir;

      dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
      dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
      dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
    }

    // torque for particle 1
    {
      numtyp tempv[3], tempv2[3];
      tempv[0] = -uslj_rsq*kappa[0];
      tempv[1] = -uslj_rsq*kappa[1];
      tempv[2] = -uslj_rsq*kappa[2];
      gpu_row_times3(kappa,g1,tempv2);
      gpu_cross3(tempv,tempv2,tUr);
    }
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<6; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    // Compute eta
    {
      eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
      numtyp det_g12 = gpu_det3(g12);
      eta = pow(eta/det_g12,gum[1]);
    }

    // Compute teta
    numtyp temp[9], tempv[3], tempv2[3];
    compute_eta_torque(g12,a1,ishape,temp);
    numtyp temp1 = -eta*gum[1];
    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    tor.x=red_acc[3][tid];
    tor.y=red_acc[4][tid];
    tor.z=red_acc[5][tid];

    tempv[0] = temp1*temp[0];
    tempv[1] = temp1*temp[1];
    tempv[2] = temp1*temp[2];
    gpu_cross3(a1,tempv,tempv2);
    teta[0] = tempv2[0];
    teta[1] = tempv2[1];
    teta[2] = tempv2[2];

    tempv[0] = temp1*temp[3];
    tempv[1] = temp1*temp[4];
    tempv[2] = temp1*temp[5];
    gpu_cross3(a1+3,tempv,tempv2);
    teta[0] += tempv2[0];
    teta[1] += tempv2[1];
    teta[2] += tempv2[2];
    if (eflag>0 || vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];
      red_acc[6][tid]=energy;

    tempv[0] = temp1*temp[6];
    tempv[1] = temp1*temp[7];
    tempv[2] = temp1*temp[8];
    gpu_cross3(a1+6,tempv,tempv2);
    teta[0] += tempv2[0];
    teta[1] += tempv2[1];
    teta[2] += tempv2[2];
  }

  numtyp chi, dchi[3], tchi[3];
  { // Compute chi and dchi

    // Compute b12
    numtyp b2[9], b12[9];
    {
      gpu_times3(well[jtype],a2,b12);
      gpu_transpose_times3(a2,b12,b2);
      gpu_plus3(b1,b2,b12);
      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<7; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
      energy=red_acc[6][tid];
    }

    // compute chi_12
    r12[0]*=r;
    r12[1]*=r;
    r12[2]*=r;
    numtyp iota[3];
    gpu_mldivide3(b12,r12,iota,err_flag);
    // -- iota is now iota/r
    iota[0]*=ir;
    iota[1]*=ir;
    iota[2]*=ir;
    r12[0]*=ir;
    r12[1]*=ir;
    r12[2]*=ir;
    chi = gpu_dot3(r12,iota);
    chi = pow(chi*(numtyp)2.0,gum[2]);

    // -- iota is now ok
    iota[0]*=r;
    iota[1]*=r;
    iota[2]*=r;

    numtyp temp1 = gpu_dot3(iota,r12);
    numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
                                                 gum[2]);
    dchi[0] = temp2*(iota[0]-temp1*r12[0]);
    dchi[1] = temp2*(iota[1]-temp1*r12[1]);
    dchi[2] = temp2*(iota[2]-temp1*r12[2]);

    // compute t_chi
    numtyp tempv[3];
    gpu_row_times3(iota,b1,tempv);
    gpu_cross3(tempv,iota,tchi);
    temp1 = (numtyp)-4.0*ir*ir;
    tchi[0] *= temp1;
    tchi[1] *= temp1;
    tchi[2] *= temp1;
  }

  numtyp temp2 = factor_lj*eta*chi;
  if (eflag>0)
    energy+=u_r*temp2;
  numtyp temp1 = -eta*u_r*factor_lj;
  if (vflag>0) {
    r12[0]*=-r;
    r12[1]*=-r;
    r12[2]*=-r;
    numtyp ft=temp1*dchi[0]-temp2*dUr[0];
    f.x+=ft;
    virial[0]+=r12[0]*ft;
    ft=temp1*dchi[1]-temp2*dUr[1];
    f.y+=ft;
    virial[1]+=r12[1]*ft;
    virial[3]+=r12[0]*ft;
    ft=temp1*dchi[2]-temp2*dUr[2];
    f.z+=ft;
    virial[2]+=r12[2]*ft;
    virial[4]+=r12[0]*ft;
    virial[5]+=r12[1]*ft;
  } else {
    f.x+=temp1*dchi[0]-temp2*dUr[0];
    f.y+=temp1*dchi[1]-temp2*dUr[1];
    f.z+=temp1*dchi[2]-temp2*dUr[2];
  }

  // Torque on 1
  temp1 = -u_r*eta*factor_lj;
  temp2 = -u_r*chi*factor_lj;
  numtyp temp3 = -chi*eta*factor_lj;
  tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
  tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
  tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];

  } // for nbor

  // Store answers
  __global acctyp *ap1=engv+ii;
  if (eflag>0) {
    *ap1=energy;
    ap1+=astride;
  }
  if (vflag>0) {
    for (int i=0; i<6; i++) {
      *ap1=virial[i];
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=astride;
    }
  }
  ans[ii]=f;
  ans[ii+astride]=tor;
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=astride;
      }
    }
    ans[ii]=f;
    ans[ii+astride]=tor;
  } // if ii
}

@ -34,33 +34,36 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
                               __global acctyp4 *ans, __global acctyp *engv,
                               __global int *err_flag, const int eflag,
                               const int vflag,const int start, const int inum,
                               const int nall) {
  __local numtyp sp_lj[4];
                               const int nall, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom+start;
  int offset=tid%t_per_atom;

  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
  if (ii<4)
    sp_lj[ii]=gum[ii+3];
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
  __syncthreads();
  __local numtyp sp_lj[4];
  sp_lj[0]=gum[3];
  sp_lj[1]=gum[4];
  sp_lj[2]=gum[5];
  sp_lj[3]=gum[6];

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=stride;
    int numj=*nbor;
    nbor+=stride;
    __global int *nbor_end=nbor+stride*numj;
    nbor+=mul24(offset,stride);
    int n_stride=mul24(t_per_atom,stride);

    numtyp4 ix=x_[i];
    int itype=ix.w;

@ -69,7 +72,7 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
    numtyp one_well=well[itype].x;

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=stride) {
    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];

@ -241,8 +244,47 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
        f.z+=temp1*dchi[2]-temp2*dUr[2];
      }
    } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

  // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;

@ -265,39 +307,42 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
                        __global acctyp4 *ans, __global acctyp *engv,
                        __global int *err_flag, const int eflag,
                        const int vflag, const int start, const int inum,
                        const int nall) {
  __local numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
  if (ii<4)
    sp_lj[ii]=gum[ii+3];
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
  __syncthreads();
                        const int nall, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom+start;
  int offset=tid%t_per_atom;

  __local numtyp sp_lj[4];
  sp_lj[0]=gum[3];
  sp_lj[1]=gum[4];
  sp_lj[2]=gum[5];
  sp_lj[3]=gum[6];

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_ij+ii;
    int i=*nbor;
    nbor+=stride;
    int numj=*nbor;
    nbor+=stride;
    __global int *list_end=nbor+mul24(stride,numj);
    nbor+=mul24(offset,stride);
    int n_stride=mul24(t_per_atom,stride);

    numtyp4 ix=x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=stride) {
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];

@ -338,8 +383,47 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
      }

    } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

  // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1+=energy;

@ -361,50 +445,54 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,

__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                             __global numtyp4* lj3_in, __global numtyp *gum,
                             const int stride,
                             __global int *dev_ij, __global acctyp4 *ans,
                             __global acctyp *engv, __global int *err_flag,
                             const int eflag,const int vflag, const int start,
                             const int inum, const int nall) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
                             const int stride, __global int *dev_ij,
                             __global acctyp4 *ans, __global acctyp *engv,
                             __global int *err_flag, const int eflag,
                             const int vflag, const int start, const int inum,
                             const int nall, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom+start;
  int offset=tid%t_per_atom;

  __local numtyp sp_lj[4];
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<4)
    sp_lj[ii]=gum[ii+3];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
  if (tid<4)
    sp_lj[tid]=gum[tid+3];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    if (eflag>0)
      lj3[ii]=lj3_in[ii];
      lj3[tid]=lj3_in[tid];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __syncthreads();

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_ij+ii;
    int i=*nbor;
    nbor+=stride;
    int numj=*nbor;
    nbor+=stride;
    __global int *list_end=nbor+mul24(stride,numj);

    nbor+=mul24(offset,stride);
    int n_stride=mul24(t_per_atom,stride);

    numtyp4 ix=x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=stride) {
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];

@ -443,8 +531,47 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
      }

    } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

  // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1+=energy;
@ -18,8 +18,6 @@
#ifndef PAIR_GPU_KERNEL_H
#define PAIR_GPU_KERNEL_H

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2

@ -32,7 +30,7 @@

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"

#else

@ -42,6 +40,7 @@
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define MAX_SHARED_TYPES 8

#endif
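These defines are what keep a single kernel source portable: under NV_KERNEL the CUDA builtins come from geryon/ucl_nv_kernel.h, and otherwise the same macro names are bound to OpenCL work-item functions. A hedged sketch of the shim, assuming the usual CUDA-side definitions (threadIdx.x and friends):

/* Sketch: portability shim in the spirit of the headers above.  The CUDA
   branch is an assumption about what ucl_nv_kernel.h provides; the OpenCL
   branch mirrors the defines shown in the diff. */
#ifdef NV_KERNEL
#define THREAD_ID_X  threadIdx.x
#define BLOCK_ID_X   blockIdx.x
#define BLOCK_SIZE_X blockDim.x
#else
#define THREAD_ID_X  get_local_id(0)
#define BLOCK_ID_X   get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#endif

/* A kernel written against these names, e.g. "int tid=THREAD_ID_X;" followed
   by "__syncthreads();", builds unchanged for either backend. */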
@ -32,30 +32,35 @@ template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
                                  _max_bytes(0.0) {
  device=&pair_gpu_device;
  ans=new PairGPUAns<numtyp,acctyp>();
  nbor=new PairGPUNbor;
}

template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
  clear();
  delete ans;
  delete nbor;
}

template <class numtyp, class acctyp>
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
         nbor->bytes_per_atom(max_nbors);
}

template <class numtyp, class acctyp>
bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
                          const double upsilon, const double mu,
                          double **host_shape, double **host_well,
                          double **host_cutsq, double **host_sigma,
                          double **host_epsilon, double *host_lshape,
                          int **h_form, double **host_lj1, double **host_lj2,
                          double **host_lj3, double **host_lj4,
                          double **host_offset, const double *host_special_lj,
                          const int nlocal, const int nall,
                          const int max_nbors, const double cell_size,
                          const double gpu_split, FILE *_screen) {
int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
                         const double upsilon, const double mu,
                         double **host_shape, double **host_well,
                         double **host_cutsq, double **host_sigma,
                         double **host_epsilon, double *host_lshape,
                         int **h_form, double **host_lj1, double **host_lj2,
                         double **host_lj3, double **host_lj4,
                         double **host_offset, const double *host_special_lj,
                         const int nlocal, const int nall,
                         const int max_nbors, const double cell_size,
                         const double gpu_split, FILE *_screen) {
  nbor_time_avail=false;
  screen=_screen;

@ -64,24 +69,24 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
    gpu_nbor=true;

  int _gpu_host=0;
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
  if (host_nlocal>0)
    _gpu_host=1;

  if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host,
                    max_nbors,cell_size,true))
    return false;
  _threads_per_atom=device->threads_per_atom();
  int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
                           _gpu_host,max_nbors,cell_size,true);
  if (success!=0)
    return success;

  ucl_device=device->gpu;
  atom=&device->atom;
  nbor=&device->nbor;

  _block_size=BLOCK_1D;
  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
    _block_size=ucl_device->group_size();
  _block_size=device->pair_block_size();
  compile_kernels(*ucl_device);

  // Initialize host-device load balancer
  hd_balancer.init(device,gpu_split);
  hd_balancer.init(device,gpu_nbor,gpu_split);

  // Initialize timers for the selected GPU
  time_pair.init(*ucl_device);

@ -90,8 +95,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
  int max_shared_types=device->max_shared_types();
  if (lj_types<=max_shared_types && _block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

@ -186,12 +192,19 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
  }

  if (multiple_forms)
    atom->dev_ans.zero();
    ans->dev_ans.zero();

  _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
  _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

  // Memory for ilist ordered by particle type
  return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
  if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
    return 0;
  else return -3;
}

template <class numtyp, class acctyp>
void GB_GPU_MemoryT::estimate_gpu_overhead() {
  device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}

template <class numtyp, class acctyp>

@ -209,9 +222,9 @@ void GB_GPU_MemoryT::clear() {

  // Output any timing information
  acc_timers();
  double single[6], times[6];
  double single[9], times[9];

  single[0]=atom->transfer_time();
  single[0]=atom->transfer_time()+ans->transfer_time();
  single[1]=nbor->time_nbor.total_seconds();
  single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
            nbor->time_kernel.total_seconds();

@ -220,15 +233,18 @@ void GB_GPU_MemoryT::clear() {
    single[4]=time_pair.total_seconds();
  else
    single[4]=0;
  single[5]=atom->cast_time();
  single[5]=atom->cast_time()+ans->cast_time();
  single[6]=_gpu_overhead;
  single[7]=_driver_overhead;
  single[8]=ans->cpu_idle_time();

  MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
  MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
  double avg_split=hd_balancer.all_avg_split();

  _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
              sigma_epsilon.row_bytes()+cut_form.row_bytes()+
              shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
              gamma_upsilon_mu.row_bytes();
              gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
  double mpi_max_bytes;
  MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
             device->replica());
@ -255,10 +271,19 @@ void GB_GPU_MemoryT::clear() {
|
|||
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
|
||||
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
|
||||
}
|
||||
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
|
||||
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
||||
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
||||
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
|
||||
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
|
||||
fprintf(screen,"-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n\n");
|
||||
|
||||
|
||||
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
||||
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
||||
|
||||
|
||||
}
|
||||
_max_bytes=0.0;
|
||||
|
||||
|
@ -299,10 +324,9 @@ void GB_GPU_MemoryT::clear() {
|
|||
|
||||
template <class numtyp, class acctyp>
|
||||
double GB_GPU_MemoryT::host_memory_usage() const {
|
||||
return device->atom.host_memory_usage()+
|
||||
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
|
||||
sizeof(GB_GPU_Memory<numtyp,acctyp>)+
|
||||
device->nbor.max_atoms()*sizeof(int);
|
||||
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
||||
4*sizeof(numtyp)+sizeof(GB_GPU_Memory<numtyp,acctyp>)+
|
||||
nbor->max_atoms()*sizeof(int);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
|
|
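The timing summary in clear() above packs per-rank timers into a fixed-size array and sums it onto rank 0 before printing per-rank averages. A minimal standalone sketch of the same reduction pattern (hypothetical function and names; only the 9-slot layout mirrors the single[9]/times[9] arrays in the diff):

#include <mpi.h>
#include <cstdio>

// Sum each timer slot across the ranks of 'replica' onto rank 0 and
// print an average, as GB_GPU_Memory::clear() does for its 9 timers.
void report_times(double single[9], MPI_Comm replica) {
  double times[9];
  int rank, size;
  MPI_Comm_rank(replica, &rank);
  MPI_Comm_size(replica, &size);
  MPI_Reduce(single, times, 9, MPI_DOUBLE, MPI_SUM, 0, replica);
  if (rank == 0)
    std::printf("Data Transfer: %.4f s.\n", times[0] / size);
}
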
@@ -18,8 +18,6 @@
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H

#define BLOCK_1D 64

#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"

@@ -35,23 +33,34 @@ class GB_GPU_Memory {
  * \param max_nbors initial number of rows in the neighbor matrix
  * \param cell_size cutoff + skin
  * \param gpu_split fraction of particles handled by device
  * \return false if there is not sufficient memory or device init prob **/
  bool init(const int ntypes, const double gamma,
            const double upsilon, const double mu, double **host_shape,
            double **host_well, double **host_cutsq, double **host_sigma,
            double **host_epsilon, double *host_lshape, int **h_form,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset,
            const double *host_special_lj, const int nlocal, const int nall,
            const int max_nbors, const double cell_size,
            const double gpu_split, FILE *screen);
  * \return false if there is not sufficient memory or device init prob
  *
  * Returns:
  * - 0 if successful
  * - -1 if fix gpu not found
  * - -3 if there is an out of memory error
  * - -4 if the GPU library was not compiled for GPU
  * - -5 Double precision is not supported on card **/
  int init(const int ntypes, const double gamma,
           const double upsilon, const double mu, double **host_shape,
           double **host_well, double **host_cutsq, double **host_sigma,
           double **host_epsilon, double *host_lshape, int **h_form,
           double **host_lj1, double **host_lj2, double **host_lj3,
           double **host_lj4, double **host_offset,
           const double *host_special_lj, const int nlocal, const int nall,
           const int max_nbors, const double cell_size,
           const double gpu_split, FILE *screen);
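A caller-side sketch of how the new integer status might be handled; the code-to-meaning mapping is taken from the doc comment above, while the helper itself is illustrative and not part of the library:

// Hypothetical helper: translate GB_GPU_Memory::init() status codes
// (documented above) into messages a pair style could print.
inline const char *gb_init_status(int flag) {
  switch (flag) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on the device";
    case -4: return "GPU library was not compiled for this accelerator";
    case -5: return "double precision is not supported on the card";
    default: return "unknown initialization error";
  }
}
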
  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead();

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    atom->resize(inum, nall, success);
    if (multiple_forms) atom->dev_ans.zero();
    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
    atom->resize(nall, success);
    ans->resize(inum, success);
    if (multiple_forms) ans->dev_ans.zero();
    double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
    if (bytes>_max_bytes)
      _max_bytes=bytes;
  }

@@ -74,7 +83,7 @@ class GB_GPU_Memory {
      success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
    }
    nbor->resize(nlocal,host_inum,max_nbors,success);
    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
    double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
    if (bytes>_max_bytes)
      _max_bytes=bytes;
  }

@@ -91,19 +100,22 @@ class GB_GPU_Memory {

  /// Accumulate timers
  inline void acc_timers() {
    if (nbor_time_avail) {
      nbor->time_nbor.add_to_total();
      nbor->time_kernel.add_to_total();
      nbor_time_avail=false;
    if (device->time_device()) {
      if (nbor_time_avail) {
        nbor->time_nbor.add_to_total();
        nbor->time_kernel.add_to_total();
        nbor_time_avail=false;
      }
      time_kernel.add_to_total();
      time_gayberne.add_to_total();
      if (multiple_forms) {
        time_kernel2.add_to_total();
        time_gayberne2.add_to_total();
        time_pair.add_to_total();
      }
      atom->acc_timers();
      ans->acc_timers();
    }
    time_kernel.add_to_total();
    time_gayberne.add_to_total();
    if (multiple_forms) {
      time_kernel2.add_to_total();
      time_gayberne2.add_to_total();
      time_pair.add_to_total();
    }
    atom->acc_timers();
  }

  /// Accumulate timers

@@ -117,6 +129,7 @@ class GB_GPU_Memory {
      time_pair.zero();
    }
    atom->zero_timers();
    ans->zero_timers();
  }

  // -------------------------- DEVICE DATA -------------------------

@@ -168,6 +181,10 @@ class GB_GPU_Memory {

  int last_ellipse, max_last_ellipse;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  PairGPUAns<numtyp,acctyp> *ans;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data

@@ -183,10 +200,12 @@ class GB_GPU_Memory {
  UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
  inline int block_size() { return _block_size; }

  int _threads_per_atom;
 private:
  bool _allocated, _compiled;
  int _block_size;
  double _max_bytes;
  double _gpu_overhead, _driver_overhead;

  void compile_kernels(UCL_Device &dev);
};
@@ -1,2 +1,2 @@
Geryon Version 10.280

Geryon Version 11.094
@@ -167,6 +167,7 @@ class UCL_Device {
  int _device, _num_devices;
  std::vector<cudaDeviceProp> _properties;
  std::vector<cudaStream_t> _cq;
  std::vector<int> _device_ids;
};

// Grabs the properties for all devices

@@ -178,6 +179,7 @@ inline UCL_Device::UCL_Device() {
    if (deviceProp.major == 9999 && deviceProp.minor == 9999)
      break;
    _properties.push_back(deviceProp);
    _device_ids.push_back(dev);
  }
  _device=-1;
  _cq.push_back(cudaStream_t());

@@ -194,7 +196,7 @@ inline void UCL_Device::set(int num) {
    return;
  for (int i=1; i<num_queues(); i++) pop_command_queue();
  cudaThreadExit();
  CUDA_SAFE_CALL_NS(cudaSetDevice(num));
  CUDA_SAFE_CALL_NS(cudaSetDevice(_device_ids[num]));
  _device=num;
}
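The new _device_ids vector decouples the logical index handed to set() from the CUDA device ordinal, because devices reporting the 9999.9999 emulation capability are dropped during enumeration. A simplified, hypothetical sketch of the mapping with the runtime API (the constructor above stops at the first such device; skipping each one, as below, is the same idea):

#include <cuda_runtime.h>
#include <vector>

// Collect the ordinals of real devices; set(i) can then call
// cudaSetDevice(ids[i]) instead of cudaSetDevice(i).
std::vector<int> enumerate_real_devices() {
  int n = 0;
  cudaGetDeviceCount(&n);
  std::vector<int> ids;
  for (int dev = 0; dev < n; ++dev) {
    cudaDeviceProp p;
    cudaGetDeviceProperties(&p, dev);
    if (p.major == 9999 && p.minor == 9999)
      continue;          // emulation placeholder, not a physical GPU
    ids.push_back(dev);
  }
  return ids;
}
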
@@ -42,6 +42,7 @@ inline void ucl_sync(CUstream &stream) {
}

struct NVDProperties {
  int device_id;
  std::string name;
  int major;
  int minor;

@@ -208,15 +209,20 @@ inline UCL_Device::UCL_Device() {
  for (int dev=0; dev<_num_devices; ++dev) {
    CUdevice m;
    CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
    int major, minor;
    CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
    if (major==9999)
      continue;

    _properties.push_back(NVDProperties());
    _properties.back().device_id=dev;
    _properties.back().major=major;
    _properties.back().minor=minor;

    char namecstr[1024];
    CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
    _properties.back().name=namecstr;

    CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major,
                                              &_properties.back().minor,m));

    CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
                                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,

@@ -262,9 +268,9 @@ inline void UCL_Device::set(int num) {
    CU_SAFE_CALL_NS(cuCtxDestroy(_context));
    for (int i=1; i<num_queues(); i++) pop_command_queue();
  }
  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,num));
  _device=_properties[num].device_id;
  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
  CU_SAFE_CALL_NS(cuCtxCreate(&_context,0,_cu_device));
  _device=num;
}

// List all devices along with all properties
@@ -25,6 +25,7 @@
#define NVD_TIMER_H

#include "nvd_macros.h"
#include "nvd_device.h"

namespace ucl_cudadr {

@@ -66,12 +67,23 @@ class UCL_Timer {
  /// Stop timing on command queue
  inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }

  /// Block until the start event has been reached on device
  inline void sync_start()
    { CU_SAFE_CALL(cuEventSynchronize(start_event)); }

  /// Block until the stop event has been reached on device
  inline void sync_stop()
    { CU_SAFE_CALL(cuEventSynchronize(stop_event)); }

  /// Set the time elapsed to zero (not the total_time)
  inline void zero() {
    CU_SAFE_CALL(cuEventRecord(start_event,_cq));
    CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
  }

  /// Set the total time to zero
  inline void zero_total() { _total_time=0.0; }

  /// Add time from previous start and stop to total
  /** Forces synchronization **/
  inline double add_to_total()
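The added sync_start()/sync_stop() methods expose the blocking half of the standard CUDA event-timing idiom that UCL_Timer wraps. A sketch of that idiom in raw driver-API calls (assumes an existing context and stream; error checking omitted for brevity):

#include <cuda.h>

// Time the work enqueued on cq between the two event records, in ms.
float time_region(CUstream cq) {
  CUevent start_event, stop_event;
  cuEventCreate(&start_event, CU_EVENT_DEFAULT);
  cuEventCreate(&stop_event, CU_EVENT_DEFAULT);
  cuEventRecord(start_event, cq);        // start()
  // ... enqueue kernels on cq here ...
  cuEventRecord(stop_event, cq);         // stop()
  cuEventSynchronize(stop_event);        // sync_stop(): block until reached
  float ms = 0.0f;
  cuEventElapsedTime(&ms, start_event, stop_event);
  cuEventDestroy(start_event);
  cuEventDestroy(stop_event);
  return ms;
}
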
@@ -25,6 +25,7 @@
#define OCL_TIMER_H

#include "ocl_macros.h"
#include "ocl_device.h"

namespace ucl_opencl {

@@ -67,10 +68,21 @@ class UCL_Timer {
  /// Stop timing on default command queue
  inline void stop() { clEnqueueMarker(_cq,&stop_event); }

  /// Block until the start event has been reached on device
  inline void sync_start()
    { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }

  /// Block until the stop event has been reached on device
  inline void sync_stop()
    { CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }

  /// Set the time elapsed to zero (not the total_time)
  inline void zero()
    { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }

  /// Set the total time to zero
  inline void zero_total() { _total_time=0.0; }

  /// Add time from previous start and stop to total
  /** Forces synchronization **/
  inline double add_to_total()
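The OpenCL timer mirrors the CUDA version with queue markers in place of events. A sketch of the underlying calls as used above (OpenCL 1.0/1.1, where clEnqueueMarker is still current; reading elapsed times back through clGetEventProfilingInfo additionally requires a queue created with profiling enabled):

// Assumes a valid cl_command_queue cq.
cl_event start_event, stop_event;
clEnqueueMarker(cq, &start_event);       // start()
/* ... enqueue kernels on cq ... */
clEnqueueMarker(cq, &stop_event);        // stop()
clWaitForEvents(1, &stop_event);         // sync_stop()
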
@@ -13,7 +13,7 @@
 copyright : (C) 2010 by W. Michael Brown
 email : brownw@ornl.gov
***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains

@@ -206,6 +206,191 @@
    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  }

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21, t22 *a22) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21, t22 *a22, t23 *a23) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                     t26 *a26) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                     t26 *a26, t27 *a27) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27, class t28>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                     t26 *a26, t27 *a27, t28 *a28) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27); add_arg(a28);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27, class t28, class t29>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                     t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27, class t28, class t29, class t30>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                     t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
}

// ---------------------------------------------------------------------------

@@ -439,6 +624,211 @@
    run();
  }

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21, t22 *a22) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21, t22 *a22, t23 *a23) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                t26 *a26) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                t26 *a26, t27 *a27) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27, class t28>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                t26 *a26, t27 *a27, t28 *a28) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27); add_arg(a28);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27, class t28, class t29>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27, class t28, class t29, class t30>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
  run();
}

// ---------------------------------------------------------------------------

template <class t1>

@@ -671,3 +1061,208 @@
    run(cq);
  }

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21, t22 *a22) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21, t22 *a22, t23 *a23) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                   t26 *a26) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                   t26 *a26, t27 *a27) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27, class t28>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                   t26 *a26, t27 *a27, t28 *a28) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27); add_arg(a28);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27, class t28, class t29>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                   t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20,
          class t21, class t22, class t23, class t24, class t25,
          class t26, class t27, class t28, class t29, class t30>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
                   t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
                   t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
  add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
  run(cq);
}
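These fixed-arity overloads enumerate every argument count by hand because the library targets compilers that predate C++11. With variadic templates the whole family collapses to a few lines; the following is a sketch only, not what the header does (add_arg, clear_args, and run are stand-ins for the existing members):

// C++11 sketch of the add_args()/run() overload families above.
struct KernelSketch {
  void add_arg(const void *) {}   // stand-in for the real add_arg
  void clear_args() {}            // stand-in
  void run() {}                   // stand-in: launch with stored args

  template <class... Ts>
  void add_args(Ts *...args) {
    // Expand the pack through add_arg(), left to right.
    int expand[] = {0, (add_arg(args), 0)...};
    (void)expand;
  }

  template <class... Ts>
  void run_args(Ts *...args) {
    clear_args();
    add_args(args...);
    run();
  }
};
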
@@ -13,7 +13,7 @@
 copyright : (C) 2009 by W. Michael Brown
 email : brownw@ornl.gov
***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains

@@ -61,20 +61,23 @@ class UCL_D_Mat : public UCL_BaseMat {
  inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _kind=kind;
    _rows=rows;
    _cols=cols;

    int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
    _row_size=_pitch/sizeof(numtyp);
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_row_size*cols;
    #endif
    #ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate "
                << rows*cols*sizeof(numtyp) << " bytes on device.\n";
      exit(1);
      #endif
      return err;
    }

    _kind=kind;
    _rows=rows;
    _cols=cols;
    _row_size=_pitch/sizeof(numtyp);
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_row_size*cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;

@@ -94,20 +97,23 @@ class UCL_D_Mat : public UCL_BaseMat {
  inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _kind=kind;
    _rows=rows;
    _cols=cols;

    int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
    _row_size=_pitch/sizeof(numtyp);
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_row_size*cols;
    #endif
    #ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate "
                << rows*cols*sizeof(numtyp) << " bytes on device.\n";
      exit(1);
      #endif
      return err;
    }

    _kind=kind;
    _rows=rows;
    _cols=cols;
    _row_size=_pitch/sizeof(numtyp);
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_row_size*cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
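The restructured alloc() bodies above are what make the -DUCL_NO_EXIT flag (added to the makefiles in this commit) useful: with the macro defined, a failed allocation is reported through the return value instead of calling exit(1), so the caller can recover. A hypothetical caller-side sketch (dev and n assumed to exist; UCL_D_Vec and UCL_SUCCESS are the library's own names):

// With UCL_NO_EXIT defined, a failed allocation comes back as a code
// the caller can turn into init()'s -3 instead of aborting the run.
UCL_D_Vec<float> buf;
int err = buf.alloc(n, dev);
if (err != UCL_SUCCESS) {
  // e.g. propagate -3 (out of memory) up through init()
}
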
@@ -13,7 +13,7 @@
 copyright : (C) 2009 by W. Michael Brown
 email : brownw@ornl.gov
***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains

@@ -60,19 +60,24 @@ class UCL_D_Vec : public UCL_BaseMat {
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {

    clear();
    _kind=kind;
    _cols=cols;

    _row_bytes=cols*sizeof(numtyp);
    int err=_device_alloc(*this,cq,_row_bytes,kind);
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      _row_bytes=0;
      exit(1);
      #endif
      _row_bytes=0;
      return err;
    }

    _kind=kind;
    _cols=cols;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;

@@ -90,19 +95,23 @@ class UCL_D_Vec : public UCL_BaseMat {
  inline int alloc(const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _kind=kind;
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    int err=_device_alloc(*this,device,_row_bytes,kind);
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      _row_bytes=0;
      exit(1);
      #endif
      _row_bytes=0;
      return err;
    }

    _kind=kind;
    _cols=cols;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
@@ -13,7 +13,7 @@
 copyright : (C) 2009 by W. Michael Brown
 email : brownw@ornl.gov
***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains

@@ -39,7 +39,11 @@ class UCL_H_Mat : public UCL_BaseMat {
  };
  typedef numtyp data_type;

  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { }
  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
    #ifdef _OCL_MAT
    _carray=(cl_mem)(0);
    #endif
  }
  ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }

  /// Construct with specified number of rows and columns

@@ -59,18 +63,23 @@ class UCL_H_Mat : public UCL_BaseMat {
  inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
    clear();
    _cols=cols;
    _rows=rows;

    _row_bytes=cols*sizeof(numtyp);
    _kind=kind;
    int err=_host_alloc(*this,cq,_row_bytes*_rows,kind);
    #ifndef UCL_NO_EXIT
    int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
                << " bytes on host.\n";
      _row_bytes=0;
      exit(1);
      #endif
      _row_bytes=0;
      return err;
    }
    #endif

    _cols=cols;
    _rows=rows;
    _kind=kind;
    _end=_array+rows*cols;
    return err;
  }

@@ -85,19 +94,24 @@ class UCL_H_Mat : public UCL_BaseMat {
  inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
    clear();
    _cols=cols;
    _rows=rows;

    _row_bytes=cols*sizeof(numtyp);
    _kind=kind;
    int err=_host_alloc(*this,device,_row_bytes*_rows,kind);
    _end=_array+rows*cols;
    #ifndef UCL_NO_EXIT
    int err=_host_alloc(*this,device,_row_bytes*rows,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
                << " bytes on host.\n";
      _row_bytes=0;
      exit(1);
      #endif
      _row_bytes=0;
      return err;
    }
    #endif

    _cols=cols;
    _rows=rows;
    _kind=kind;
    _end=_array+rows*cols;
    return err;
  }
@@ -13,7 +13,7 @@
 copyright : (C) 2009 by W. Michael Brown
 email : brownw@ornl.gov
***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains

@@ -39,7 +39,11 @@ class UCL_H_Vec : public UCL_BaseMat {
  };
  typedef numtyp data_type;

  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { }
  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
    #ifdef _OCL_MAT
    _carray=(cl_mem)(0);
    #endif
  }
  ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }

  /// Construct with n columns

@@ -59,18 +63,24 @@ class UCL_H_Vec : public UCL_BaseMat {
  inline int alloc(const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
    clear();
    _cols=cols;

    _row_bytes=cols*sizeof(numtyp);
    _kind=kind;
    int err=_host_alloc(*this,cq,_row_bytes,kind);
    _end=_array+cols;
    #ifndef UCL_NO_EXIT

    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on host.\n";
      _row_bytes=0;
      exit(1);
      #endif
      _row_bytes=0;
      return err;
    }
    #endif

    _cols=cols;
    _kind=kind;
    _end=_array+cols;
    return err;
  }

@@ -84,18 +94,24 @@ class UCL_H_Vec : public UCL_BaseMat {
  inline int alloc(const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
    clear();
    _cols=cols;

    _row_bytes=cols*sizeof(numtyp);
    _kind=kind;
    int err=_host_alloc(*this,device,_row_bytes,kind);
    _end=_array+cols;
    #ifndef UCL_NO_EXIT

    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on host.\n";
      _row_bytes=0;
      exit(1);
      #endif
      _row_bytes=0;
      return err;
    }
    #endif

    _cols=cols;
    _kind=kind;
    _end=_array+cols;
    return err;
  }
@ -13,7 +13,7 @@
    copyright : (C) 2010 by W. Michael Brown
    email     : brownw@ornl.gov
***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains

@ -25,8 +25,18 @@
#ifndef UCL_NV_KERNEL_H
#define UCL_NV_KERNEL_H

#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y)
#if (__CUDA_ARCH__ < 200)
#define mul24 __mul24
#define MEM_THREADS 16
#else
#define mul24(X,Y) (X)*(Y)
#define MEM_THREADS 32
#endif

#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x

@ -35,8 +45,9 @@
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define mul24 __mul24
#define __global
#define __inline static __inline__ __device__
#define atom_add atomicAdd

#endif
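The new #if (__CUDA_ARCH__ < 200) block keeps the fast 24-bit integer multiply for index math on sm_1x cards, where __mul24 outruns a full 32-bit multiply, and degrades mul24 to a plain multiply on Fermi (sm_20 and later), where it no longer pays. A standalone sketch of the same switch (demo_* names are hypothetical):

#include <cstdio>

// 24-bit multiply on sm_1x, plain multiply on Fermi and later.
// __CUDA_ARCH__ is only defined during device compilation, so the
// host pass falls through to the plain multiply.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#define demo_mul24(X,Y) __mul24((X),(Y))
#else
#define demo_mul24(X,Y) ((X)*(Y))
#endif

__global__ void demo_gid(int *out, int n) {
  // Index math in the style of GLOBAL_ID_X above.
  int gid = threadIdx.x + demo_mul24((int)blockIdx.x, (int)blockDim.x);
  if (gid < n) out[gid] = gid;
}

int main() {
  const int n = 256;
  int *d_out;
  cudaMalloc(&d_out, n*sizeof(int));
  demo_gid<<<(n+63)/64, 64>>>(d_out, n);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}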
@ -28,11 +28,11 @@ static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen) {
int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
                  double **offset, double *special_lj, const int inum,
                  const int nall, const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen) {
  LJ96MF.clear();
  gpu_mode=LJ96MF.device->gpu_mode();
  double gpu_split=LJ96MF.device->particle_split();

@ -53,13 +53,11 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                             host_lj4, offset, special_lj, inum, nall, 300,
                             maxspecial, cell_size, gpu_split, screen);
    if (!init_ok)
      return false;
  }
  int init_ok=0;
  if (world_me==0)
    init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                        host_lj4, offset, special_lj, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen);

  LJ96MF.device->world_barrier();
  if (message)

@ -74,46 +72,46 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
              last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                               host_lj4, offset, special_lj, inum,
                               nall, 300, maxspecial, cell_size, gpu_split,
                               screen);
      if (!init_ok)
        return false;
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                          offset, special_lj, inum, nall, 300, maxspecial,
                          cell_size, gpu_split, screen);

    LJ96MF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;

  if (init_ok==0)
    LJ96MF.estimate_gpu_overhead();
  return init_ok;
}

void lj96_gpu_clear() {
  LJ96MF.clear();
}

int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** lj96_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *boxlo, double *boxhi, int *tag, int **nspecial,
                         double *sublo, double *subhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         const double cpu_time, bool &success) {
  return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, cpu_time, success);
                         int **ilist, int **jnum, const double cpu_time,
                         bool &success) {
  return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success);
}

void lj96_gpu_compute(const int timestep, const int ago, const int inum_full,
                      const int nall, double **host_x, int *host_type,
                      int *ilist, int *numj, int **firstneigh,
                      const bool eflag, const bool vflag, const bool eatom,
                      const bool vatom, int &host_start, const double cpu_time,
                      bool &success) {
  LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
void lj96_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success) {
  LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

double lj96_gpu_bytes() {
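lj96_gpu_init() now propagates the int code from LJ96MF.init() instead of folding it into a bool, and calls estimate_gpu_overhead() only once every rank has initialized cleanly. A sketch of that staged, barrier-separated startup, assuming MPI and using hypothetical demo_* names for the style-specific calls:

#include <mpi.h>

// Stand-in for the per-style init; returns 0 on success, negative on error.
int demo_style_init() { return 0; }

int demo_gpu_init(MPI_Comm world, MPI_Comm gpu_comm) {
  int world_me, gpu_rank;
  MPI_Comm_rank(world, &world_me);
  MPI_Comm_rank(gpu_comm, &gpu_rank);

  int init_ok=0;
  if (world_me==0)                  // world rank 0 initializes first
    init_ok=demo_style_init();
  MPI_Barrier(world);               // world_barrier() in the diff

  if (gpu_rank==0 && world_me!=0)   // then one rank per GPU
    init_ok=demo_style_init();
  MPI_Barrier(gpu_comm);            // gpu_barrier() in the diff

  if (init_ok==0) {
    // estimate_gpu_overhead() in the diff runs only after clean startup
  }
  return init_ok;                   // caller maps nonzero codes to errors
}

int main(int argc, char **argv) {
  MPI_Init(&argc,&argv);
  int err=demo_gpu_init(MPI_COMM_WORLD, MPI_COMM_WORLD);
  MPI_Finalize();
  return err;
}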
@ -18,8 +18,6 @@
#ifndef LJ96_GPU_KERNEL
#define LJ96_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2

@ -46,7 +44,7 @@

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;

#ifdef _DOUBLE_DOUBLE

@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define __inline inline

#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8

#endif

@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum, const int nall,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];

@ -157,8 +172,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
    }

  } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

    // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;

@ -176,49 +230,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
                               __global numtyp* sp_lj_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
                               const int nall, const int nbor_pitch,
                               const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (ii<4)
    sp_lj[ii]=sp_lj_in[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    if (eflag>0)
      lj3[ii]=lj3_in[ii];
      lj3[tid]=lj3_in[tid];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __syncthreads();

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];

@ -258,8 +328,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
    }

  } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

    // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
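Both kernels above now select their neighbor window the same way: when dev_packed aliases dev_nbor, the list lives in the pitched neighbor matrix and the t_per_atom threads sharing an atom stride by t_per_atom*nbor_pitch; otherwise they walk a contiguous packed list with stride t_per_atom. A standalone sketch of that selection (demo_* names are hypothetical):

// Computes the [start,end) window and stride with which one of the
// t_per_atom cooperating threads walks atom ii's neighbor list.
__device__ void demo_nbor_window(const int *dev_nbor, const int *dev_packed,
                                 int ii, int offset, int t_per_atom,
                                 int nbor_pitch, const int **start,
                                 const int **end, int *stride) {
  const int *nbor=dev_nbor+ii;   // row 0: atom index i (unused here)
  nbor+=nbor_pitch;              // row 1: neighbor count
  int numj=*nbor;
  nbor+=nbor_pitch;              // row 2: first neighbor, or packed offset
  if (dev_nbor==dev_packed) {    // neighbors stay in the pitched matrix
    *end=nbor+numj*nbor_pitch;
    *start=nbor+offset*nbor_pitch;
    *stride=t_per_atom*nbor_pitch;
  } else {                       // neighbors were repacked contiguously
    const int *packed=dev_packed+(*nbor);
    *end=packed+numj;
    *start=packed+offset;
    *stride=t_per_atom;
  }
}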
@ -42,7 +42,7 @@ int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}

template <class numtyp, class acctyp>
bool LJ96_GPU_MemoryT::init(const int ntypes,
int LJ96_GPU_MemoryT::init(const int ntypes,
                            double **host_cutsq, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,

@ -50,14 +50,18 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
                            const int nall, const int max_nbors,
                            const int maxspecial, const double cell_size,
                            const double gpu_split, FILE *_screen) {
  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                    _screen,lj96_cut_gpu_kernel);
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,lj96_cut_gpu_kernel);
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

@ -84,7 +88,7 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
  return true;
  return 0;
}

template <class numtyp, class acctyp>

@ -122,9 +126,10 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->atom->inum();
  int ainum=this->ans->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();

@ -133,16 +138,18 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                          &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch);
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch,
                          &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch);
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
}
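The grid size changes because a block of BX threads now covers only BX/t_per_atom atoms, and the atom count comes from the answer object rather than the atom object. A small host-side sketch of the new sizing (demo_ name is hypothetical):

#include <cmath>

// Blocks needed so that inum atoms each get t_per_atom threads,
// with BX threads per block.
int demo_grid_size(int inum, int BX, int t_per_atom) {
  int atoms_per_block=BX/t_per_atom;   // e.g. 64/4 -> 16 atoms per block
  return static_cast<int>(ceil(static_cast<double>(inum)/atoms_per_block));
}

// Example: 10000 atoms with BX=64 and t_per_atom=4 gives 625 blocks,
// where the old inum/BX formula would have launched 157.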
@ -29,13 +29,20 @@ class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
  int init(const int ntypes, double **host_cutsq, double **host_lj1,
           double **host_lj2, double **host_lj3, double **host_lj4,
           double **host_offset, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
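With init() returning an int, callers can distinguish the failure modes listed above. A hypothetical caller-side sketch; the messages are illustrative, not strings from the library:

#include <cstdio>

// Translate the init() return codes documented above into messages.
int demo_check_init(int flag) {
  switch (flag) {
    case  0: return 0;  // success
    case -1: fprintf(stderr,"fix gpu not found\n"); break;
    case -3: fprintf(stderr,"out of memory on the device\n"); break;
    case -4: fprintf(stderr,"GPU library not compiled for this GPU\n"); break;
    case -5: fprintf(stderr,"double precision unsupported on this card\n"); break;
    default: fprintf(stderr,"unknown GPU init error %d\n",flag); break;
  }
  return flag;
}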
@ -28,12 +28,11 @@ static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljl_gpu_init(const int ntypes, double **cutsq,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int inum, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode,
                  FILE *screen) {
int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                 double **host_lj2, double **host_lj3, double **host_lj4,
                 double **offset, double *special_lj, const int inum,
                 const int nall, const int max_nbors, const int maxspecial,
                 const double cell_size, int &gpu_mode, FILE *screen) {
  LJLMF.clear();
  gpu_mode=LJLMF.device->gpu_mode();
  double gpu_split=LJLMF.device->particle_split();

@ -54,13 +53,11 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                            host_lj4, offset, special_lj, inum, nall, 300,
                            maxspecial, cell_size, gpu_split, screen);
    if (!init_ok)
      return false;
  }
  int init_ok=0;
  if (world_me==0)
    init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                       host_lj4, offset, special_lj, inum, nall, 300,
                       maxspecial, cell_size, gpu_split, screen);

  LJLMF.device->world_barrier();
  if (message)

@ -75,45 +72,45 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
              last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                              host_lj4, offset, special_lj, inum, nall, 300,
                              maxspecial, cell_size, gpu_split,
                              screen);
      if (!init_ok)
        return false;
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                         offset, special_lj, inum, nall, 300, maxspecial,
                         cell_size, gpu_split, screen);

    LJLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;

  if (init_ok==0)
    LJLMF.estimate_gpu_overhead();
  return init_ok;
}

void ljl_gpu_clear() {
  LJLMF.clear();
}

int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int ** ljl_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *boxlo, double *boxhi, int *tag, int **nspecial,
                         double *sublo, double *subhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         const double cpu_time, bool &success) {
  return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, cpu_time, success);
                         int **ilist, int **jnum, const double cpu_time,
                         bool &success) {
  return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, ilist, jnum, cpu_time, success);
}

void ljl_gpu_compute(const int timestep, const int ago, const int inum_full,
                     const int nall, double **host_x, int *host_type,
                     int *ilist, int *numj, int **firstneigh,
                     const bool eflag, const bool vflag, const bool eatom,
                     const bool vatom, int &host_start, const double cpu_time,
                     bool &success) {
  LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
                     const double cpu_time, bool &success) {
  LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
@ -18,8 +18,6 @@
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2

@ -46,7 +44,7 @@

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;

#ifdef _DOUBLE_DOUBLE

@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define __inline inline

#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8

#endif

@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum, const int nall,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];

@ -156,8 +171,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
    }

  } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

    // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;

@ -175,49 +229,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
                               __global numtyp* sp_lj_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
                               const int nall, const int nbor_pitch,
                               const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (ii<4)
    sp_lj[ii]=sp_lj_in[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    if (eflag>0)
      lj3[ii]=lj3_in[ii];
      lj3[tid]=lj3_in[tid];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __syncthreads();

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];

@ -256,8 +326,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
    }

  } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

    // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
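The reduction blocks appended to both kernels combine per-thread partial sums with halving strides and no barrier, which is safe only while the t_per_atom threads for one atom share a warp (t_per_atom is a small power of two here). A standalone sketch of the same tree reduction; the kernel and names are hypothetical, and volatile makes the warp-synchronous reads explicit:

#define DEMO_BLOCK 64   // plays the role of BLOCK_PAIR

// Each group of t_per_atom consecutive threads reduces its partial
// values so the thread with offset 0 holds the per-atom total.
__global__ void demo_reduce(const float *partial, float *total,
                            int t_per_atom) {
  volatile __shared__ float red_acc[DEMO_BLOCK];
  int tid=threadIdx.x;
  int offset=tid%t_per_atom;
  red_acc[tid]=partial[blockIdx.x*blockDim.x+tid];
  for (unsigned int s=t_per_atom/2; s>0; s>>=1)
    if (offset<s)
      red_acc[tid]+=red_acc[tid+s];   // warp-synchronous, as in the diff
  if (offset==0)
    total[(blockIdx.x*blockDim.x+tid)/t_per_atom]=red_acc[tid];
}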
@ -42,22 +42,26 @@ int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}

template <class numtyp, class acctyp>
bool LJL_GPU_MemoryT::init(const int ntypes,
                           double **host_cutsq, double **host_lj1,
                           double **host_lj2, double **host_lj3,
                           double **host_lj4, double **host_offset,
                           double *host_special_lj, const int nlocal,
                           const int nall, const int max_nbors,
                           const int maxspecial, const double cell_size,
                           const double gpu_split, FILE *_screen) {
  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                    _screen,lj_cut_gpu_kernel);
int LJL_GPU_MemoryT::init(const int ntypes,
                          double **host_cutsq, double **host_lj1,
                          double **host_lj2, double **host_lj3,
                          double **host_lj4, double **host_offset,
                          double *host_special_lj, const int nlocal,
                          const int nall, const int max_nbors,
                          const int maxspecial, const double cell_size,
                          const double gpu_split, FILE *_screen) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,lj_cut_gpu_kernel);
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

@ -84,7 +88,7 @@ bool LJL_GPU_MemoryT::init(const int ntypes,

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
  return true;
  return 0;
}

template <class numtyp, class acctyp>

@ -122,9 +126,10 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->atom->inum();
  int ainum=this->ans->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();

@ -133,16 +138,18 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                          &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch);
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch,
                          &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch);
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
}
@ -29,13 +29,20 @@ class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double **host_cutsq,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
  int init(const int ntypes, double **host_cutsq,
           double **host_lj1, double **host_lj2, double **host_lj3,
           double **host_lj4, double **host_offset, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
@ -28,13 +28,13 @@ static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
                  double **offset, double *special_lj, const int inum,
                  const int nall, const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen,
                  double **host_cut_ljsq, double **host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e) {
int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                 double **host_lj2, double **host_lj3, double **host_lj4,
                 double **offset, double *special_lj, const int inum,
                 const int nall, const int max_nbors, const int maxspecial,
                 const double cell_size, int &gpu_mode, FILE *screen,
                 double **host_cut_ljsq, double **host_cut_coulsq,
                 double *host_special_coul, const double qqrd2e) {
  LJCMF.clear();
  gpu_mode=LJCMF.device->gpu_mode();
  double gpu_split=LJCMF.device->particle_split();

@ -55,15 +55,12 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                            host_lj4, offset, special_lj, inum, nall, 300,
                            maxspecial, cell_size, gpu_split, screen,
                            host_cut_ljsq, host_cut_coulsq, host_special_coul,
                            qqrd2e);
    if (!init_ok)
      return false;
  }
  int init_ok=0;
  if (world_me==0)
    init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                       host_lj4, offset, special_lj, inum, nall, 300,
                       maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
                       host_cut_coulsq, host_special_coul, qqrd2e);

  LJCMF.device->world_barrier();
  if (message)

@ -78,48 +75,51 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
              last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                              host_lj4, offset, special_lj, inum, nall, 300,
                              maxspecial, cell_size, gpu_split,
                              screen, host_cut_ljsq, host_cut_coulsq,
                              host_special_coul, qqrd2e);
      if (!init_ok)
        return false;
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                         offset, special_lj, inum, nall, 300, maxspecial,
                         cell_size, gpu_split, screen, host_cut_ljsq,
                         host_cut_coulsq, host_special_coul, qqrd2e);

    LJCMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;

  if (init_ok==0)
    LJCMF.estimate_gpu_overhead();
  return init_ok;
}

void ljc_gpu_clear() {
  LJCMF.clear();
}

int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** ljc_gpu_compute_n(const int ago, const int inum_full,
                        const int nall, double **host_x, int *host_type,
                        double *boxlo, double *boxhi, int *tag, int **nspecial,
                        double *sublo, double *subhi, int *tag, int **nspecial,
                        int **special, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        const double cpu_time, bool &success, double *host_q) {
  return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, cpu_time, success, host_q);
                        int **ilist, int **jnum, const double cpu_time,
                        bool &success, double *host_q, double *boxlo,
                        double *prd) {
  return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, ilist, jnum, cpu_time, success,
                       host_q, boxlo, prd);
}

void ljc_gpu_compute(const int timestep, const int ago, const int inum_full,
                     const int nall, double **host_x, int *host_type,
                     int *ilist, int *numj, int **firstneigh,
                     const bool eflag, const bool vflag, const bool eatom,
                     const bool vatom, int &host_start, const double cpu_time,
                     bool &success, double *host_q) {
  LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                host_q);
void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
                     const double cpu_time, bool &success, double *host_q,
                     const int nlocal, double *boxlo, double *prd) {
  LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
                vflag,eatom,vatom,host_start,cpu_time,success,host_q,
                nlocal,boxlo,prd);
}

double ljc_gpu_bytes() {
@ -18,8 +18,6 @@
#ifndef LJC_GPU_KERNEL
#define LJC_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2

@ -46,7 +44,7 @@

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;

@ -82,6 +80,8 @@ __inline float fetch_q(const int& i, const float *q)

#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8

#endif

@ -92,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nall, const int nbor_pitch,
                          __global numtyp *q_ , __global numtyp *cutsq,
                          const numtyp qqrd2e) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum, const int nall,
                          const int nbor_pitch, __global numtyp *q_ ,
                          __global numtyp *cutsq, const numtyp qqrd2e,
                          const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp sp_lj[8];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];

@ -109,29 +113,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

  if (ii<inum) {
    acctyp energy=(acctyp)0;
    acctyp e_coul=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;
  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int itype=ix.w;

    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      numtyp factor_lj, factor_coul;

@ -188,8 +204,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
    }

  } // for nbor
  } // if ii

  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;
    red_acc[4][tid]=e_coul;

    // Store answers
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<5; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];
    e_coul=red_acc[4][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;

@ -209,54 +266,69 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
                               __global numtyp* sp_lj_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch,
                               __global numtyp *q_ , __global numtyp *_cutsq,
                               const numtyp qqrd2e) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
                               const numtyp qqrd2e, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];
  if (ii<8)
    sp_lj[ii]=sp_lj_in[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
    cutsq[ii]=_cutsq[ii];
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    cutsq[tid]=_cutsq[tid];
    if (eflag>0)
      lj3[ii]=lj3_in[ii];
      lj3[tid]=lj3_in[tid];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  __syncthreads();

  if (ii<inum) {

    acctyp energy=(acctyp)0;
    acctyp e_coul=(acctyp)0;
    acctyp4 f;
    f.x=(acctyp)0;
    f.y=(acctyp)0;
    f.z=(acctyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(acctyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=nbor_pitch) {
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      numtyp factor_lj, factor_coul;

@ -312,8 +384,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
    }

  } // for nbor
  } // if ii

  // Store answers
  // Reduce answers
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;
    red_acc[4][tid]=e_coul;

    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<5; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];
    e_coul=red_acc[4][tid];

    if (vflag>0) {
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
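Under NV_KERNEL the charge kernels read positions and charges through pos_tex and q_tex for cached access, while the OpenCL build collapses fetch_pos/fetch_q to plain array loads. A sketch of that fallback with hypothetical demo_* names, using the texture-reference API of this CUDA generation:

#ifdef NV_KERNEL
texture<float> demo_q_tex;   // bound by the host before the kernel launch
// CUDA path: cached texture read, like fetch_q above.
__device__ float demo_fetch_q(const int i, const float *q_) {
  return tex1Dfetch(demo_q_tex, i);
}
#else
// OpenCL path: the macro collapses to a plain array load.
#define demo_fetch_q(i, q_) q_[i]
#endif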
@ -43,24 +43,28 @@ int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}

template <class numtyp, class acctyp>
bool LJC_GPU_MemoryT::init(const int ntypes,
                           double **host_cutsq, double **host_lj1,
                           double **host_lj2, double **host_lj3,
                           double **host_lj4, double **host_offset,
                           double *host_special_lj, const int nlocal,
                           const int nall, const int max_nbors,
                           const int maxspecial, const double cell_size,
                           const double gpu_split, FILE *_screen,
                           double **host_cut_ljsq, double **host_cut_coulsq,
                           double *host_special_coul, const double qqrd2e) {
  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                    _screen,ljc_cut_gpu_kernel);
int LJC_GPU_MemoryT::init(const int ntypes,
                          double **host_cutsq, double **host_lj1,
                          double **host_lj2, double **host_lj3,
                          double **host_lj4, double **host_offset,
                          double *host_special_lj, const int nlocal,
                          const int nall, const int max_nbors,
                          const int maxspecial, const double cell_size,
                          const double gpu_split, FILE *_screen,
                          double **host_cut_ljsq, double **host_cut_coulsq,
                          double *host_special_coul, const double qqrd2e) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                            _screen,ljc_cut_gpu_kernel);
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

@ -95,7 +99,7 @@ bool LJC_GPU_MemoryT::init(const int ntypes,
  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
                   sp_lj.row_bytes();
  return true;
  return 0;
}

template <class numtyp, class acctyp>

@ -134,9 +138,10 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->atom->inum();
  int ainum=this->ans->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();

@ -145,19 +150,20 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                          &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch,
                          &this->atom->dev_q.begin(), &cutsq.begin(),
                          &_qqrd2e);
                          &_qqrd2e, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                     &cutsq.begin(), &_qqrd2e);
                     &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
  }
  this->time_pair.stop();
}
@ -29,15 +29,22 @@ class LJC_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            double **host_cut_coulsq, double *host_special_coul,
            const double qqrd2e);
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 if double precision is not supported on the card **/
  int init(const int ntypes, double **host_cutsq, double **host_lj1,
           double **host_lj2, double **host_lj3, double **host_lj4,
           double **host_offset, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, double **host_cut_ljsq,
           double **host_cut_coulsq, double *host_special_coul,
           const double qqrd2e);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
@@ -28,14 +28,14 @@ static LJCL_GPU_Memory<PRECISION,ACC_PRECISION> LJCLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
LJCLMF.clear();
gpu_mode=LJCLMF.device->gpu_mode();
double gpu_split=LJCLMF.device->particle_split();

@@ -56,15 +56,12 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
fflush(screen);
}

if (world_me==0) {
bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e,g_ewald);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

LJCLMF.device->world_barrier();
if (message)

@@ -79,48 +76,51 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

LJCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;

if (init_ok==0)
LJCLMF.estimate_gpu_overhead();
return init_ok;
}

void ljcl_gpu_clear() {
LJCLMF.clear();
}

int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** ljcl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return LJCLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}

void ljcl_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
LJCLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
host_q,nlocal,boxlo,prd);
}

double ljcl_gpu_bytes() {

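Editor's note: the new ljcl_gpu_compute_n() hands back per-atom neighbor lists (ilist, jnum, and the returned firstneigh pointers) for the atoms left to the host. The standalone sketch below shows the list shape implied by that signature using small fabricated arrays; the shape is an inference from the parameters, not code from the commit.

// Sketch (assumption): iterating lists shaped like compute_n's outputs.
#include <cstdio>
int main() {
  int ilist[3] = {0, 1, 2};                 // local atom indices
  int numj[3] = {2, 1, 0};                  // neighbor counts per atom
  int nb0[2] = {1, 2}, nb1[1] = {2};        // fabricated neighbor lists
  int *firstneigh[3] = {nb0, nb1, nullptr};
  for (int i = 0; i < 3; i++)
    for (int jj = 0; jj < numj[i]; jj++)
      printf("atom %d : neighbor %d\n", ilist[i], firstneigh[i][jj]);
  return 0;
}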
@@ -18,8 +18,6 @@
#ifndef LJCL_GPU_KERNEL
#define LJCL_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2

@@ -54,7 +52,7 @@

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;

@@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)

#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8

#endif

@@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;

__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];

@@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;

if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);

int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;

for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;

numtyp factor_lj, factor_coul;
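Editor's note: the kernel rewrite above moves from one thread per atom to t_per_atom cooperating threads per atom, splitting the neighbor list across the threads' "offset" lanes. A standalone model of the index arithmetic, with hypothetical launch values standing in for BLOCK_ID_X/THREAD_ID_X:

// Model of the thread-per-atom indexing above (block size 64, t_per_atom 4,
// block id 2 are all made-up values for illustration).
#include <cstdio>
int main() {
  const int BLOCK_SIZE_X = 64, t_per_atom = 4, BLOCK_ID_X = 2;
  for (int tid = 0; tid < 8; tid++) {
    int ii = BLOCK_ID_X * (BLOCK_SIZE_X / t_per_atom) + tid / t_per_atom;
    int offset = tid % t_per_atom;  // lane within the atom's thread group
    printf("tid %d -> atom %d, neighbor lane %d\n", tid, ii, offset);
  }
  return 0;  // threads 0-3 all map to atom 32, threads 4-7 to atom 33, etc.
}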
@@ -204,8 +220,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}

} // for nbor
} // if ii

// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];

red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;

// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}

f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];

if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];

for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}

for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}

// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
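Editor's note: the reduction added above folds the t_per_atom partial sums together by halving the active lane count each pass (presumably the lanes of one atom group stay within a single SIMD width, which is why no barrier appears between passes). A standalone model of the same loop over made-up values:

// Model of the in-shared-memory reduction above: each lane holds a partial
// sum; each pass folds the upper half of the lanes onto the lower half.
#include <cstdio>
int main() {
  const int t_per_atom = 4;
  double red_acc[4] = {1.0, 2.0, 3.0, 4.0};   // per-lane partial sums
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    for (int offset = 0; offset < (int)s; offset++)
      red_acc[offset] += red_acc[offset + s];
  printf("lane 0 total: %g\n", red_acc[0]);   // prints 10, the full sum
  return 0;
}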
@@ -225,52 +282,68 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const numtyp qqrd2e, const numtyp g_ewald,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;

__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;

__syncthreads();

if (ii<inum) {

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;

__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);

int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);

for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;

numtyp factor_lj, factor_coul;

@@ -334,8 +407,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}

} // for nbor
} // if ii

// Store answers
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];

red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;

for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}

f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];

if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];

for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}

for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}

// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;

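Editor's note: kernel_pair_fast above stages the per-type-pair coefficient tables (lj1_in/lj3_in) in __local memory and precomputes the row offset itype=MAX_SHARED_TYPES*iw once per atom. The standalone model below shows the flattened row*width+column indexing this implies; the coefficient values and the combination with the neighbor's type are illustrative assumptions.

// Model (assumption) of the flattened type-pair table used above.
#include <cstdio>
#define MAX_SHARED_TYPES 8
int main() {
  double lj1[MAX_SHARED_TYPES * MAX_SHARED_TYPES] = {0};
  int iw = 2, jw = 5;                     // hypothetical atom types
  int itype = MAX_SHARED_TYPES * iw;      // row offset, computed once
  lj1[itype + jw] = 4.5;                  // coefficient for type pair (2,5)
  printf("flat index for (2,5): %d -> %g\n", itype + jw, lj1[itype + jw]);
  return 0;
}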
@@ -43,7 +43,7 @@ int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}

template <class numtyp, class acctyp>
bool LJCL_GPU_MemoryT::init(const int ntypes,
int LJCL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,

@@ -54,14 +54,18 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljcl_cut_gpu_kernel);
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljcl_cut_gpu_kernel);
if (success!=0)
return success;

// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;

@@ -94,7 +98,7 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,

_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}

template <class numtyp, class acctyp>

@@ -132,9 +136,10 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();

@@ -143,19 +148,21 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
}
this->time_pair.stop();
}

@@ -29,15 +29,22 @@ class LJCL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);

/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/

@@ -29,9 +29,8 @@ __win_sort _win_sort;
#endif

template <class numtyp, class acctyp>
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
_vflag(false),_inum(0),_ilist(NULL),
_newton(false) {
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
_max_gpu_bytes(0) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;

@@ -56,28 +55,20 @@ int PairGPUAtomT::bytes_per_atom() const {
int id_space=0;
if (_gpu_nbor)
id_space=2;
int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
int bytes=4*sizeof(numtyp)+id_space;
if (_rot)
bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
bytes+=4*sizeof(numtyp);
if (_charge)
bytes+=sizeof(numtyp);
return bytes;
}

template <class numtyp, class acctyp>
bool PairGPUAtomT::alloc(const int inum, const int nall) {
bool PairGPUAtomT::alloc(const int nall) {
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
if (_newton)
_max_local=_max_atoms;
else
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);

bool success=true;

int ans_elements=4;
if (_rot)
ans_elements+=4;

// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)

@@ -107,8 +98,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
success=success && (host_x.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#endif
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
// Buffer for casting only if different precisions
if (_charge)
success=success && (host_q.alloc(_max_atoms,*dev,

@@ -120,15 +109,13 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {


// --------------------------- Device allocations
_gpu_bytes=0;
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1);
#else
dev_x.view(host_x);
#endif
dev_engv.view(host_engv);
dev_ans.view(host_ans);
if (_rot)
dev_quat.view(host_quat);
if (_charge)

@@ -140,49 +127,80 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
_gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
#endif
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_ans.alloc(ans_elements*_max_local,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_q.row_bytes();
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_quat.row_bytes();
gpu_bytes+=dev_quat.row_bytes();
}
}
if (_gpu_nbor) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_tag.row_bytes();
gpu_bytes+=dev_tag.row_bytes();
}
}

_gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;

_allocated=true;
return success;
}

template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
const bool rot, UCL_Device &devi, const bool gpu_nbor,
bool PairGPUAtomT::add_fields(const bool charge, const bool rot,
const bool gpu_nbor, const bool bonds) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (gpu_nbor && _gpu_nbor==false) {
_gpu_nbor=true;
realloc=true;
}
if (bonds && _bonds==false) {
_bonds=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int max_atoms=_max_atoms;
clear_resize();
return alloc(max_atoms);
}
return true;
}

template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const bool gpu_nbor,
const bool bonds) {
clear();

bool success=true;
_x_avail=false;
_q_avail=false;
_quat_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;

@@ -190,33 +208,25 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
_other=_charge || _rot;
dev=&devi;

_e_fields=1;
if (_charge)
_e_fields++;
_ev_fields=6+_e_fields;

// Initialize atom and nbor data
int ef_inum=inum;
if (ef_inum==0)
ef_inum=1000;
int ef_nall=nall;
if (ef_nall<=ef_inum)
ef_nall=ef_inum*2;
if (ef_nall==0)
ef_nall=2000;

// Initialize timers for the selected device
time_pos.init(*dev);
time_other.init(*dev);
time_answer.init(*dev);
time_q.init(*dev);
time_quat.init(*dev);
time_pos.zero();
time_other.zero();
time_answer.zero();
time_q.zero();
time_quat.zero();
_time_cast=0.0;

#ifdef GPU_CAST
compile_kernels(*dev);
#endif

return success && alloc(ef_inum,ef_nall);
return success && alloc(ef_nall);
}

template <class numtyp, class acctyp>
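Editor's note: add_fields() above lets a style that comes up after another LAMMPS style reuse the already-initialized atom storage, reallocating only when a new field (charge, quaternions, device neighboring, bonds) is first requested. A standalone model of that merging rule (not library code; the struct and output are illustrative):

// Model (assumption) of the add_fields() first-request-reallocates rule.
#include <cstdio>
struct Fields { bool charge=false, rot=false; };
bool add_fields(Fields &f, bool charge, bool rot) {
  bool realloc = (charge && !f.charge) || (rot && !f.rot);
  f.charge |= charge;
  f.rot |= rot;
  return realloc;   // caller would re-allocate device buffers if true
}
int main() {
  Fields f;
  printf("%d\n", add_fields(f, true, false));  // 1: charge newly added
  printf("%d\n", add_fields(f, true, false));  // 0: nothing new, no realloc
  return 0;
}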
@@ -234,16 +244,12 @@ void PairGPUAtomT::clear_resize() {
dev_quat.clear();
host_quat.clear();
}
dev_ans.clear();
dev_engv.clear();
#ifndef GPU_CAST
host_x.clear();
#else
host_x_cast.clear();
host_type_cast.clear();
#endif
host_ans.clear();
host_engv.clear();
dev_cell_id.clear();
dev_particle_id.clear();
dev_tag.clear();

@@ -261,17 +267,14 @@ void PairGPUAtomT::clear_resize() {

template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
_gpu_bytes=0;
_max_gpu_bytes=0;
if (!_allocated)
return;

time_pos.clear();
time_other.clear();
time_answer.clear();
time_q.clear();
time_quat.clear();
clear_resize();
_inum=0;
_eflag=false;
_vflag=false;

#ifdef GPU_CAST
if (_compiled) {

@@ -289,255 +292,10 @@ double PairGPUAtomT::host_memory_usage() const {
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
int ans_bytes=atom_bytes+_ev_fields;
return _max_atoms*atom_bytes*sizeof(numtyp)+
ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(PairGPUAtom<numtyp,acctyp>);
}

template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom) {
time_answer.start();
_eflag=eflag;
_vflag=vflag;
_ef_atom=ef_atom;
_vf_atom=vf_atom;

int csize=_ev_fields;
if (!eflag)
csize-=_e_fields;
if (!vflag)
csize-=6;

if (csize>0)
ucl_copy(host_engv,dev_engv,_inum*csize,true);
if (_rot)
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
time_answer.stop();
}

template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom,
int *ilist) {
_ilist=ilist;
copy_answers(eflag,vflag,ef_atom,vf_atom);
}

template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
double *virial) {
if (_eflag==false && _vflag==false)
return 0.0;

double evdwl=0.0;
if (_gpu_nbor) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
}

evdwl*=0.5;
return evdwl;
}

template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
double *virial, double &ecoul) {
if (_eflag==false && _vflag==false) {
ecoul=0.0;
return 0.0;
}

if (_charge==false)
return energy_virial(eatom,vatom,virial);

double evdwl=0.0;
double _ecoul=0.0;
if (_gpu_nbor) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
}

evdwl*=0.5;
ecoul+=_ecoul*0.5;
return evdwl;
}

template <class numtyp, class acctyp>
void PairGPUAtomT::get_answers(double **f, double **tor) {
acctyp *ap=host_ans.begin();
if (_gpu_nbor) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;
f[i][1]+=*ap;
ap++;
f[i][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
tor[i][0]+=*ap;
ap++;
tor[i][1]+=*ap;
ap++;
tor[i][2]+=*ap;
ap+=2;
}
}
} else {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap+=2;
}
}
}
}

// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void PairGPUAtomT::sort_neighbor(const int num_atoms) {

@@ -23,7 +23,6 @@

#ifdef USE_OPENCL

#include "geryon/ocl_device.h"
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"

@@ -32,7 +31,6 @@ using namespace ucl_opencl;
#else

#include "cudpp.h"
#include "geryon/nvd_device.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"

@@ -40,10 +38,6 @@ using namespace ucl_cudadr;

#endif

#ifndef int2
struct int2 { int x; int y; };
#endif

#include "pair_gpu_precision.h"

template <class numtyp, class acctyp>

@@ -56,13 +50,9 @@ class PairGPUAtom {
inline int max_atoms() const { return _max_atoms; }
/// Current number of local+ghost atoms stored
inline int nall() const { return _nall; }
/// Current number of local atoms stored
inline int inum() const { return _inum; }

/// Set number of local+ghost atoms for future copy operations
inline void nall(const int n) { _nall=n; }
/// Set number of local atoms for future copy operations
inline void inum(const int n) { _inum=n; }

/// Memory usage per atom in this class
int bytes_per_atom() const;

@@ -70,21 +60,33 @@ class PairGPUAtom {
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool init(const int inum, const int nall, const bool charge, const bool rot,
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);

/// Check if we have enough device storage and realloc if not
inline bool resize(const int inum, const int nall, bool &success) {
_inum=inum;
/** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) {
_nall=nall;
if (inum>_max_local || nall>_max_atoms) {
if (nall>_max_atoms) {
clear_resize();
success = success && alloc(inum,nall);
return true;
success = success && alloc(nall);
_resized=true;
}
return false;
return _resized;
}

/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
const bool bonds);

/// Returns true if GPU is using charges
bool charge() { return _charge; }

/// Returns true if GPU is using quaternions
bool quat() { return _rot; }

/// Only free matrices of length inum or nall for resizing
void clear_resize();

@@ -100,28 +102,42 @@ class PairGPUAtom {
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
time_answer.add_to_total();
if (_other)
time_other.add_to_total();
if (_charge)
time_q.add_to_total();
if (_rot)
time_quat.add_to_total();
}

/// Add copy times to timers
inline void zero_timers() {
time_pos.zero();
time_answer.zero();
if (_other)
time_other.zero();
if (_charge)
time_q.zero();
if (_rot)
time_quat.zero();
}

/// Return the total time for host/device data transfer
/** Zeros the total so that the atom times are only included once **/
inline double transfer_time() {
double total=time_pos.total_seconds()+time_answer.total_seconds();
if (_other) total+=time_other.total_seconds();
double total=time_pos.total_seconds();
time_pos.zero_total();
if (_charge) {
total+=time_q.total_seconds();
time_q.zero_total();
}
if (_rot) {
total+=time_q.total_seconds();
time_quat.zero_total();
}

return total;
}

/// Return the total time for data cast/pack
inline double cast_time() { return _time_cast; }
/** Zeros the time so that atom times are only included once **/
inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; }

/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>

@@ -216,43 +232,52 @@ class PairGPUAtom {

// -------------------------COPY TO GPU ----------------------------------

/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }

/// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
if (_x_avail==false) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
}

/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
time_pos.start();
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
if (_x_avail==false) {
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
_x_avail=true;
}
time_pos.stop();
}

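Editor's note: the _x_avail/_q_avail/_quat_avail guards introduced above turn the cast and copy members into one-shot operations per timestep, so several styles sharing the storage trigger only one transfer; data_unavail() rearms them. A standalone model of that flow (not library code):

// Model of the one-shot transfer guard: after data_unavail(), only the
// first add_x_data() performs a copy.
#include <cstdio>
struct AtomData {
  bool x_avail=false;
  void data_unavail() { x_avail=false; }
  void add_x_data() {
    if (!x_avail) { printf("copy positions to device\n"); x_avail=true; }
  }
};
int main() {
  AtomData a;
  a.data_unavail();   // start of timestep: flag cleared
  a.add_x_data();     // first style: triggers the copy
  a.add_x_data();     // second style sharing the storage: no-op
  return 0;
}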
@@ -262,87 +287,68 @@ class PairGPUAtom {
add_x_data(host_ptr,host_type);
}

/// Cast charges to write buffer
// Cast charges to write buffer
template<class cpytyp>
inline void cast_q_data(cpytyp *host_ptr) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
if (_q_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
_time_cast+=MPI_Wtime()-t;
}

/// Copy charges to device asynchronously
// Copy charges to device asynchronously
inline void add_q_data() {
ucl_copy(dev_q,host_q,_nall,true);
if (_q_avail==false) {
ucl_copy(dev_q,host_q,_nall,true);
_q_avail=true;
}
}

/// Cast quaternions to write buffer
// Cast quaternions to write buffer
template<class cpytyp>
inline void cast_quat_data(cpytyp *host_ptr) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
if (_quat_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
_time_cast+=MPI_Wtime()-t;
}

/// Copy quaternions to device
// Copy quaternions to device
/** Copies nall()*4 elements **/
inline void add_quat_data() {
ucl_copy(dev_quat,host_quat,_nall*4,true);
if (_quat_avail==false) {
ucl_copy(dev_quat,host_quat,_nall*4,true);
_quat_avail=true;
}
}

/// Copy data other than pos and data to device
inline void add_other_data() {
time_other.start();
if (_charge)
add_q_data();
if (_rot)
add_quat_data();
time_other.stop();
}

/// Return number of bytes used on device
inline double gpu_bytes() { return _gpu_bytes; }

// -------------------------COPY FROM GPU -------------------------------

/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom);

/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom, int *ilist);

/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial);

/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial,
double &ecoul);

/// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor);
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }

// ------------------------------ DATA ----------------------------------

@@ -352,10 +358,6 @@ class PairGPUAtom {
UCL_D_Vec<numtyp> dev_q;
/// Quaternions
UCL_D_Vec<numtyp> dev_quat;
/// Force and possibly torque
UCL_D_Vec<acctyp> dev_ans;
/// Energy and virial per-atom storage
UCL_D_Vec<acctyp> dev_engv;

#ifdef GPU_CAST
UCL_D_Vec<double> dev_x_cast;

@@ -370,10 +372,6 @@ class PairGPUAtom {
UCL_H_Vec<numtyp> host_q;
/// Buffer for moving quat data to GPU
UCL_H_Vec<numtyp> host_quat;
/// Force and possibly torque data on host
UCL_H_Vec<acctyp> host_ans;
/// Energy/virial data on host
UCL_H_Vec<acctyp> host_engv;

/// Cell list identifiers for device nbor builds
UCL_D_Vec<unsigned> dev_cell_id;

@@ -383,7 +381,7 @@ class PairGPUAtom {
UCL_D_Vec<int> dev_tag;

/// Device timers
UCL_Timer time_pos, time_other, time_answer;
UCL_Timer time_pos, time_q, time_quat;

/// Geryon device
UCL_Device *dev;

@@ -396,19 +394,19 @@ class PairGPUAtom {
#endif

bool _compiled;

bool alloc(const int inum, const int nall);

bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _resized;

bool alloc(const int nall);

bool _allocated, _rot, _charge, _other;
int _max_atoms, _nall;
bool _gpu_nbor, _bonds;
int *_ilist;
double _time_cast;

double _gpu_bytes;
double _max_gpu_bytes;

bool _newton;

#ifndef USE_OPENCL
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;

|
|||
|
||||
#define _HD_BALANCE_EVERY 25
|
||||
#define _HD_BALANCE_WEIGHT 0.5
|
||||
#define _HD_BALANCE_GAP 1.05
|
||||
#define _HD_BALANCE_GAP 1.10
|
||||
|
||||
/// Host/device load balancer
|
||||
template<class numtyp, class acctyp>
|
||||
|
@ -33,7 +33,8 @@ class PairGPUBalance {
|
|||
inline ~PairGPUBalance() { clear(); }
|
||||
|
||||
/// Clear any old data and setup for new LAMMPS run
|
||||
inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const double split);
|
||||
inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const bool gpu_nbor,
|
||||
const double split);
|
||||
|
||||
/// Clear all host and device data
|
||||
inline void clear() {
|
||||
|
@ -43,23 +44,25 @@ class PairGPUBalance {
|
|||
_init_done=false;
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the timestep since initialization
|
||||
inline int timestep() { return _timestep; }
|
||||
|
||||
/// Get a count of the number of particles host will handle for initial alloc
|
||||
inline int first_host_count(const int nlocal,const bool gpu_nbor,
|
||||
const double gpu_split) const {
|
||||
inline int first_host_count(const int nlocal, const double gpu_split,
|
||||
const bool gpu_nbor) const {
|
||||
int host_nlocal=0;
|
||||
if (gpu_nbor && gpu_split!=1.0) {
|
||||
if (gpu_split>0)
|
||||
host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
|
||||
else
|
||||
host_nlocal=static_cast<int>(ceil(0.1*nlocal));
|
||||
host_nlocal=static_cast<int>(ceil(0.05*nlocal));
|
||||
}
|
||||
return host_nlocal;
|
||||
}
|
||||
|
||||
/// Return the number of particles the device will handle this timestep
|
||||
inline int get_gpu_count(const int timestep, const int ago,
|
||||
const int inum_full);
|
||||
inline int get_gpu_count(const int ago, const int inum_full);
|
||||
|
||||
/// Return the average fraction of particles handled by device on all procs
|
||||
inline double all_avg_split() {
|
||||
|
@ -82,10 +85,10 @@ class PairGPUBalance {
|
|||
if (_measure_this_step) {
|
||||
_device->gpu->sync();
|
||||
_device->gpu_barrier();
|
||||
_device->start_host_timer();
|
||||
_device_time.start();
|
||||
_device->gpu->sync();
|
||||
_device->gpu_barrier();
|
||||
_device->start_host_timer();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -95,34 +98,34 @@ class PairGPUBalance {
|
|||
/// Calculate the new host/device split based on the cpu and device times
|
||||
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
|
||||
(and first 10) **/
|
||||
inline void balance(const double cpu_time, const bool gpu_nbor);
|
||||
inline void balance(const double cpu_time);
|
||||
|
||||
/// Calls balance() and then get_gpu_count()
|
||||
inline int balance(const int timestep, const int ago, const int inum_full,
|
||||
const double cpu_time, const bool gpu_nbor) {
|
||||
balance(cpu_time,gpu_nbor);
|
||||
return get_gpu_count(timestep,ago,inum_full);
|
||||
inline int balance(const int ago,const int inum_full,const double cpu_time) {
|
||||
balance(cpu_time);
|
||||
return get_gpu_count(ago,inum_full);
|
||||
}
|
||||
|
||||
private:
|
||||
PairGPUDevice<numtyp,acctyp> *_device;
|
||||
UCL_Timer _device_time;
|
||||
bool _init_done;
|
||||
bool _init_done, _gpu_nbor;
|
||||
|
||||
bool _load_balance;
|
||||
double _actual_split, _avg_split, _desired_split, _max_split;
|
||||
int _avg_count;
|
||||
|
||||
bool _measure_this_step;
|
||||
int _inum, _inum_full;
|
||||
int _inum, _inum_full, _timestep;
|
||||
};
|
||||
|
||||
#define PairGPUBalanceT PairGPUBalance<numtyp,acctyp>
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
|
||||
const double split) {
|
||||
void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
|
||||
const bool gpu_nbor, const double split) {
|
||||
clear();
|
||||
_gpu_nbor=gpu_nbor;
|
||||
_init_done=true;
|
||||
|
||||
_device=gpu;
|
||||
|
@ -130,7 +133,7 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
|
|||
|
||||
if (split<0.0) {
|
||||
_load_balance=true;
|
||||
_desired_split=0.9;
|
||||
_desired_split=0.90;
|
||||
} else {
|
||||
_load_balance=false;
|
||||
_desired_split=split;
|
||||
|
@ -138,14 +141,14 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
|
|||
_actual_split=_desired_split;
|
||||
_avg_split=0.0;
|
||||
_avg_count=0;
|
||||
_timestep=0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
|
||||
const int inum_full) {
|
||||
int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) {
|
||||
_measure_this_step=false;
|
||||
if (_load_balance) {
|
||||
if (_avg_count<11 || timestep%_HD_BALANCE_EVERY==0) {
|
||||
if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
|
||||
_measure_this_step=true;
|
||||
_inum_full=inum_full;
|
||||
}
|
||||
|
@ -156,44 +159,44 @@ int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
|
|||
}
|
||||
_inum=static_cast<int>(floor(_actual_split*inum_full));
|
||||
if (_inum==0) _inum++;
|
||||
_timestep++;
|
||||
return _inum;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) {
|
||||
void PairGPUBalanceT::balance(const double cpu_time) {
|
||||
if (_measure_this_step) {
|
||||
_measure_this_step=false;
|
||||
double gpu_time=_device_time.seconds();
|
||||
|
||||
double max_gpu_time;
|
||||
MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
|
||||
_device->gpu_comm());
|
||||
|
||||
if (_inum_full==_inum) {
|
||||
_desired_split=1.0;
|
||||
return;
|
||||
}
|
||||
|
||||
_measure_this_step=false;
|
||||
double gpu_time=_device_time.seconds();
|
||||
double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
|
||||
double cpu_other_time=_device->host_time()-cpu_time;
|
||||
int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
|
||||
cpu_time_per_atom);
|
||||
|
||||
double cpu_gpu_time[3], max_times[3];
|
||||
cpu_gpu_time[0]=cpu_time/(_inum_full-_inum);
|
||||
cpu_gpu_time[1]=gpu_time/_inum;
|
||||
cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full;
|
||||
double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
|
||||
_desired_split=split*_HD_BALANCE_GAP;
|
||||
if (_desired_split>1.0)
|
||||
_desired_split=1.0;
|
||||
if (_desired_split<0.0)
|
||||
_desired_split=0.0;
|
||||
|
||||
MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX,
|
||||
_device->gpu_comm());
|
||||
double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]);
|
||||
split*=_HD_BALANCE_GAP;
|
||||
|
||||
if (split>1.0)
|
||||
split=1.0;
|
||||
if (_avg_count<10)
|
||||
_desired_split=(_desired_split*_avg_count+split)/(_avg_count+1);
|
||||
else
|
||||
_desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+
|
||||
_HD_BALANCE_WEIGHT*split;
|
||||
|
||||
if (!gpu_nbor) {
|
||||
if (!_gpu_nbor) {
|
||||
if (_desired_split<_max_split)
|
||||
_actual_split=_desired_split;
|
||||
else
|
||||
_actual_split=_max_split;
|
||||
}
|
||||
//std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl;
|
||||
}
|
||||
_avg_split+=_desired_split;
|
||||
_avg_count++;
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "geryon/ucl_nv_kernel.h"
|
||||
#include "nv_kernel_def.h"
|
||||
texture<float4> neigh_tex;
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
|
@ -36,6 +36,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
|
|||
#else
|
||||
|
||||
#define fetch_pos(i,y) x_[i]
|
||||
#define BLOCK_NBOR_BUILD 64
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -54,29 +55,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
|
|||
#define numtyp4 float4
|
||||
#endif
|
||||
|
||||
#define CELL_BLOCK_SIZE 64
|
||||
#define BLOCK_2D 8
|
||||
#define BLOCK_CELL_2D 8
|
||||
|
||||
#define SBBITS 30
|
||||
|
||||
#define SBBITS 30
|
||||
|
||||
__kernel void transpose(int *out, int *in, int columns_in, int rows_in)
|
||||
{
|
||||
__local float block[BLOCK_2D][BLOCK_2D+1];
|
||||
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
||||
|
||||
unsigned ti=THREAD_ID_X;
|
||||
unsigned tj=THREAD_ID_Y;
|
||||
unsigned bi=BLOCK_ID_X;
|
||||
unsigned bj=BLOCK_ID_Y;
|
||||
|
||||
unsigned i=bi*BLOCK_2D+ti;
|
||||
unsigned j=bj*BLOCK_2D+tj;
|
||||
unsigned i=bi*BLOCK_CELL_2D+ti;
|
||||
unsigned j=bj*BLOCK_CELL_2D+tj;
|
||||
if ((i<columns_in) && (j<rows_in))
|
||||
block[tj][ti]=in[j*columns_in+i];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
i=bj*BLOCK_2D+ti;
|
||||
j=bi*BLOCK_2D+tj;
|
||||
i=bj*BLOCK_CELL_2D+ti;
|
||||
j=bi*BLOCK_CELL_2D+tj;
|
||||
if ((i<rows_in) && (j<columns_in))
|
||||
out[j*rows_in+i] = block[ti][tj];
|
||||
}
|
||||
|
@ -141,7 +143,8 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
|
|||
int *cell_particle_id,
|
||||
int *cell_counts,
|
||||
int *nbor_list,
|
||||
int *host_nbor_list,
|
||||
int *host_nbor_list,
|
||||
int *host_numj,
|
||||
int neigh_bin_size,
|
||||
numtyp cell_size,
|
||||
int ncellx, int ncelly, int ncellz,
|
||||
|
@ -154,8 +157,8 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
|
|||
|
||||
int icell = ix + iy*ncellx + iz*ncellx*ncelly;
|
||||
|
||||
__shared__ int cell_list_sh[CELL_BLOCK_SIZE];
|
||||
__shared__ numtyp4 pos_sh[CELL_BLOCK_SIZE];
|
||||
__shared__ int cell_list_sh[BLOCK_NBOR_BUILD];
|
||||
__shared__ numtyp4 pos_sh[BLOCK_NBOR_BUILD];
|
||||
|
||||
int icell_begin = cell_counts[icell];
|
||||
int icell_end = cell_counts[icell+1];
|
||||
|
@ -185,9 +188,9 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
|
|||
neigh_list=neigh_counts+stride;
|
||||
nbor_list[pid_i]=pid_i;
|
||||
} else {
|
||||
stride=nt-inum;
|
||||
neigh_counts=host_nbor_list+pid_i-inum;
|
||||
neigh_list=neigh_counts+stride;
|
||||
stride=1;
|
||||
neigh_counts=host_numj+pid_i-inum;
|
||||
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
|
||||
}
|
||||
|
||||
// loop through neighbors
|
||||
|
@ -203,13 +206,13 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
|
|||
int num_atom_cell = jcell_end - jcell_begin;
|
||||
|
||||
// load jcell to shared memory
|
||||
int num_iter = (int)ceil((numtyp)num_atom_cell/CELL_BLOCK_SIZE);
|
||||
int num_iter = (int)ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
|
||||
|
||||
for (int k = 0; k < num_iter; k++) {
|
||||
int end_idx = min(CELL_BLOCK_SIZE, num_atom_cell-k*CELL_BLOCK_SIZE);
|
||||
int end_idx = min(BLOCK_NBOR_BUILD, num_atom_cell-k*BLOCK_NBOR_BUILD);
|
||||
|
||||
if (tid < end_idx) {
|
||||
pid_j = cell_particle_id[tid+k*CELL_BLOCK_SIZE+jcell_begin];
|
||||
pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
|
||||
cell_list_sh[tid] = pid_j;
|
||||
atom_j = fetch_pos(pid_j,pos); //[pid_j];
|
||||
pos_sh[tid].x = atom_j.x;
|
||||
|
@ -222,20 +225,18 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
|
|||
|
||||
for (int j = 0; j < end_idx; j++) {
|
||||
int pid_j = cell_list_sh[j]; // gather from shared memory
|
||||
if (pid_i<inum || pid_j<inum || pid_j>pid_i) {
|
||||
diff.x = atom_i.x - pos_sh[j].x;
|
||||
diff.y = atom_i.y - pos_sh[j].y;
|
||||
diff.z = atom_i.z - pos_sh[j].z;
|
||||
diff.x = atom_i.x - pos_sh[j].x;
|
||||
diff.y = atom_i.y - pos_sh[j].y;
|
||||
diff.z = atom_i.z - pos_sh[j].z;
|
||||
|
||||
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
|
||||
if (r2 < cell_size*cell_size && r2 > 1e-5) {
|
||||
if (cnt < neigh_bin_size) {
|
||||
*neigh_list = pid_j;
|
||||
neigh_list+=stride;
|
||||
}
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
|
||||
if (r2 < cell_size*cell_size && r2 > 1e-5) {
|
||||
if (cnt < neigh_bin_size) {
|
||||
*neigh_list = pid_j;
|
||||
neigh_list+=stride;
|
||||
}
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
@ -249,9 +250,10 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
|
|||
}
|
||||
|
||||
__kernel void kernel_special(__global int *dev_nbor,
|
||||
__global int *host_nbor_list, __global int *tag,
|
||||
__global int *host_nbor_list,
|
||||
__global int *host_numj, __global int *tag,
|
||||
__global int *nspecial, __global int *special,
|
||||
int inum, int nt, int nall) {
|
||||
int inum, int nt, int nall, int max_nbors) {
|
||||
// ii indexes the two interacting particles in gi
|
||||
int ii=GLOBAL_ID_X;
|
||||
|
||||
|
@ -263,15 +265,17 @@ __kernel void kernel_special(__global int *dev_nbor,
|
|||
int n2=nspecial[ii*3+1];
|
||||
int n3=nspecial[ii*3+2];
|
||||
|
||||
int numj;
|
||||
if (ii < inum) {
|
||||
stride=inum;
|
||||
list=dev_nbor+stride+ii;
|
||||
numj=*list;
|
||||
list+=stride;
|
||||
} else {
|
||||
stride=nt-inum;
|
||||
list=host_nbor_list+ii-inum;
|
||||
stride=1;
|
||||
list=host_nbor_list+(ii-inum)*max_nbors;
|
||||
numj=host_numj[ii-inum];
|
||||
}
|
||||
int numj=*list;
|
||||
list+=stride;
|
||||
list_end=list+numj*stride;
|
||||
|
||||
for ( ; list<list_end; list+=stride) {
|
||||
|
@ -294,4 +298,3 @@ __kernel void kernel_special(__global int *dev_nbor,
|
|||
}
|
||||
} // if ii
|
||||
}
|
||||
|
||||
|
|
|
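Note: together with the host_numj argument added above, kernel_special now reads host-atom neighbors as dense rows of length max_nbors with a separate count array, instead of the old strided layout. A hedged helper (hypothetical name, mirroring the kernel's indexing) showing how such a layout is walked on the host:

// Sketch: fetch the neighbor row for host atom ii (ii >= inum), where
// host_nbor_list stores max_nbors contiguous slots per host atom and
// host_numj stores the per-atom counts separately.
inline const int *host_neighbors(const int *host_nbor_list,
                                 const int *host_numj, int ii, int inum,
                                 int max_nbors, int &numj) {
  const int n = ii - inum;              // index within the host portion
  numj = host_numj[n];                  // count lives in its own array
  return host_nbor_list + n*max_nbors;  // contiguous row, stride 1
}
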
@ -19,13 +19,22 @@
#include "pair_gpu_precision.h"
#include <map>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef USE_OPENCL
#include "pair_gpu_dev_cl.h"
#else
#include "pair_gpu_dev_ptx.h"
#endif

#define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>

template <class numtyp, class acctyp>
PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
_gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0) {
_last_device(0), _compiled(false) {
}

template <class numtyp, class acctyp>

@ -34,14 +43,19 @@ PairGPUDeviceT::~PairGPUDevice() {
}

template <class numtyp, class acctyp>
bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads) {
int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads, const int t_per_atom) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
#endif
_threads_per_atom=t_per_atom;
_threads_per_charge=t_per_atom;

if (_device_init)
return true;
return 0;
_device_init=true;
_comm_world=world;
_comm_replica=replica;

@ -96,7 +110,12 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
// set the device ID
_procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
(last_gpu-first_gpu+1)));
int my_gpu=node_rank/_procs_per_gpu;
int my_gpu=node_rank/_procs_per_gpu+first_gpu;

// Time on the device only if 1 proc per gpu
_time_device=true;
if (_procs_per_gpu>1)
_time_device=false;

// Set up a per device communicator
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);

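Note: the my_gpu change above both round-robins procs onto devices and honors the requested device range via the first_gpu offset. A worked example under assumed values (8 procs per node, devices 2 and 3 requested):

// Worked example of the device-ID mapping (values are hypothetical).
#include <cmath>
#include <cstdio>

int main() {
  const int procs_per_node = 8, first_gpu = 2, last_gpu = 3;
  const int procs_per_gpu =
    static_cast<int>(std::ceil(static_cast<double>(procs_per_node)/
                               (last_gpu - first_gpu + 1)));  // -> 4
  for (int node_rank = 0; node_rank < procs_per_node; node_rank++)
    std::printf("rank %d -> device %d\n", node_rank,
                node_rank/procs_per_gpu + first_gpu);  // 0-3 -> 2, 4-7 -> 3
  return 0;
}
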
@ -104,39 +123,109 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,

gpu=new UCL_Device();
if (my_gpu>=gpu->num_devices())
return false;
return -2;

gpu->set(my_gpu);
return true;

_long_range_precompute=0;

int flag=compile_kernels();

return flag;
}

template <class numtyp, class acctyp>
bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
const int host_nlocal, const int nall,
const int maxspecial, const bool gpu_nbor,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut) {
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut) {
if (!_device_init)
return false;
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;

// Counts of data transfers for timing overhead estimates
_data_in_estimate=0;
_data_out_estimate=1;

// Initial number of local particles
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);

bool gpu_nbor=false;
if (_gpu_mode==GPU_NEIGH)
gpu_nbor=true;

if (_init_count==0) {
// Initialize atom and nbor data
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);
if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
gpu_nbor && maxspecial>0))
return false;
if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor,
gpu_host,pre_cut))
return false;
nbor.cell_size(cell_size);
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
return -3;

_data_in_estimate++;
if (charge)
_data_in_estimate++;
if (rot)
_data_in_estimate++;
} else {
if (cell_size>nbor.cell_size())
nbor.cell_size(cell_size);
if (atom.charge()==false && charge)
_data_in_estimate++;
if (atom.quat()==false && rot)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial))
return -3;
}

if (!ans.init(ef_nlocal,charge,rot,*gpu))
return -3;

if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build))
return -3;
nbor->cell_size(cell_size);

_init_count++;
return true;
return 0;
}

template <class numtyp, class acctyp>
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
const int nall) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;

if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,true,false,*gpu,false,false))
return -3;
} else
if (!atom.add_fields(true,false,false,false))
return -3;

if (!ans.init(nlocal,true,false,*gpu))
return -3;

_init_count++;
return 0;
}

template <class numtyp, class acctyp>
void PairGPUDeviceT::set_single_precompute
(PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm) {
_long_range_precompute=1;
pppm_single=pppm;
}

template <class numtyp, class acctyp>
void PairGPUDeviceT::set_double_precompute
(PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm) {
_long_range_precompute=2;
pppm_double=pppm;
}

template <class numtyp, class acctyp>

@ -152,11 +241,17 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"-------------------------------------\n");
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu);
fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
#ifdef _OPENMP
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
#endif
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n");

for (int i=first_gpu; i<=last_gpu; i++) {
int last=last_gpu+1;
if (last>gpu->num_devices())
last=gpu->num_devices();
for (int i=first_gpu; i<last; i++) {
std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
" GHZ (";

@ -177,32 +272,152 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
}

template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
const double max_bytes, FILE *screen) {
double single[5], times[5];
void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls,
double &gpu_overhead,
double &gpu_driver_overhead) {
UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
UCL_Timer over_timer(*gpu);

single[0]=atom.transfer_time();
if (_data_in_estimate>0) {
host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
timers_in=new UCL_Timer[_data_in_estimate];
}

if (_data_out_estimate>0) {
host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
timers_out=new UCL_Timer[_data_out_estimate];
}

if (kernel_calls>0) {
kernel_data=new UCL_D_Vec<int>[kernel_calls];
timers_kernel=new UCL_Timer[kernel_calls];
}

for (int i=0; i<_data_in_estimate; i++) {
host_data_in[i].alloc(1,*gpu);
dev_data_in[i].alloc(1,*gpu);
timers_in[i].init(*gpu);
}

for (int i=0; i<_data_out_estimate; i++) {
host_data_out[i].alloc(1,*gpu);
dev_data_out[i].alloc(1,*gpu);
timers_out[i].init(*gpu);
}

for (int i=0; i<kernel_calls; i++) {
kernel_data[i].alloc(1,*gpu);
timers_kernel[i].init(*gpu);
}

gpu_overhead=0.0;
gpu_driver_overhead=0.0;

for (int i=0; i<10; i++) {
gpu->sync();
gpu_barrier();
over_timer.start();
gpu->sync();
gpu_barrier();

double driver_time=MPI_Wtime();
for (int i=0; i<_data_in_estimate; i++) {
timers_in[i].start();
ucl_copy(dev_data_in[i],host_data_in[i],true);
timers_in[i].stop();
}

for (int i=0; i<kernel_calls; i++) {
timers_kernel[i].start();
zero(kernel_data[i],1);
timers_kernel[i].stop();
}

for (int i=0; i<_data_out_estimate; i++) {
timers_out[i].start();
ucl_copy(host_data_out[i],dev_data_out[i],true);
timers_out[i].stop();
}
over_timer.stop();

double time=over_timer.seconds();
driver_time=MPI_Wtime()-driver_time;

if (time_device()) {
for (int i=0; i<_data_in_estimate; i++)
timers_in[i].add_to_total();
for (int i=0; i<kernel_calls; i++)
timers_kernel[i].add_to_total();
for (int i=0; i<_data_out_estimate; i++)
timers_out[i].add_to_total();
}

double mpi_time, mpi_driver_time;
MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
gpu_overhead+=mpi_time;
gpu_driver_overhead+=mpi_driver_time;
}
gpu_overhead/=10.0;
gpu_driver_overhead/=10.0;

if (_data_in_estimate>0) {
delete [] host_data_in;
delete [] dev_data_in;
delete [] timers_in;
}

if (_data_out_estimate>0) {
delete [] host_data_out;
delete [] dev_data_out;
delete [] timers_out;
}

if (kernel_calls>0) {
delete [] kernel_data;
delete [] timers_kernel;
}
}

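Note: estimate_gpu_overhead() above measures two distinct fixed costs per timestep: the synchronized device time for empty transfers and kernel launches (over_timer and the UCL timers) and the host-side time to merely enqueue them (MPI_Wtime around the loop), each averaged over 10 repetitions and maxed across the procs sharing a device. A standalone mock of that two-clock measurement pattern, under the assumption that the asynchronous work is enqueued where the comments indicate:

// Mock of the two-clock overhead measurement: enqueue cost vs. total cost.
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  const int reps = 10;
  double driver = 0.0, total = 0.0;
  for (int i = 0; i < reps; i++) {
    double t0 = MPI_Wtime();
    // ... enqueue async copies/kernels here; calls return immediately ...
    driver += MPI_Wtime() - t0;   // host driver overhead only
    // ... synchronize the device here ...
    total += MPI_Wtime() - t0;    // device + driver overhead
  }
  std::printf("driver %.3g s/step, total %.3g s/step\n",
              driver/reps, total/reps);
  MPI_Finalize();
  return 0;
}
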
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes,
const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen) {
double single[8], times[8];

single[0]=atom.transfer_time()+ans.transfer_time();
single[1]=nbor.time_nbor.total_seconds();
single[2]=nbor.time_kernel.total_seconds();
single[3]=time_pair.total_seconds();
single[4]=atom.cast_time();
single[4]=atom.cast_time()+ans.cast_time();
single[5]=gpu_overhead;
single[6]=driver_overhead;
single[7]=ans.cpu_idle_time();

MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);

double my_max_bytes=max_bytes;
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);

if (replica_me()==0)
if (screen && times[3]>0.0) {
if (screen && times[5]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");

if (procs_per_gpu()==1) {
if (time_device()) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);

@ -212,7 +427,71 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size);

fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
}

template <class numtyp, class acctyp>
void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
UCL_Timer &time_out,
UCL_Timer &time_map,
UCL_Timer &time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes,
const double cpu_time,
const double idle_time, FILE *screen) {
double single[8], times[8];

single[0]=time_out.total_seconds();
single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time();
single[2]=time_map.total_seconds();
single[3]=time_rho.total_seconds();
single[4]=time_interp.total_seconds();
single[5]=ans.transfer_time()+ans.cast_time();
single[6]=cpu_time;
single[7]=idle_time;

MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);

double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);

if (replica_me()==0)
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");

if (time_device()) {
fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size);
fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Total rho: %.4f s.\n",
(times[0]+times[2]+times[3])/_replica_size);
fprintf(screen,"Total interp: %.4f s.\n",
(times[1]+times[4])/_replica_size);
fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Total: %.4f s.\n",
(times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
_replica_size);
}
fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);

fprintf(screen,"-------------------------------------");

@ -223,10 +502,17 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
template <class numtyp, class acctyp>
void PairGPUDeviceT::clear() {
if (_init_count>0) {
_long_range_precompute=0;
_init_count--;
if (_init_count==0) {
atom.clear();
nbor.clear();
_nbor_shared.clear();
if (_compiled) {
k_zero.clear();
k_info.clear();
delete dev_program;
_compiled=false;
}
}
}
}

@ -241,21 +527,80 @@ void PairGPUDeviceT::clear_device() {
}
}

template <class numtyp, class acctyp>
int PairGPUDeviceT::compile_kernels() {
int flag=0;

if (_compiled)
return flag;

std::string flags="-cl-mad-enable";
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str());
if (success!=UCL_SUCCESS)
return -4;
k_zero.set_function(*dev_program,"kernel_zero");
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;

UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
k_info.set_size(1,1);
k_info.run(&d_gpu_lib_data.begin());
ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);

#ifndef USE_OPENCL
if (static_cast<double>(h_gpu_lib_data[0])/100.0>gpu->arch())
return -4;
#endif

_num_mem_threads=h_gpu_lib_data[1];
_warp_size=h_gpu_lib_data[2];
if (_threads_per_atom<1)
_threads_per_atom=h_gpu_lib_data[3];
if (_threads_per_charge<1)
_threads_per_charge=h_gpu_lib_data[13];
_pppm_max_spline=h_gpu_lib_data[4];
_pppm_block=h_gpu_lib_data[5];
_block_pair=h_gpu_lib_data[6];
_max_shared_types=h_gpu_lib_data[7];
_block_cell_2d=h_gpu_lib_data[8];
_block_cell_id=h_gpu_lib_data[9];
_block_nbor_build=h_gpu_lib_data[10];
_block_bio_pair=h_gpu_lib_data[11];
_max_bio_shared_types=h_gpu_lib_data[12];

if (static_cast<size_t>(_block_pair)>gpu->group_size())
_block_pair=gpu->group_size();
if (static_cast<size_t>(_block_bio_pair)>gpu->group_size())
_block_bio_pair=gpu->group_size();
if (_threads_per_atom>_warp_size)
_threads_per_atom=_warp_size;
if (_warp_size%_threads_per_atom!=0)
_threads_per_atom=1;
if (_threads_per_charge>_warp_size)
_threads_per_charge=_warp_size;
if (_warp_size%_threads_per_charge!=0)
_threads_per_charge=1;

return flag;
}

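Note: for readability, the layout of the 14-int payload that kernel_info writes and compile_kernels() unpacks above, index by index (the meanings are read directly off the assignments; the enum names themselves are mine):

// Index map for h_gpu_lib_data (names hypothetical, meanings from the code).
enum GpuLibData {
  ARCH_X100              = 0,   // minimum arch * 100, checked vs gpu->arch()
  NUM_MEM_THREADS        = 1,
  WARP_SIZE              = 2,
  DEF_THREADS_PER_ATOM   = 3,   // default when t_per_atom < 1
  PPPM_MAX_SPLINE        = 4,
  PPPM_BLOCK             = 5,
  BLOCK_PAIR             = 6,
  MAX_SHARED_TYPES       = 7,
  BLOCK_CELL_2D          = 8,
  BLOCK_CELL_ID          = 9,
  BLOCK_NBOR_BUILD       = 10,
  BLOCK_BIO_PAIR         = 11,
  MAX_BIO_SHARED_TYPES   = 12,
  DEF_THREADS_PER_CHARGE = 13   // default when t_per_atom < 1; 14 ints total
};
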
template <class numtyp, class acctyp>
double PairGPUDeviceT::host_memory_usage() const {
return atom.host_memory_usage()+
nbor.host_memory_usage()+4*sizeof(numtyp)+
return atom.host_memory_usage()+4*sizeof(numtyp)+
sizeof(PairGPUDevice<numtyp,acctyp>);
}

template class PairGPUDevice<PRECISION,ACC_PRECISION>;
PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads) {
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom) {
return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads);
particle_split,nthreads,t_per_atom);
}
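Note: with lmp_init_device() now forwarding an int status instead of a bool, callers are expected to branch on the documented codes (0 success, -2 device not found, -4 library not compiled for the device; see the header documentation below). A hedged caller sketch with hypothetical error messages:

// Caller-side sketch; codes taken from the header documentation.
int flag = lmp_init_device(world, replica, first_gpu, last_gpu, gpu_mode,
                           particle_split, nthreads, t_per_atom);
if (flag == -2)
  fprintf(stderr, "Could not find the requested accelerator.\n");
else if (flag == -4)
  fprintf(stderr, "GPU library not compiled for this accelerator.\n");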

void lmp_clear_device() {
@ -264,14 +609,5 @@ void lmp_clear_device() {

double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
if (pair_gpu_device.init_count()) {
pair_gpu_device.stop_host_timer();
pair_gpu_device.gpu->sync();
double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
pair_gpu_device.atom.get_answers(f,tor);

return evdw;
}
return 0.0;
return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}

@ -19,11 +19,17 @@
#define PAIR_GPU_DEVICE_H

#include "pair_gpu_atom.h"
#include "pair_gpu_ans.h"
#include "pair_gpu_nbor.h"
#include "pppm_gpu_memory.h"
#include "mpi.h"
#include <sstream>
#include "stdio.h"
#include <string>
#include <queue>

template <class numtyp, class acctyp,
class grdtyp, class grdtyp4> class PPPMGPUMemory;

template <class numtyp, class acctyp>
class PairGPUDevice {

@ -33,10 +39,15 @@ class PairGPUDevice {

/// Initialize the device for use by this process
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using **/
bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successful
* - -2 if GPU not found
* - -4 if GPU library not compiled for GPU **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads);
const double particle_split, const int nthreads,
const int t_per_atom);

/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored

@ -50,19 +61,67 @@ class PairGPUDevice {
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel **/
bool init(const bool charge, const bool rot, const int nlocal,
const int host_nlocal, const int nall, const int maxspecial,
const bool gpu_nbor, const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut);
* than the force kernel
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut);

/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);

/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);

/// Perform charge assignment asynchronously for PPPM
void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
float,_lgpu_float4> *pppm);

/// Perform charge assignment asynchronously for PPPM
void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
double,_lgpu_double4> *pppm);

/// Estimate the overhead from GPU calls from multiple procs
/** \param kernel_calls Number of kernel calls/timestep for timing estimated
* overhead
* \param gpu_overhead Estimated gpu overhead per timestep (sec)
* \param driver_overhead Estimated overhead from driver per timestep (s) **/
void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
double &gpu_driver_overhead);

/// Returns true if double precision is supported on card
inline bool double_precision() { return gpu->double_precision(); }

/// Output a message with timing information
void output_times(UCL_Timer &time_pair, const double avg_split,
const double max_bytes, FILE *screen);
void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen);

/// Output a message with timing information
void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
UCL_Timer & time_map, UCL_Timer & time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes, const double cpu_time,
const double cpu_idle_time, FILE *screen);

/// Clear all memory on host and device associated with atom and nbor data
void clear();

@ -70,11 +129,37 @@ class PairGPUDevice {
/// Clear all memory on host and device
void clear_device();

/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
{ ans_queue.push(ans); }

/// Add "answers" (force,energies,etc.) into LAMMPS structures
inline double fix_gpu(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
atom.data_unavail();
if (ans_queue.empty()==false) {
stop_host_timer();
double evdw=0.0;
while (ans_queue.empty()==false) {
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
ans_queue.pop();
}
return evdw;
}
return 0.0;
}

/// Start timer on host
inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
inline void start_host_timer()
{ _cpu_full=MPI_Wtime(); _host_timer_started=true; }

/// Stop timer on host
inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }
inline void stop_host_timer() {
if (_host_timer_started) {
_cpu_full=MPI_Wtime()-_cpu_full;
_host_timer_started=false;
}
}

/// Return host time
inline double host_time() { return _cpu_full; }
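Note: fix_gpu() above drains a queue of answer objects so several accelerated styles can contribute forces and energies in a single accumulation pass per timestep. A standalone mock of that drain pattern (types and values hypothetical):

// Mock of the answer-queue drain: each style pushes its answer object,
// the fix pops and accumulates exactly once per step.
#include <cstdio>
#include <queue>

struct MockAns { double evdw; double get_answers() { return evdw; } };

int main() {
  std::queue<MockAns*> ans_queue;
  MockAns a{1.5}, b{2.25};          // e.g. two accelerated styles this step
  ans_queue.push(&a);
  ans_queue.push(&b);

  double evdw = 0.0;
  while (!ans_queue.empty()) {      // same loop shape as fix_gpu()
    evdw += ans_queue.front()->get_answers();
    ans_queue.pop();
  }
  std::printf("accumulated evdw = %g\n", evdw);  // -> 3.75
  return 0;
}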
@ -114,6 +199,42 @@ class PairGPUDevice {
inline double particle_split() const { return _particle_split; }
/// Return the initialization count for the device
inline int init_count() const { return _init_count; }
/// True if device is being timed
inline bool time_device() const { return _time_device; }

/// Return the number of threads accessing memory simultaneously
inline int num_mem_threads() const { return _num_mem_threads; }
/// Return the number of threads per atom for pair styles
inline int threads_per_atom() const { return _threads_per_atom; }
/// Return the number of threads per atom for pair styles using charge
inline int threads_per_charge() const { return _threads_per_charge; }
/// Return the min of the pair block size or the device max block size
inline int pair_block_size() const { return _block_pair; }
/// Return the maximum number of atom types that can be used with shared mem
inline int max_shared_types() const { return _max_shared_types; }
/// Return the maximum order for PPPM splines
inline int pppm_max_spline() const { return _pppm_max_spline; }
/// Return the block size for PPPM kernels
inline int pppm_block() const { return _pppm_block; }
/// Return the block size for neighbor binning
inline int block_cell_2d() const { return _block_cell_2d; }
/// Return the block size for atom mapping for neighbor builds
inline int block_cell_id() const { return _block_cell_id; }
/// Return the block size for neighbor build kernel
inline int block_nbor_build() const { return _block_nbor_build; }
/// Return the block size for "bio" pair styles
inline int block_bio_pair() const { return _block_bio_pair; }
/// Return the maximum number of atom types for shared mem with "bio" styles
inline int max_bio_shared_types() const { return _max_bio_shared_types; }

// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
void zero(UCL_D_Vec<int> &mem, const int numel) {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
k_zero.set_size(num_blocks,_block_pair);
k_zero.run(&mem.begin(),&numel);
}

// -------------------------- DEVICE DATA -------------------------

@ -130,11 +251,30 @@ class PairGPUDevice {
// --------------------------- NBOR DATA ----------------------------

/// Neighbor Data
PairGPUNbor nbor;
PairGPUNborShared _nbor_shared;

// ------------------------ LONG RANGE DATA -------------------------

// Long Range Data
int _long_range_precompute;
PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
/// Precomputations for long range charge assignment (asynchronously)
inline void precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *charge, double *boxlo, double *prd) {
if (_long_range_precompute==1)
pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
else if (_long_range_precompute==2)
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
}

private:
std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
int _init_count;
bool _device_init;
bool _device_init, _host_timer_started, _time_device;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size;

@ -142,6 +282,19 @@ class PairGPUDevice {
double _particle_split;
double _cpu_full;

int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;
int _block_pair, _max_shared_types;
int _block_cell_2d, _block_cell_id, _block_nbor_build;
int _block_bio_pair, _max_bio_shared_types;

UCL_Program *dev_program;
UCL_Kernel k_zero, k_info;
bool _compiled;
int compile_kernels();

int _data_in_estimate, _data_out_estimate;

template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;

@ -18,15 +18,9 @@

#include "pair_gpu_precision.h"
#include "pair_gpu_nbor.h"
#include "pair_gpu_device.h"
#include "math.h"

#ifdef USE_OPENCL
#include "pair_gpu_nbor_cl.h"
#else
#include "pair_gpu_nbor_ptx.h"
#include "pair_gpu_build_ptx.h"
#endif

int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
if (_gpu_nbor)
return (max_nbors+2)*sizeof(int);

@ -36,12 +30,18 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
return (max_nbors+3)*sizeof(int);
}

bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum,
const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &devi,
const bool gpu_nbor, const int gpu_host,
const bool pre_cut) {
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build) {
clear();

_block_cell_2d=block_cell_2d;
_block_cell_id=block_cell_id;
_block_nbor_build=block_nbor_build;
_shared=shared;
dev=&devi;
_gpu_nbor=gpu_nbor;
if (gpu_host==0)

@ -80,8 +80,11 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
alloc(success);
if (!success)
return false;

if (_use_packing==false)
compile_kernels(devi);
_shared->compile_kernels(devi,gpu_nbor);

return success;
}

@ -89,13 +92,14 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
void PairGPUNbor::alloc(bool &success) {
dev_nbor.clear();
host_acc.clear();
int nt=_max_atoms+_max_host;
if (_use_packing==false || _gpu_nbor)
success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
else
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev,
success=success && (host_acc.alloc(nt*2,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);

_c_bytes=dev_nbor.row_bytes();

@ -108,11 +112,31 @@ void PairGPUNbor::alloc(bool &success) {
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev,
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();

success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host,
success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
_c_bytes+=dev_host_nbor.row_bytes();
success=success && (dev_host_numj.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
for (int i=0; i<nt; i++)
host_ilist[i]=i;
success=success && (host_jlist.alloc(_max_host,*dev,
UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
}
if (_maxspecial>0) {
dev_nspecial.clear();

@ -145,6 +169,9 @@ void PairGPUNbor::clear() {
dev_host_nbor.clear();
dev_packed.clear();
host_nbor.clear();
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();

@ -152,27 +179,13 @@
time_kernel.clear();
time_nbor.clear();
}

if (_compiled) {
if (_gpu_nbor) {
k_cell_id.clear();
k_cell_counts.clear();
k_build_nbor.clear();
k_transpose.clear();
k_special.clear();
delete build_program;
} else {
k_nbor.clear();
delete nbor_program;
}
_compiled=false;
}
}

double PairGPUNbor::host_memory_usage() const {
if (_gpu_nbor) {
if (_gpu_host)
return host_nbor.row_bytes()*host_nbor.rows();
return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
host_jlist.row_bytes();
else
return 0;
} else

@ -186,7 +199,7 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,

UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,true);
ucl_copy(dev_nbor,ilist_view,false);

UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;

@ -238,46 +251,20 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)/block_size));
k_nbor.set_size(GX,block_size);
k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
time_kernel.stop();
}
}

void PairGPUNbor::compile_kernels(UCL_Device &dev) {
std::string flags="-cl-fast-relaxed-math -cl-mad-enable";

if (_gpu_nbor==false) {
nbor_program=new UCL_Program(dev);
nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());
k_nbor.set_function(*nbor_program,"kernel_unpack");
} else {
build_program=new UCL_Program(dev);
#ifdef USE_OPENCL
std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
exit(1);
#else
build_program->load_string(pair_gpu_build_kernel,flags.c_str());
#endif
k_cell_id.set_function(*build_program,"calc_cell_id");
k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
k_transpose.set_function(*build_program,"transpose");
k_special.set_function(*build_program,"kernel_special");
neigh_tex.get_texture(*build_program,"neigh_tex");
}
_compiled=true;
}

template <class numtyp, class acctyp>
void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
const int nall,
PairGPUAtom<numtyp,acctyp> &atom,
double *boxlo, double *boxhi, int *tag,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, bool &success,
int &mn) {
const int nt=inum+host_inum;

if (_maxspecial>0) {
time_nbor.start();
UCL_H_Vec<int> view_nspecial, view_special, view_tag;

@ -290,25 +277,25 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
time_nbor.stop();
time_nbor.add_to_total();
time_kernel.start();
const int b2x=8;
const int b2y=8;
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
k_transpose.set_size(g2x,g2y,b2x,b2y);
k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial,
&nt);
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
} else
time_kernel.start();

_nbor_pitch=inum;
neigh_tex.bind_float(atom.dev_x,4);
_shared->neigh_tex.bind_float(atom.dev_x,4);

int ncellx, ncelly, ncellz, ncell_3d;
ncellx = static_cast<int>(ceil(((boxhi[0] - boxlo[0]) +
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
2.0*_cell_size)/_cell_size));
ncelly = static_cast<int>(ceil(((boxhi[1] - boxlo[1]) +
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
2.0*_cell_size)/_cell_size));
ncellz = static_cast<int>(ceil(((boxhi[2] - boxlo[2]) +
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
2.0*_cell_size)/_cell_size));
ncell_3d = ncellx * ncelly * ncellz;
UCL_D_Vec<int> cell_counts;

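Note: switching from boxlo/boxhi to sublo/subhi above sizes the cell grid per subdomain rather than per simulation box; the +2*_cell_size term adds one ghost-cell layer on each side. A numeric sketch with an assumed subdomain:

// Cell-grid sizing example: extent 10.0, cell_size 2.5 (hypothetical values).
#include <cmath>
#include <cstdio>

int main() {
  const double sublo = 0.0, subhi = 10.0, cell_size = 2.5;
  const int ncellx =
    static_cast<int>(std::ceil(((subhi - sublo) + 2.0*cell_size)/cell_size));
  std::printf("ncellx = %d\n", ncellx);  // ceil(15.0/2.5) -> 6
  return 0;
}
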
@ -316,35 +303,36 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
_cell_bytes=cell_counts.row_bytes();

/* build cell list on GPU */
const int neigh_block=128;
const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block);
const numtyp boxlo0=static_cast<numtyp>(boxlo[0]);
const numtyp boxlo1=static_cast<numtyp>(boxlo[1]);
const numtyp boxlo2=static_cast<numtyp>(boxlo[2]);
const numtyp boxhi0=static_cast<numtyp>(boxhi[0]);
const numtyp boxhi1=static_cast<numtyp>(boxhi[1]);
const numtyp boxhi2=static_cast<numtyp>(boxhi[2]);
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
const numtyp subhi0=static_cast<numtyp>(subhi[0]);
const numtyp subhi1=static_cast<numtyp>(subhi[1]);
const numtyp subhi2=static_cast<numtyp>(subhi[2]);
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
k_cell_id.set_size(GX,neigh_block);
k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&boxlo0, &boxlo1, &boxlo2, &boxhi0, &boxhi1,
&boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
&subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);

atom.sort_neighbor(nall);

/* calculate cell count */
k_cell_counts.set_size(GX,neigh_block);
k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall,
&ncell_3d);
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(),
&nall, &ncell_3d);

/* build the neighbor list */
const int cell_block=64;
k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &_max_nbors, &cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall);
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
_shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &dev_host_numj.begin(),
&_max_nbors,&cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall);

/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;

@ -353,7 +341,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_host_nbor,nt-inum,false);
ucl_copy(host_offset,dev_host_numj,nt-inum,false);
}
mn=host_acc[0];
for (int i=1; i<nt; i++)

@ -368,10 +356,15 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor,
success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc((mn+1)*_max_host,
success=success && (dev_host_nbor.alloc(mn*_max_host,
dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
}
if (_alloc_packed) {

@ -385,28 +378,29 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
_max_nbors=mn;
time_kernel.stop();
time_kernel.add_to_total();
build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial,
build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
special, success, mn);
return;
}

if (_maxspecial>0) {
const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
k_special.set_size(GX2,cell_block);
k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&atom.dev_tag.begin(), &dev_nspecial.begin(),
&dev_special.begin(), &inum, &nt, &nall);
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&dev_host_numj.begin(), &atom.dev_tag.begin(),
&dev_nspecial.begin(), &dev_special.begin(),
&inum, &nt, &nall, &_max_nbors);
}
time_kernel.stop();

time_nbor.start();
if (_gpu_host)
ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);
ucl_copy(host_nbor,dev_host_nbor,false);
time_nbor.stop();
}

template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>
(const int inum, const int host_inum, const int nall,
PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *boxlo, double *boxhi,
(const int inum, const int host_inum, const int nall,
PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
int *, int **, int **, bool &success, int &mn);

@ -19,32 +19,27 @@
#define PAIR_GPU_NBOR_H

#include "pair_gpu_atom.h"
#include "pair_gpu_nbor_shared.h"

#define IJ_SIZE 131072

#ifdef USE_OPENCL

#include "geryon/ocl_device.h"
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
#include "geryon/ocl_texture.h"
using namespace ucl_opencl;

#else

#include "geryon/nvd_device.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"
using namespace ucl_cudadr;

#endif

class PairGPUNbor {
public:
PairGPUNbor() : _allocated(false), _use_packing(false), _compiled(false) {}
PairGPUNbor() : _allocated(false), _use_packing(false) {}
~PairGPUNbor() { clear(); }

/// Determine whether neighbor unpacking should be used

@ -62,9 +57,11 @@ class PairGPUNbor {
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel **/
bool init(const int inum, const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &dev, const bool gpu_nbor,
const int gpu_host, const bool pre_cut);
bool init(PairGPUNborShared *shared, const int inum, const int host_inum,
const int max_nbors, const int maxspecial, UCL_Device &dev,
const bool gpu_nbor, const int gpu_host, const bool pre_cut,
const int block_cell_2d, const int block_cell_id,
const int block_nbor_build);

/// Set the size of the cutoff+skin
inline void cell_size(const double size) { _cell_size=size; }

@ -131,18 +128,18 @@ class PairGPUNbor {
inline int max_nbors() const { return _max_nbors; }

/// Loop through neighbor count array and return maximum nbors for a particle
inline int max_nbor_loop(const int inum, int *numj) const {
inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[i]);
mn=std::max(mn,numj[ilist[i]]);
return mn;
}

/// Build nbor list on the device
template <class numtyp, class acctyp>
void build_nbor_list(const int inum, const int host_inum, const int nall,
PairGPUAtom<numtyp,acctyp> &atom, double *boxlo,
double *boxhi, int *tag, int **nspecial, int **special,
PairGPUAtom<numtyp,acctyp> &atom, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
bool &success, int &max_nbors);

/// Return the number of bytes used on device

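Note: the ilist indirection added to max_nbor_loop above matters whenever the neighbor list covers only a subset of atoms: numj is indexed by atom index, so counts must be gathered through ilist rather than scanned linearly. A toy illustration with made-up values:

// Gather counts through ilist; a linear numj[i] scan reads the wrong atoms.
#include <algorithm>
#include <cstdio>

int main() {
  int numj[5]  = {7, 1, 9, 2, 3};   // neighbor counts by atom index
  int ilist[3] = {3, 0, 1};         // the atoms actually in this list
  int mn = 0;
  for (int i = 0; i < 3; i++)
    mn = std::max(mn, numj[ilist[i]]);
  std::printf("max nbors = %d\n", mn);  // -> 7; numj[i] would give 9
  return 0;
}
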
@ -176,31 +173,31 @@ class PairGPUNbor {
UCL_H_Vec<int> host_nbor;
/// Device storage for neighbor list matrix that will be copied to host
/** - 1st row is numj
* - Remaining rows are nbors **/
* - Remaining rows are by atom, columns are nbors **/
UCL_D_Vec<int> dev_host_nbor;
UCL_D_Vec<int> dev_host_numj;
UCL_H_Vec<int> host_ilist;
UCL_H_Vec<int*> host_jlist;
/// Device storage for special neighbor counts
UCL_D_Vec<int> dev_nspecial;
/// Device storage for special neighbors
UCL_D_Vec<int> dev_special, dev_special_t;
/// Texture for cached position/type access with CUDA
UCL_Texture neigh_tex;

/// Device timers
UCL_Timer time_nbor, time_kernel;

private:
PairGPUNborShared *_shared;
UCL_Device *dev;
UCL_Program *nbor_program, *build_program;
UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
UCL_Kernel k_transpose, k_special;
bool _allocated, _use_packing, _compiled;
void compile_kernels(UCL_Device &dev);
bool _allocated, _use_packing;
int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
bool _gpu_nbor, _gpu_host, _alloc_packed;
double _cell_size;

double _gpu_bytes, _c_bytes, _cell_bytes;
void alloc(bool &success);

int _block_cell_2d, _block_cell_id, _block_nbor_build;
};

#endif

@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
#define acctyp4 _lgpu_float4
#endif

#define MAX_SHARED_TYPES 8
#define MAX_BIO_SHARED_TYPES 128
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

#endif