git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2011-05-02 15:02:52 +00:00
parent 2be078632d
commit 5f799182b3
70 changed files with 4489 additions and 2253 deletions

View File

@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
BIN_DIR = ./
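
The change repeated across all of these Makefiles is the addition of -DUCL_NO_EXIT: with that define, the Geryon layer reports failures through status codes instead of calling exit(), so the library's init routines can return an error for LAMMPS to handle. The header diffs further below document the resulting convention (0 on success, negative codes on failure). A minimal caller-side sketch, with a hypothetical stub standing in for the real init_atomic() member:

#include <cstdio>

// Hypothetical stub mirroring the convention documented in
// atomic_gpu_memory.h below: 0 on success, negative code on failure.
int init_atomic_stub(bool fail_oom) {
  if (fail_oom)
    return -3;   // -3: out-of-memory error
  return 0;      // 0: success
}

int main() {
  int err = init_atomic_stub(false);
  if (err != 0)
    fprintf(stderr, "GPU init failed with code %d\n", err);
  else
    printf("GPU pair style initialized\n");
  return 0;
}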

View File

@ -17,16 +17,16 @@
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/
CUDA_HOME = /sw/analysis-x64/cuda/3.2/centos5.5_binary/
NVCC = nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_PRECISION = -D_SINGLE_DOUBLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -openmp
CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp
CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
BIN_DIR = ./

View File

@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON
CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT
CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
BIN_DIR = ./

View File

@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
BIN_DIR = ./

View File

@ -17,7 +17,7 @@
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_SINGLE

View File

@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
BIN_DIR = ./

View File

@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v --use_fast_math -m32
CUDR_CPP = mpic++
CUDR_OPTS = -O2 -m32 -g

View File

@ -17,7 +17,7 @@
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT
OCL_LINK = -framework OpenCL
OCL_PREC = -D_SINGLE_SINGLE

View File

@ -13,7 +13,8 @@
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Inderaj Bains (NVIDIA), ibains@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
@ -28,10 +29,11 @@ GPU_LIB = $(LIB_DIR)/libgpu.a
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) nv_kernel_def.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
pair_gpu_device.h pair_gpu_balance.h
PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
pair_gpu_balance.h pppm_gpu_memory.h
ALL_H = $(NVD_H) $(PAIR_H)
@ -39,28 +41,37 @@ EXECS = $(BIN_DIR)/nvc_get_devices
CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
$(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
$(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
$(OBJ_DIR)/charge_gpu_memory.o \
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
$(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
$(OBJ_DIR)/pair_gpu_device.o \
$(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
$(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
$(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
$(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
$(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
$(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
$(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
$(CUDPP)
PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
$(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
$(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
$(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
$(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
$(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
$(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
$(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
$(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
$(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \
$(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h \
$(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
@ -93,6 +104,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
@ -105,11 +119,20 @@ $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
$(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu
$(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h
$(CUDR) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
$(CUDR) -o $@ -c atomic_gpu_memory.cpp
@ -117,6 +140,24 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c
$(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
$(CUDR) -o $@ -c charge_gpu_memory.cpp
$(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu
$(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h
$(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu
$(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h
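
The pair of rules above builds the same pppm_gpu_kernel.cu twice, once with -Dgrdtyp=float and once with -Dgrdtyp=double, yielding single- and double-precision grid kernels from one source file. A stand-alone illustration of the same preprocessor trick (file name and macro default here are illustrative):

// Build twice:  c++ -Dgrdtyp=float  -o demo_f demo.cpp
//               c++ -Dgrdtyp=double -o demo_d demo.cpp
#ifndef grdtyp
#define grdtyp float   // fallback when no -D is supplied
#endif
#include <cstdio>

int main() {
  grdtyp g = (grdtyp)0.1;
  printf("grdtyp occupies %zu bytes\n", sizeof g);
  return 0;
}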
$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h
$(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
@ -144,7 +185,7 @@ $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_
$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
@ -156,7 +197,7 @@ $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_c
$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
@ -168,9 +209,21 @@ $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc
$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu
$(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h
$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu
@ -180,7 +233,7 @@ $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ke
$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
@ -192,9 +245,21 @@ $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj9
$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu
$(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h
$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu
@ -204,7 +269,7 @@ $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_c
$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
@ -216,7 +281,7 @@ $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/c
$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
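
Throughout this Makefile, geryon/file_to_cstr.sh turns a generated .ptx kernel into a header that holds the PTX as one C string, so the host library can embed the kernel and hand it to the CUDA driver for JIT loading at run time. A rough C++ sketch of that conversion (the real script also handles escaping of embedded quotes and backslashes, omitted here for brevity; names are illustrative):

#include <fstream>
#include <iostream>
#include <string>

// Wrap each line of the input file in quotes, producing one long C string
// literal, roughly what file_to_cstr.sh emits into the *_ptx.h headers.
int main(int argc, char **argv) {
  if (argc != 3) { std::cerr << "usage: f2c in.ptx out.h\n"; return 1; }
  std::ifstream in(argv[1]);
  std::ofstream out(argv[2]);
  out << "const char *kernel_str =\n";
  std::string line;
  while (std::getline(in, line))
    out << "\"" << line << "\\n\"\n";
  out << ";\n";
  return 0;
}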

View File

@ -14,6 +14,7 @@
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Inderaj Bains (NVIDIA), ibains@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
@ -23,30 +24,37 @@ OCL_LIB = $(LIB_DIR)/libgpu.a
UCL_H = $(wildcard ./geryon/ucl*.h)
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H)
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
pair_gpu_device.h pair_gpu_balance.h
PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
pair_gpu_balance.h pppm_gpu_memory.h
ALL_H = $(OCL_H) $(PAIR_H)
EXECS = $(BIN_DIR)/ocl_get_devices
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
$(OBJ_DIR)/charge_gpu_memory.o \
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
$(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o \
$(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
$(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
$(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
$(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
$(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
$(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
$(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \
$(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \
$(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
$(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
$(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
$(OBJ_DIR)/crml_gpu_cl.h \
$(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
$(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \
$(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \
$(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \
$(OBJ_DIR)/cmmc_long_gpu_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
all: $(OCL_LIB) $(EXECS)
@ -57,14 +65,23 @@ $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(OCL_H)
$(OCL) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
$(OCL) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h
$(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
$(OCL) -o $@ -c atomic_gpu_memory.cpp
@ -72,6 +89,15 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c
$(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
$(OCL) -o $@ -c charge_gpu_memory.cpp
$(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;
$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h $(OBJ_DIR)/pppm_gpu_cl.h
$(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
$(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
@ -93,7 +119,7 @@ $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
@ -102,7 +128,7 @@ $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
@ -111,16 +137,25 @@ $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h;
$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;
$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
@ -129,16 +164,25 @@ $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h;
$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
@ -147,7 +191,7 @@ $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
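
Unlike the CUDA Makefile, the OpenCL build has no PTX step: file_to_cstr.sh embeds the .cu kernel sources directly as strings in the *_cl.h headers, because OpenCL compiles source for the selected device at run time. A minimal sketch of that load-and-build pattern (kernel body illustrative, error checking trimmed; on macOS the header is <OpenCL/opencl.h>):

#include <CL/cl.h>
#include <cstdio>

// An embedded kernel source string, standing in for a *_cl.h header.
static const char *src =
  "__kernel void scale(__global float *x, float a) {"
  "  int i = get_global_id(0); x[i] *= a; }";

int main() {
  cl_platform_id plat;
  cl_device_id dev;
  clGetPlatformIDs(1, &plat, NULL);
  clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
  cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);
  cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, NULL);
  // JIT-compile the source for this device, as Geryon does internally
  cl_int err = clBuildProgram(prog, 1, &dev, NULL, NULL, NULL);
  printf("build status: %d\n", (int)err);
  clReleaseProgram(prog);
  clReleaseContext(ctx);
  return 0;
}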

View File

@ -14,6 +14,7 @@
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
Peng Wang (Nvidia), penwang@nvidia.com
Inderaj Bains (NVIDIA), ibains@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

View File

@ -23,23 +23,28 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
AtomicGPUMemoryT::~AtomicGPUMemory() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
@ -48,24 +53,30 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
return false;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==false) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
@ -73,9 +84,14 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
pos_tex.bind_float(atom->dev_x,4);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
@ -83,7 +99,10 @@ void AtomicGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
@ -107,8 +126,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj);
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
@ -116,7 +134,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
@ -130,8 +148,8 @@ template <class numtyp, class acctyp>
inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *boxlo,
double *boxhi, int *tag,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
@ -144,10 +162,10 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
@ -156,24 +174,25 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success) {
void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
@ -187,7 +206,8 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
atom->add_x_data(host_x,host_type);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -195,29 +215,32 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
atom->inum(inum);
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
boxlo, boxhi, tag, nspecial, special, success);
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
hd_balancer.start_timer();
@ -226,19 +249,21 @@ int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return device->nbor.host_nbor.begin();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(AtomicGPUMemory<numtyp,acctyp>);
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(AtomicGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
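
A recurring theme in this file's changes is the reworked host-device balancer: hd_balancer.balance() now decides how many of the inum_full particles the GPU takes, ans->inum(inum) records that count, and host_start marks where the CPU's share begins. A fixed-fraction sketch of the partitioning (the real balancer adapts the split dynamically from its timers):

#include <cstdio>

// Fixed-fraction stand-in for hd_balancer.balance(): the first `inum`
// particles are computed on the device, the remainder on the host.
int balance_fixed(int inum_full, double gpu_split) {
  return (int)(inum_full * gpu_split + 0.5);
}

int main() {
  int inum_full = 1000;
  int inum = balance_fixed(inum_full, 0.75);  // 0.75 is an illustrative split
  int host_start = inum;  // CPU handles atoms [host_start, inum_full)
  printf("GPU: %d atoms, CPU: %d atoms\n", inum, inum_full - host_start);
  return 0;
}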

View File

@ -18,8 +18,6 @@
#ifndef ATOMIC_GPU_MEMORY_H
#define ATOMIC_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
@ -39,17 +37,28 @@ class AtomicGPUMemory {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success))
if (atom->resize(nall, success))
pos_tex.bind_float(atom->dev_x,4);
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
@ -85,13 +94,16 @@ class AtomicGPUMemory {
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
time_pair.add_to_total();
atom->acc_timers();
}
/// Zero timers
@ -99,6 +111,7 @@ class AtomicGPUMemory {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
@ -108,24 +121,32 @@ class AtomicGPUMemory {
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int timestep, const int f_ago, const int inum_full,
void compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int * compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
@ -148,6 +169,9 @@ class AtomicGPUMemory {
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
@ -167,8 +191,10 @@ class AtomicGPUMemory {
protected:
bool _compiled;
int _block_size;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
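
Note that the class now declares both a host-neighboring compute() and two device-neighboring variants; the new int** overload additionally hands ilist and jnum back to the caller so LAMMPS can consume the GPU-built list. A hypothetical caller-side dispatch, with stubs in place of the real overloads:

#include <cstdio>

// Stubs standing in for the two neighboring modes declared above.
void compute_host_nbor() { printf("pair loop, host-built neighbor list\n"); }
void compute_dev_nbor()  { printf("pair loop, GPU-built neighbor list\n"); }

int main() {
  bool gpu_nbor = true;  // illustrative; fixed at init time in the library
  if (gpu_nbor)
    compute_dev_nbor();  // int** overload, also returns ilist/jnum
  else
    compute_host_nbor(); // void overload, list copied from the host
  return 0;
}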

View File

@ -23,23 +23,28 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
ChargeGPUMemoryT::~ChargeGPUMemory() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
@ -48,24 +53,31 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
return false;
_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==false) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
@ -74,9 +86,14 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return true;
return success;
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
@ -84,7 +101,10 @@ void ChargeGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
@ -109,7 +129,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj);
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
@ -117,7 +137,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
@ -131,8 +151,8 @@ template <class numtyp, class acctyp>
inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *boxlo,
double *boxhi, int *tag,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
@ -145,10 +165,10 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
@ -157,24 +177,26 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q) {
void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
@ -187,10 +209,14 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_other_data();
atom->add_q_data();
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -198,30 +224,33 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag,
int** ChargeGPUMemoryT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q) {
double *host_q, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
atom->inum(inum);
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
boxlo, boxhi, tag, nspecial, special, success);
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_q_data(host_q);
@ -232,20 +261,25 @@ int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_other_data();
atom->add_q_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return device->nbor.host_nbor.begin();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double ChargeGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(ChargeGPUMemory<numtyp,acctyp>);
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(ChargeGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>

View File

@ -18,8 +18,6 @@
#ifndef CHARGE_GPU_MEMORY_H
#define CHARGE_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
@ -39,19 +37,30 @@ class ChargeGPUMemory {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success)) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
@ -87,13 +96,16 @@ class ChargeGPUMemory {
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
time_pair.add_to_total();
atom->acc_timers();
}
/// Zero timers
@ -101,6 +113,7 @@ class ChargeGPUMemory {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
@ -110,24 +123,25 @@ class ChargeGPUMemory {
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int timestep, const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double *charge);
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge,
const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring
int * compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge);
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
@ -152,6 +166,10 @@ class ChargeGPUMemory {
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
@ -171,8 +189,10 @@ class ChargeGPUMemory {
protected:
bool _compiled;
int _block_size;
int _block_size, _block_bio_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);

View File

@ -28,12 +28,12 @@ static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
CMMMF.clear();
gpu_mode=CMMMF.device->gpu_mode();
double gpu_split=CMMMF.device->particle_split();
@ -54,13 +54,11 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
fflush(screen);
}
if (world_me==0) {
bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->world_barrier();
if (message)
@ -75,45 +73,45 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
CMMMF.estimate_gpu_overhead();
return init_ok;
}
void cmm_gpu_clear() {
CMMMF.clear();
}
int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** cmm_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void cmm_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
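
cmm_gpu_init() above stages initialization: the world-rank-0 process initializes first, all ranks meet at world_barrier(), then the processes sharing each GPU take their turns between gpu_barrier() calls, and init_ok is returned as a status code rather than a bool so failures propagate. A self-contained MPI sketch of the same stagger (grouping by GPU simplified to world ranks):

#include <mpi.h>
#include <cstdio>

// Rank 0 "initializes" first, then the remaining ranks take turns,
// mimicking the barrier-staged setup in cmm_gpu_init().
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  int init_ok = 0;
  if (rank == 0)
    printf("rank 0 initializing first\n");    // world_me==0 branch
  MPI_Barrier(MPI_COMM_WORLD);                // world_barrier()
  for (int i = 0; i < size; i++) {
    if (rank == i && rank != 0)
      printf("rank %d initializing\n", rank); // gpu_rank==i branch
    MPI_Barrier(MPI_COMM_WORLD);              // gpu_barrier() stand-in
  }
  printf("rank %d init_ok=%d\n", rank, init_ok);
  MPI_Finalize();
  return 0;
}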

View File

@ -18,8 +18,6 @@
#ifndef CMM_GPU_KERNEL
#define CMM_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -46,7 +44,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -82,40 +82,56 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -164,8 +180,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -183,49 +238,64 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -273,8 +343,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
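The "Reduce answers" blocks added to both kernels fold the per-thread partials for one atom together with a binary tree over the t_per_atom cooperating threads: partial forces and energies are staged in the __local red_acc array indexed by thread id, and each halving of the stride s adds the upper half of a group onto its lower half until lane 0 holds the total. No barrier is issued between steps, which is presumably safe only while t_per_atom stays within one SIMD unit. A CPU-side C++ model of the same arithmetic, with a plain array standing in for shared memory:

// sketch_tree_reduce.cpp -- host model of the t_per_atom reduction above.
// Each group of t_per_atom consecutive "threads" owns one atom.
#include <cstdio>

int main() {
  const int BLOCK_PAIR = 64;   // kernel block size
  const int t_per_atom = 4;    // threads cooperating on one atom
  float red_acc[BLOCK_PAIR];   // stand-in for __local red_acc[r][tid]

  // Fake per-thread partial energies: thread tid contributed tid+1.
  for (int tid = 0; tid < BLOCK_PAIR; tid++)
    red_acc[tid] = float(tid + 1);

  // Same folding loop as the kernel, applied across the whole block.
  // On the GPU the lanes run in lockstep; here we just iterate them.
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    for (int tid = 0; tid < BLOCK_PAIR; tid++) {
      int offset = tid % t_per_atom;        // lane within the group
      if (offset < (int)s)
        red_acc[tid] += red_acc[tid + s];
    }

  // Lane 0 of each group now holds the group total; group 0 summed
  // partials 1+2+3+4 = 10.
  printf("group 0 total: %g\n", red_acc[0]);
  return 0;
}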

View File

@ -42,22 +42,26 @@ int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmm_cut_gpu_kernel);
int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmm_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int cmm_types=ntypes;
shared_types=false;
if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
cmm_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) {
cmm_types=max_shared_types;
shared_types=true;
}
_cmm_types=cmm_types;
@ -84,7 +88,7 @@ bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -122,9 +126,10 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,16 +138,18 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
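The grid-size change in loop() follows directly from the new thread geometry: with t_per_atom threads assigned to each atom, a block of BX threads covers only BX/t_per_atom atoms, so the block count grows by the same factor. A quick check of the arithmetic (values illustrative):

// sketch_grid_size.cpp -- the launch-geometry arithmetic used in loop().
#include <cmath>
#include <cstdio>

int main() {
  const int BX = 64;           // block size (BLOCK_PAIR)
  const int t_per_atom = 4;    // threads cooperating per atom
  const int inum = 1000;       // local atoms this step (illustrative)

  // Old geometry: one thread per atom.
  int GX_old = static_cast<int>(ceil(static_cast<double>(inum)/BX));
  // New geometry: each block handles BX/t_per_atom atoms.
  int GX_new = static_cast<int>(ceil(static_cast<double>(inum)/
                                     (BX/t_per_atom)));
  printf("GX old=%d new=%d\n", GX_old, GX_new);  // 16 vs 63 blocks
  return 0;
}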

View File

@ -29,13 +29,20 @@ class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
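With init() now reporting an integer status rather than a bool, a caller can turn the codes documented above into messages. A hypothetical sketch of that mapping — the switch mirrors the doc comment, but the function name and wording are illustrative, not part of the library:

// sketch_init_codes.cpp -- mapping the documented init() return codes.
#include <cstdio>
#include <initializer_list>

static const char *init_error(int code) {
  switch (code) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory error";
    case -4: return "GPU library was not compiled for the GPU";
    case -5: return "double precision is not supported on the card";
    default: return "unknown error";
  }
}

int main() {
  for (int code : {0, -1, -3, -4, -5})
    printf("%3d: %s\n", code, init_error(code));
  return 0;
}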

View File

@ -28,14 +28,14 @@ static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
CMMLMF.clear();
gpu_mode=CMMLMF.device->gpu_mode();
double gpu_split=CMMLMF.device->particle_split();
@ -56,15 +56,12 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
fflush(screen);
}
if (world_me==0) {
bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e,g_ewald);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
CMMLMF.device->world_barrier();
if (message)
@ -79,48 +76,51 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CMMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
CMMLMF.estimate_gpu_overhead();
return init_ok;
}
void cmml_gpu_clear() {
CMMLMF.clear();
}
int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** cmml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q,boxlo,prd);
}
void cmml_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
host_q,nlocal,boxlo,prd);
}
double cmml_gpu_bytes() {

View File

@ -18,8 +18,6 @@
#ifndef CMML_GPU_KERNEL
#define CMML_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -54,7 +52,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, __global numtyp *q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -213,8 +229,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -234,51 +291,67 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const numtyp qqrd2e, const numtyp g_ewald,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
lj3[ii]=lj3_in[ii];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -351,8 +424,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Store answers
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
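Both kernel variants above now receive dev_nbor and dev_packed and choose their iteration stride at run time: when the two pointers coincide, the neighbor list is a row-major matrix (one column per atom, row pitch nbor_pitch) and a group of t_per_atom threads strides by t_per_atom*nbor_pitch; otherwise the third row of dev_nbor holds an offset into the densely packed dev_packed array and the stride is simply t_per_atom. A host-side sketch of that selection, with plain integers standing in for the __global pointers:

// sketch_nbor_stride.cpp -- host model of the dev_nbor/dev_packed
// stride selection added to the kernels above.
#include <cstdio>

int main() {
  const int nbor_pitch = 256;   // row pitch of the neighbor matrix
  const int t_per_atom = 4;     // threads cooperating per atom
  const int offset = 1;         // this thread's lane within its group
  bool packed_separate = true;  // models dev_nbor != dev_packed

  int n_stride, first_step;
  if (!packed_separate) {
    // Row-major matrix: neighbor k of atom i lives at i + k*nbor_pitch.
    first_step = offset * nbor_pitch;
    n_stride   = t_per_atom * nbor_pitch;
  } else {
    // Dense packed list: the neighbors of one atom are contiguous.
    first_step = offset;
    n_stride   = t_per_atom;
  }
  printf("lane starts %d elements in, strides by %d\n",
         first_step, n_stride);
  return 0;
}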

View File

@ -43,26 +43,30 @@ int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_long_gpu_kernel);
int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_long_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -95,7 +99,7 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -133,9 +137,10 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -144,19 +149,21 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -29,15 +29,22 @@ class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, int ** cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int ** cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/

View File

@ -28,16 +28,16 @@ static CRML_GPU_Memory<PRECISION,ACC_PRECISION> CRMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
CRMLMF.clear();
gpu_mode=CRMLMF.device->gpu_mode();
double gpu_split=CRMLMF.device->particle_split();
@ -58,16 +58,13 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
fflush(screen);
}
if (world_me==0) {
bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
epsilon,sigma,mix_arithmetic);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
epsilon,sigma,mix_arithmetic);
CRMLMF.device->world_barrier();
if (message)
@ -82,50 +79,54 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald,
cut_lj_innersq, denom_lj, epsilon, sigma,
mix_arithmetic);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
sigma, mix_arithmetic);
CRMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
CRMLMF.estimate_gpu_overhead();
return init_ok;
}
void crml_gpu_clear() {
CRMLMF.clear();
}
int * crml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** crml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return CRMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void crml_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
CRMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
void crml_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd) {
CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double crml_gpu_bytes() {

View File

@ -54,7 +54,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
@ -90,6 +90,7 @@ __inline float fetch_q(const int& i, const float *q)
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_BIO_PAIR 64
#endif
@ -98,18 +99,22 @@ __inline float fetch_q(const int& i, const float *q)
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_, const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald,
const numtyp denom_lj, const numtyp cut_bothsq,
const numtyp cut_ljsq, const numtyp cut_lj_innersq) {
const numtyp cut_ljsq, const numtyp cut_lj_innersq,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
@ -120,29 +125,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -219,8 +236,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -240,50 +298,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_, const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald,
const numtyp denom_lj, const numtyp cut_bothsq,
const numtyp cut_ljsq,
const numtyp cut_lj_innersq) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
ljd[ii]=ljd_in[ii];
ljd[ii+64]=ljd_in[ii+64];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -366,8 +439,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
}
} // for nbor
} // if ii
// Store answers
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
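The factor_lj/factor_coul lookups in these kernels depend on the special-bond bits packed into each neighbor index: sbmask(j) = j >> SBBITS & 3 selects which special_lj/special_coul weight applies, and masking those bits off recovers the atom index. A small sketch of the encoding — SBBITS=30 is assumed here for 32-bit indices, following the convention used elsewhere in LAMMPS:

// sketch_sbmask.cpp -- decoding special-bond bits from a packed
// neighbor index, as the kernels' sbmask() does.
#include <cstdio>

const int SBBITS = 30;                        // assumed bit position
int sbmask(int j) { return j >> SBBITS & 3; }

int main() {
  // Encode: atom index 12345 flagged as a 1-2 special neighbor (code 1).
  int j = 12345 | (1 << SBBITS);
  float sp_lj[4] = {1.0f, 0.0f, 0.0f, 0.5f};  // illustrative weights
  float factor_lj = sp_lj[sbmask(j)];
  int atom = j & ((1 << SBBITS) - 1);         // strip the flag bits
  printf("atom %d, special code %d, factor %g\n",
         atom, sbmask(j), factor_lj);
  return 0;
}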

View File

@ -43,7 +43,7 @@ int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool CRML_GPU_MemoryT::init(const int ntypes,
int CRML_GPU_MemoryT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
@ -56,20 +56,24 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,crml_gpu_kernel);
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,crml_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (this->_block_size>=64 && mix_arithmetic)
if (this->_block_bio_size>=64 && mix_arithmetic)
shared_types=true;
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
int h_size=lj_types*lj_types;
if (h_size<MAX_BIO_SHARED_TYPES)
h_size=MAX_BIO_SHARED_TYPES;
int max_bio_shared_types=this->device->max_bio_shared_types();
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<h_size*32; i++)
@ -79,7 +83,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_lj3,host_lj4);
ljd.alloc(MAX_BIO_SHARED_TYPES,*(this->ucl_device),UCL_READ_ONLY);
ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
@ -99,7 +103,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -125,7 +129,7 @@ double CRML_GPU_MemoryT::host_memory_usage() const {
template <class numtyp, class acctyp>
void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
const int BX=this->_block_bio_size;
int eflag, vflag;
if (_eflag)
eflag=1;
@ -137,9 +141,10 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -147,21 +152,24 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
&_cut_ljsq, &_cut_lj_innersq);
&_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -29,17 +29,24 @@ class CRML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj,
double **epsilon, double **sigma, const bool mix_arithmetic);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj,
double **epsilon, double **sigma, const bool mix_arithmetic);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/

View File

@ -49,14 +49,14 @@ void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen) {
int gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen) {
GBMF.clear();
gpu_mode=GBMF.device->gpu_mode();
double gpu_split=GBMF.device->particle_split();
@ -77,14 +77,12 @@ bool gb_gpu_init(const int ntypes, const double gamma,
fflush(screen);
}
if (world_me==0) {
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split, screen);
GBMF.device->world_barrier();
if (message)
@ -99,22 +97,22 @@ bool gb_gpu_init(const int ntypes, const double gamma,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
epsilon, host_lshape, form, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum, nall,
max_nbors, cell_size, gpu_split, screen);
GBMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
GBMF.estimate_gpu_overhead();
return init_ok;
}
// ---------------------------------------------------------------------------
@ -131,8 +129,8 @@ template <class gbmtyp>
inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
const int host_inum, const int nall,
double **host_x, double **host_quat,
int *host_type, double *boxlo,
double *boxhi, bool &success) {
int *host_type, double *sublo,
double *subhi, bool &success) {
gbm.nbor_time_avail=true;
success=true;
@ -144,7 +142,7 @@ inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
gbm.atom->cast_copy_x(host_x,host_type);
int mn;
gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
boxlo, boxhi, NULL, NULL, NULL, success, mn);
sublo, subhi, NULL, NULL, NULL, success, mn);
gbm.nbor->copy_unpacked(inum,mn);
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
@ -163,7 +161,7 @@ void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,
gbm.nbor_time_avail=true;
int mn=gbm.nbor->max_nbor_loop(inum,numj);
int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist);
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,0,mn,osize,success);
if (!success)
@ -216,9 +214,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/
(BX/gbm._threads_per_atom)));
int stride=gbm.nbor->nbor_pitch();
int ainum=gbm.atom->inum();
int ainum=gbm.ans->inum();
int anall=gbm.atom->nall();
if (gbm.multiple_forms) {
@ -226,7 +225,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
if (gbm.last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
static_cast<double>(BX)));
(BX/gbm._threads_per_atom)));
gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
@ -237,11 +236,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
&stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall,
&gbm._threads_per_atom);
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.atom->inum()) {
if (gbm.last_ellipse==gbm.ans->inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
@ -254,9 +254,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
gbm.last_ellipse)/BX));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
gbm.last_ellipse)/
(BX/gbm._threads_per_atom)));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
gbm.time_kernel2.stop();
@ -266,13 +267,14 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(),
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall);
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
gbm.time_gayberne2.stop();
} else {
gbm.atom->dev_ans.zero();
gbm.atom->dev_engv.zero();
gbm.ans->dev_ans.zero();
gbm.ans->dev_engv.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
@ -284,29 +286,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.atom->inum()) {
if (gbm.last_ellipse<gbm.ans->inum()) {
if (gbm.shared_types) {
GBMF.k_lj_fast.set_size(GX,BX);
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
&stride, &gbm.nbor->dev_packed.begin(),
&gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
&gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
} else {
GBMF.k_lj.set_size(GX,BX);
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm._lj_types,
&gbm.gamma_upsilon_mu.begin(), &stride,
&gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
&gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
}
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
@ -315,9 +319,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(), &ainum,
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall);
&stride, &gbm.ans->dev_ans.begin(), &ainum,
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom);
gbm.time_gayberne.stop();
}
}
@ -326,30 +330,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// Reneighbor on GPU if necessary and then compute forces, torques, energies
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom,
inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
double *sublo, double *subhi, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double **host_quat) {
int **ilist, int **jnum, const double cpu_time,
bool &success, double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
host_start=0;
gbm.zero_timers();
return NULL;
}
gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full);
gbm.atom->inum(inum);
gbm.hd_balancer.balance(cpu_time);
int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
host_quat, host_type, boxlo, boxhi, success);
host_quat, host_type, sublo, subhi, success);
if (!success)
return NULL;
gbm.atom->cast_quat_data(host_quat[0]);
@ -361,47 +366,49 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
gbm.atom->add_x_data(host_x,host_type);
}
gbm.atom->add_other_data();
gbm.atom->add_quat_data();
*ilist=gbm.nbor->host_ilist.begin();
*jnum=gbm.nbor->host_acc.begin();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return gbm.device->nbor.host_nbor.begin();
return gbm.nbor->host_jlist.begin()-host_start;
}
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double **host_quat) {
return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x,
host_type, boxlo, boxhi, eflag, vflag, eatom, vatom,
host_start, cpu_time, success, host_quat);
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double **host_quat) {
return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo,
subhi, eflag, vflag, eatom, vatom, host_start, ilist,
jnum, cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, torques,..
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
const int inum_full,const int nall,double **host_x,
int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double **host_quat) {
inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
const int nall,double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
host_start=0;
gbm.zero_timers();
return NULL;
}
int ago=gbm.hd_balancer.ago_first(f_ago);
int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time,
gbm.nbor->gpu_nbor());
gbm.atom->inum(inum);
int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
@@ -421,21 +428,21 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
gbm.atom->add_other_data();
gbm.atom->add_quat_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return list;
}
int * gb_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double **host_quat) {
return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x,
int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) {
return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x,
host_type, ilist, numj, firstneigh, eflag, vflag,
eatom, vatom, host_start, cpu_time, success,
host_quat);
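Note that the balancer calls lost their timestep argument; the split decision itself is unchanged: choose how many of the inum_full atoms the GPU owns this step, with the host taking the tail starting at host_start. A toy fixed-split model of that decision (names and the 0.75 split are hypothetical):

#include <cstdio>

// Toy stand-in for the host-device balancer's get_gpu_count().
struct Balancer {
  double gpu_split;
  int get_gpu_count(int /*ago*/, int inum_full) {
    return (int)(gpu_split * inum_full);
  }
};

int main() {
  Balancer hd_balancer{0.75};
  int inum = hd_balancer.get_gpu_count(0, 1000);
  int host_start = inum;          // CPU handles atoms [host_start, inum_full)
  printf("GPU atoms: %d, host_start: %d\n", inum, host_start);
  return 0;
}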

View File

@@ -18,7 +18,6 @@
#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H
#define MAX_SHARED_TYPES 8
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef _DOUBLE_DOUBLE
@@ -47,7 +46,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
#else
@@ -58,6 +57,8 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
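With BLOCK_PAIR and MAX_SHARED_TYPES now supplied here for OpenCL (and by nv_kernel_def.h under NV_KERNEL), one kernel source builds for both back ends. A toy kernel written only against this macro vocabulary, assuming the elided part of the header maps THREAD_ID_X, __kernel, __global and __local the way ucl_nv_kernel.h does; it compiles as OpenCL directly, or as CUDA through those mappings (launch with a local size of at most BLOCK_PAIR):

// Toy per-block sum using only the portability macros.
__kernel void block_sum(__global float *in, __global float *out) {
  __local float scratch[BLOCK_PAIR];
  int tid = THREAD_ID_X;
  scratch[tid] = in[BLOCK_ID_X * BLOCK_SIZE_X + tid];
  __syncthreads();
  for (int s = BLOCK_SIZE_X / 2; s > 0; s >>= 1) {
    if (tid < s)
      scratch[tid] += scratch[tid + s];
    __syncthreads();
  }
  if (tid == 0)
    out[BLOCK_ID_X] = scratch[0];
}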

View File

@@ -97,17 +97,17 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int nall) {
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
@@ -121,262 +121,309 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp g2[9];
{
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// -- replace r12 with r12 hat
// Compute b12
numtyp b2[9], b12[9];
{
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
} // if ii
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
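When t_per_atom>1 each atom's force and torque are spread across t_per_atom consecutive threads, so the kernel folds them with the shared-memory tree reduction above before the thread with offset==0 stores the result; the reduction loop carries no barrier, which presumably relies on t_per_atom not exceeding the warp/wavefront width. The same fold in isolation, as a sequential sketch:

#include <cstdio>

// Tree reduction over t_per_atom partial sums, one fold per halving of s.
int main() {
  const int t_per_atom = 4;
  float red_acc[t_per_atom] = {1.f, 2.f, 3.f, 4.f};    // partial forces
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    for (unsigned int offset = 0; offset < s; offset++) // "if (offset < s)"
      red_acc[offset] += red_acc[offset + s];
  printf("reduced force = %g\n", red_acc[0]);           // what offset==0 stores
  return 0;
}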

View File

@@ -34,33 +34,36 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int nall) {
__local numtyp sp_lj[4];
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+stride*numj;
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
@@ -69,7 +72,7 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@@ -241,8 +244,47 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@@ -265,39 +307,42 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall) {
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=stride) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@@ -338,8 +383,47 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
@@ -361,50 +445,54 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag,const int vflag, const int start,
const int inum, const int nall) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=gum[ii+3];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<4)
sp_lj[tid]=gum[tid+3];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=stride) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@@ -443,8 +531,47 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;

View File

@@ -18,8 +18,6 @@
#ifndef PAIR_GPU_KERNEL_H
#define PAIR_GPU_KERNEL_H
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@@ -32,7 +30,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
#else
@@ -42,6 +40,7 @@
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define MAX_SHARED_TYPES 8
#endif

View File

@@ -32,30 +32,35 @@ template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
_max_bytes(0.0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor;
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *_screen) {
int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *_screen) {
nbor_time_avail=false;
screen=_screen;
@@ -64,24 +69,24 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host,
max_nbors,cell_size,true))
return false;
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
_gpu_host,max_nbors,cell_size,true);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
_block_size=device->pair_block_size();
compile_kernels(*ucl_device);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
@@ -90,8 +95,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=device->max_shared_types();
if (lj_types<=max_shared_types && _block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@@ -186,12 +192,19 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
}
if (multiple_forms)
atom->dev_ans.zero();
ans->dev_ans.zero();
_max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
// Memory for ilist ordered by particle type
return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
return 0;
else return -3;
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
@@ -209,9 +222,9 @@ void GB_GPU_MemoryT::clear() {
// Output any timing information
acc_timers();
double single[6], times[6];
double single[9], times[9];
single[0]=atom->transfer_time();
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
nbor->time_kernel.total_seconds();
@@ -220,15 +233,18 @@ void GB_GPU_MemoryT::clear() {
single[4]=time_pair.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time();
single[5]=atom->cast_time()+ans->cast_time();
single[6]=_gpu_overhead;
single[7]=_driver_overhead;
single[8]=ans->cpu_idle_time();
MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
gamma_upsilon_mu.row_bytes();
gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
@@ -255,10 +271,19 @@ void GB_GPU_MemoryT::clear() {
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
}
_max_bytes=0.0;
@@ -299,10 +324,9 @@ void GB_GPU_MemoryT::clear() {
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(GB_GPU_Memory<numtyp,acctyp>)+
device->nbor.max_atoms()*sizeof(int);
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(GB_GPU_Memory<numtyp,acctyp>)+
nbor->max_atoms()*sizeof(int);
}
template <class numtyp, class acctyp>

View File

@@ -18,8 +18,6 @@
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
@@ -35,23 +33,34 @@ class GB_GPU_Memory {
* \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \return false if there is not sufficient memory or device init prob **/
bool init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *screen);
* \return false if there is not sufficient memory or device init prob
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *screen);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
atom->resize(inum, nall, success);
if (multiple_forms) atom->dev_ans.zero();
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
atom->resize(nall, success);
ans->resize(inum, success);
if (multiple_forms) ans->dev_ans.zero();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
@@ -74,7 +83,7 @@ class GB_GPU_Memory {
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
@@ -91,19 +100,22 @@ class GB_GPU_Memory {
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_kernel.add_to_total();
time_gayberne.add_to_total();
if (multiple_forms) {
time_kernel2.add_to_total();
time_gayberne2.add_to_total();
time_pair.add_to_total();
}
atom->acc_timers();
ans->acc_timers();
}
time_kernel.add_to_total();
time_gayberne.add_to_total();
if (multiple_forms) {
time_kernel2.add_to_total();
time_gayberne2.add_to_total();
time_pair.add_to_total();
}
atom->acc_timers();
}
/// Accumulate timers
@@ -117,6 +129,7 @@ class GB_GPU_Memory {
time_pair.zero();
}
atom->zero_timers();
ans->zero_timers();
}
// -------------------------- DEVICE DATA -------------------------
@@ -168,6 +181,10 @@ class GB_GPU_Memory {
int last_ellipse, max_last_ellipse;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
@@ -183,10 +200,12 @@ class GB_GPU_Memory {
UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
inline int block_size() { return _block_size; }
int _threads_per_atom;
private:
bool _allocated, _compiled;
int _block_size;
double _max_bytes;
double _gpu_overhead, _driver_overhead;
void compile_kernels(UCL_Device &dev);
};
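The init() return convention documented above (0 on success, negative codes for specific failures) moves error reporting to the caller. A minimal decoder consistent with those codes (the message strings are a sketch, not LAMMPS's actual output):

#include <cstdio>

const char *init_error(int code) {
  switch (code) {
    case 0:  return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on device";
    case -4: return "GPU library not compiled for this accelerator";
    case -5: return "double precision not supported on card";
    default: return "unknown error";
  }
}

int main() {
  printf("init: %s\n", init_error(-3));
  return 0;
}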

View File

@@ -1,2 +1,2 @@
Geryon Version 10.280
Geryon Version 11.094

View File

@@ -167,6 +167,7 @@ class UCL_Device {
int _device, _num_devices;
std::vector<cudaDeviceProp> _properties;
std::vector<cudaStream_t> _cq;
std::vector<int> _device_ids;
};
// Grabs the properties for all devices
@@ -178,6 +179,7 @@ inline UCL_Device::UCL_Device() {
if (deviceProp.major == 9999 && deviceProp.minor == 9999)
break;
_properties.push_back(deviceProp);
_device_ids.push_back(dev);
}
_device=-1;
_cq.push_back(cudaStream_t());
@@ -194,7 +196,7 @@ inline void UCL_Device::set(int num) {
return;
for (int i=1; i<num_queues(); i++) pop_command_queue();
cudaThreadExit();
CUDA_SAFE_CALL_NS(cudaSetDevice(num));
CUDA_SAFE_CALL_NS(cudaSetDevice(_device_ids[num]));
_device=num;
}
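The indirection through _device_ids is needed because devices reporting compute capability 9999 (the emulation placeholder) are skipped when _properties is built, so Geryon's device index no longer matches the raw CUDA ordinal. The filter-and-remap in miniature (device data hypothetical):

#include <cstdio>
#include <vector>

int main() {
  int major[] = {2, 9999, 1, 3};       // pretend ordinal 1 is the placeholder
  std::vector<int> device_ids;         // geryon index -> CUDA ordinal
  for (int dev = 0; dev < 4; dev++)
    if (major[dev] != 9999)
      device_ids.push_back(dev);
  int num = 1;                         // user selects geryon device 1
  printf("cudaSetDevice(%d)\n", device_ids[num]);  // ordinal 2, not 1
  return 0;
}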

View File

@@ -42,6 +42,7 @@ inline void ucl_sync(CUstream &stream) {
}
struct NVDProperties {
int device_id;
std::string name;
int major;
int minor;
@@ -208,15 +209,20 @@ inline UCL_Device::UCL_Device() {
for (int dev=0; dev<_num_devices; ++dev) {
CUdevice m;
CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
int major, minor;
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
if (major==9999)
continue;
_properties.push_back(NVDProperties());
_properties.back().device_id=dev;
_properties.back().major=major;
_properties.back().minor=minor;
char namecstr[1024];
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
_properties.back().name=namecstr;
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major,
&_properties.back().minor,m));
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@@ -262,9 +268,9 @@ inline void UCL_Device::set(int num) {
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
for (int i=1; i<num_queues(); i++) pop_command_queue();
}
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,num));
_device=_properties[num].device_id;
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
CU_SAFE_CALL_NS(cuCtxCreate(&_context,0,_cu_device));
_device=num;
}
// List all devices along with all properties

View File

@@ -25,6 +25,7 @@
#define NVD_TIMER_H
#include "nvd_macros.h"
#include "nvd_device.h"
namespace ucl_cudadr {
@@ -66,12 +67,23 @@ class UCL_Timer {
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CU_SAFE_CALL(cuEventSynchronize(start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero() {
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
}
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
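The new sync_start()/sync_stop() methods let the host block on a recorded event without reading elapsed time, which the overhead estimation relies on. A self-contained driver-API program exercising the same event pattern that backs this timer (no kernel is launched, so the elapsed time is near zero; link with -lcuda):

#include <cuda.h>
#include <cstdio>

#define CU_CHECK(x) do { CUresult r = (x); if (r != CUDA_SUCCESS) { \
  printf("CUDA error %d at line %d\n", (int)r, __LINE__); return 1; } } while (0)

int main() {
  CU_CHECK(cuInit(0));
  CUdevice dev;  CU_CHECK(cuDeviceGet(&dev, 0));
  CUcontext ctx; CU_CHECK(cuCtxCreate(&ctx, 0, dev));
  CUevent start_event, stop_event;
  CU_CHECK(cuEventCreate(&start_event, CU_EVENT_DEFAULT));
  CU_CHECK(cuEventCreate(&stop_event, CU_EVENT_DEFAULT));
  CU_CHECK(cuEventRecord(start_event, 0));   // UCL_Timer::start()
  CU_CHECK(cuEventRecord(stop_event, 0));    // UCL_Timer::stop()
  CU_CHECK(cuEventSynchronize(stop_event));  // UCL_Timer::sync_stop()
  float ms;
  CU_CHECK(cuEventElapsedTime(&ms, start_event, stop_event));
  printf("elapsed: %f ms\n", ms);
  CU_CHECK(cuCtxDestroy(ctx));
  return 0;
}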

View File

@@ -25,6 +25,7 @@
#define OCL_TIMER_H
#include "ocl_macros.h"
#include "ocl_device.h"
namespace ucl_opencl {
@@ -67,10 +68,21 @@ class UCL_Timer {
/// Stop timing on default command queue
inline void stop() { clEnqueueMarker(_cq,&stop_event); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()

View File

@@ -13,7 +13,7 @@
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@@ -206,6 +206,191 @@
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29, class t30>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
}
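Every overload above forwards N pointers to add_arg one by one; Geryon predates C++11, hence the fixed-arity ladder up to 30 arguments. For comparison only, a single C++17 variadic definition covers the whole family (the add_arg here is a stand-in, not Geryon's):

#include <cstdio>

template <class T> void add_arg(T *arg) { printf("arg at %p\n", (void *)arg); }

// One variadic definition replacing every fixed-arity add_args overload.
template <class... Ts> void add_args(Ts *...args) { (add_arg(args), ...); }

int main() {
  int a = 1; double b = 2.5; float c = 0.5f;
  add_args(&a, &b, &c);   // any arity, no per-count overload needed
  return 0;
}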
// ---------------------------------------------------------------------------
@@ -439,6 +624,211 @@
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29, class t30>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run();
}
// ---------------------------------------------------------------------------
template <class t1>
@@ -671,3 +1061,208 @@
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29, class t30>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run(cq);
}
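The overloads above are purely mechanical: each one clears the argument list, forwards every pointer with add_arg(), and launches. A minimal usage sketch, assuming a UCL_Kernel named k_pair whose launch geometry is already chosen (k_pair, GX, BX, and the argument names are illustrative, not library identifiers):
k_pair.set_size(GX,BX);                           // grid and block size
k_pair.run(&lj1.begin(), &eflag, &vflag, &ainum); // clear_args + 4x add_arg + run()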

View File

@ -13,7 +13,7 @@
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -61,20 +61,23 @@ class UCL_D_Mat : public UCL_BaseMat {
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_rows=rows;
_cols=cols;
int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
exit(1);
#endif
return err;
}
_kind=kind;
_rows=rows;
_cols=cols;
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifdef _OCL_MAT
_offset=0;
@ -94,20 +97,23 @@ class UCL_D_Mat : public UCL_BaseMat {
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_rows=rows;
_cols=cols;
int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
exit(1);
#endif
return err;
}
_kind=kind;
_rows=rows;
_cols=cols;
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifdef _OCL_MAT
_offset=0;
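With -DUCL_NO_EXIT on the compile line (added to the Makefiles in this commit), alloc() now returns the error code to the caller instead of printing a message and calling exit(1). A minimal sketch, assuming a UCL_Device named dev and a column count n:
UCL_D_Vec<float> v;
int err=v.alloc(n,dev,UCL_READ_WRITE);
if (err!=UCL_SUCCESS) {
  // recover gracefully, e.g. shrink the request or fall back to the host
}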

View File

@ -13,7 +13,7 @@
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -60,19 +60,24 @@ class UCL_D_Vec : public UCL_BaseMat {
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
int err=_device_alloc(*this,cq,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
_kind=kind;
_cols=cols;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifdef _OCL_MAT
_offset=0;
@ -90,19 +95,23 @@ class UCL_D_Vec : public UCL_BaseMat {
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
int err=_device_alloc(*this,device,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
_kind=kind;
_cols=cols;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifdef _OCL_MAT
_offset=0;

View File

@ -13,7 +13,7 @@
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -39,7 +39,11 @@ class UCL_H_Mat : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { }
UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
/// Construct with specified number of rows and columns
@ -59,18 +63,23 @@ class UCL_H_Mat : public UCL_BaseMat {
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_rows=rows;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,cq,_row_bytes*_rows,kind);
#ifndef UCL_NO_EXIT
int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
#endif
_cols=cols;
_rows=rows;
_kind=kind;
_end=_array+rows*cols;
return err;
}
@ -85,19 +94,24 @@ class UCL_H_Mat : public UCL_BaseMat {
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_rows=rows;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,device,_row_bytes*_rows,kind);
_end=_array+rows*cols;
#ifndef UCL_NO_EXIT
int err=_host_alloc(*this,device,_row_bytes*rows,kind);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
#endif
_cols=cols;
_rows=rows;
_kind=kind;
_end=_array+rows*cols;
return err;
}

View File

@ -13,7 +13,7 @@
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -39,7 +39,11 @@ class UCL_H_Vec : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { }
UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
/// Construct with n columns
@ -59,18 +63,24 @@ class UCL_H_Vec : public UCL_BaseMat {
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,cq,_row_bytes,kind);
_end=_array+cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
#endif
_cols=cols;
_kind=kind;
_end=_array+cols;
return err;
}
@ -84,18 +94,24 @@ class UCL_H_Vec : public UCL_BaseMat {
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,device,_row_bytes,kind);
_end=_array+cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
#endif
_cols=cols;
_kind=kind;
_end=_array+cols;
return err;
}

View File

@ -13,7 +13,7 @@
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -25,8 +25,18 @@
#ifndef UCL_NV_KERNEL_H
#define UCL_NV_KERNEL_H
#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y)
#if (__CUDA_ARCH__ < 200)
#define mul24 __mul24
#define MEM_THREADS 16
#else
#define mul24(X,Y) (X)*(Y)
#define MEM_THREADS 32
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
@ -35,8 +45,9 @@
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define mul24 __mul24
#define __global
#define __inline static __inline__ __device__
#define atom_add atomicAdd
#endif
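The arch guard above reflects that __mul24 is only a win on sm_1x parts; from sm_20 (Fermi) onward a full 32-bit multiply is at least as fast, so mul24(X,Y) simply expands to (X)*(Y). A minimal CUDA sketch using the macros from this header (the kernel name is illustrative):
__kernel void fill_gid(__global int *out) {
  int gid=GLOBAL_ID_X;  // threadIdx.x + mul24(blockIdx.x,blockDim.x)
  out[gid]=gid;
}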

View File

@ -28,11 +28,11 @@ static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJ96MF.clear();
gpu_mode=LJ96MF.device->gpu_mode();
double gpu_split=LJ96MF.device->particle_split();
@ -53,13 +53,11 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJ96MF.device->world_barrier();
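// (annotation) Rank 0 of the world initializes first so any one-time
// kernel compilation happens once; after this barrier, one process per
// GPU repeats init() on the remaining ranks, serialized by the
// gpu_barrier() calls in the loop below.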
if (message)
@ -74,46 +72,46 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJ96MF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
LJ96MF.estimate_gpu_overhead();
return init_ok;
}
void lj96_gpu_clear() {
LJ96MF.clear();
}
int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** lj96_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void lj96_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
void lj96_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lj96_gpu_bytes() {

View File

@ -18,8 +18,6 @@
#ifndef LJ96_GPU_KERNEL
#define LJ96_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -46,7 +44,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
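// (annotation) Two neighbor layouts are handled: when dev_packed aliases
// dev_nbor, entries for atom i are strided by nbor_pitch and the
// t_per_atom cooperating threads interleave on that stride; otherwise
// *nbor is an offset into the densely packed dev_packed array and the
// threads interleave with unit stride.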
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -157,8 +172,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
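// (annotation) Each atom was handled by t_per_atom cooperating threads,
// so their partial force/energy sums are combined by a strided
// shared-memory tree reduction: each pass, the lower half of the
// cooperating threads adds in the upper half, until offset 0 holds the
// per-atom total.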
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -176,49 +230,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const int nall, const int nbor_pitch,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -258,8 +328,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;

View File

@ -42,7 +42,7 @@ int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool LJ96_GPU_MemoryT::init(const int ntypes,
int LJ96_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
@ -50,14 +50,18 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96_cut_gpu_kernel);
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -84,7 +88,7 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -122,9 +126,10 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,16 +138,18 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -29,13 +29,20 @@ class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
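A caller-side sketch mapping the return codes documented above to messages; the helper name and strings are assumptions for illustration, not library API:
inline const char *init_error_str(int err) {
  switch (err) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on the device";
    case -4: return "GPU library not compiled for this GPU";
    case -5: return "double precision not supported on this card";
    default: return "unknown initialization error";
  }
}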
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/

View File

@ -28,12 +28,11 @@ static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljl_gpu_init(const int ntypes, double **cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJLMF.clear();
gpu_mode=LJLMF.device->gpu_mode();
double gpu_split=LJLMF.device->particle_split();
@ -54,13 +53,11 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJLMF.device->world_barrier();
if (message)
@ -75,45 +72,45 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
LJLMF.estimate_gpu_overhead();
return init_ok;
}
void ljl_gpu_clear() {
LJLMF.clear();
}
int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int ** ljl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void ljl_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

View File

@ -18,8 +18,6 @@
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -46,7 +44,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -156,8 +171,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -175,49 +229,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const int nall, const int nbor_pitch,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -256,8 +326,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;

View File

@ -42,22 +42,26 @@ int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool LJL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_cut_gpu_kernel);
int LJL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -84,7 +88,7 @@ bool LJL_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -122,9 +126,10 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,16 +138,18 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -29,13 +29,20 @@ class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/

View File

@ -28,13 +28,13 @@ static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
LJCMF.clear();
gpu_mode=LJCMF.device->gpu_mode();
double gpu_split=LJCMF.device->particle_split();
@ -55,15 +55,12 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->world_barrier();
if (message)
@ -78,48 +75,51 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
LJCMF.estimate_gpu_overhead();
return init_ok;
}
void ljc_gpu_clear() {
LJCMF.clear();
}
int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** ljc_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljc_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double ljc_gpu_bytes() {

View File

@ -18,8 +18,6 @@
#ifndef LJC_GPU_KERNEL
#define LJC_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -46,7 +44,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
@ -82,6 +80,8 @@ __inline float fetch_q(const int& i, const float *q)
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -92,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , __global numtyp *cutsq,
const numtyp qqrd2e) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
@ -109,29 +113,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -188,8 +204,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -209,54 +266,69 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , __global numtyp *_cutsq,
const numtyp qqrd2e) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const numtyp qqrd2e, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
cutsq[ii]=_cutsq[ii];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -312,8 +384,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Store answers
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
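A note on the neighbor-list plumbing introduced above: the kernels now receive both dev_nbor and dev_packed and pick a stride at run time. When the two pointers are equal, the list is stored column-wise with row pitch nbor_pitch; otherwise the third header row of dev_nbor holds an offset into a densely packed list. In either case the t_per_atom threads sharing atom ii interleave over the neighbors. A sketch of the two layouts (indexing illustrative):

  // Layout A (dev_nbor == dev_packed): one column per atom, pitched rows
  //   i    = dev_nbor[ii]
  //   numj = dev_nbor[ii +     nbor_pitch]
  //   j_k  = dev_nbor[ii + (2+k)*nbor_pitch],  k = 0 .. numj-1
  // Layout B: header in dev_nbor, neighbors packed contiguously
  //   start = dev_nbor[ii + 2*nbor_pitch]      // offset into dev_packed
  //   j_k   = dev_packed[start + k]
  // With t_per_atom threads per atom, the thread at a given offset
  // visits k = offset, offset + t_per_atom, offset + 2*t_per_atom, ...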
View File
@ -43,24 +43,28 @@ int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool LJC_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljc_cut_gpu_kernel);
int LJC_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljc_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -95,7 +99,7 @@ bool LJC_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -134,9 +138,10 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -145,19 +150,20 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e);
&_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e);
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}
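With multiple threads per atom, each block of BX threads now covers only BX/t_per_atom atoms, which is why the grid-size computation changed. For example (illustrative numbers):

  // BX = 128, _threads_per_atom = 4  ->  32 atoms per block
  // inum = 10000                     ->  GX = ceil(10000/32.0) = 313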
View File
@ -29,15 +29,22 @@ class LJC_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
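Since init() now reports an integer status instead of a bool, callers can distinguish the failure modes listed above. A hedged usage sketch (object name and messages are illustrative, not from the library):

  int err = ljcm.init(ntypes, host_cutsq, host_lj1, host_lj2, host_lj3,
                      host_lj4, host_offset, host_special_lj, nlocal, nall,
                      max_nbors, maxspecial, cell_size, gpu_split, screen,
                      host_cut_ljsq, host_cut_coulsq, host_special_coul,
                      qqrd2e);
  if (err == -3)
    fprintf(screen, "Insufficient device memory for pair style.\n");
  else if (err != 0)
    fprintf(screen, "GPU pair init failed with code %d.\n", err);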
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
View File
@ -28,14 +28,14 @@ static LJCL_GPU_Memory<PRECISION,ACC_PRECISION> LJCLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
LJCLMF.clear();
gpu_mode=LJCLMF.device->gpu_mode();
double gpu_split=LJCLMF.device->particle_split();
@ -56,15 +56,12 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e,g_ewald);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->world_barrier();
if (message)
@ -79,48 +76,51 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
LJCLMF.estimate_gpu_overhead();
return init_ok;
}
void ljcl_gpu_clear() {
LJCLMF.clear();
}
int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** ljcl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return LJCLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljcl_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
LJCLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
host_q,nlocal,boxlo,prd);
}
double ljcl_gpu_bytes() {
View File
@ -18,8 +18,6 @@
#ifndef LJCL_GPU_KERNEL
#define LJCL_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -54,7 +52,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -204,8 +220,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -225,52 +282,68 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const numtyp qqrd2e, const numtyp g_ewald,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -334,8 +407,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Store answers
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
View File
@ -43,7 +43,7 @@ int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool LJCL_GPU_MemoryT::init(const int ntypes,
int LJCL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
@ -54,14 +54,18 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljcl_cut_gpu_kernel);
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljcl_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -94,7 +98,7 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -132,9 +136,10 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -143,19 +148,21 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
View File
@ -29,15 +29,22 @@ class LJCL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
View File
@ -29,9 +29,8 @@ __win_sort _win_sort;
#endif
template <class numtyp, class acctyp>
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
_vflag(false),_inum(0),_ilist(NULL),
_newton(false) {
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
_max_gpu_bytes(0) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;
@ -56,28 +55,20 @@ int PairGPUAtomT::bytes_per_atom() const {
int id_space=0;
if (_gpu_nbor)
id_space=2;
int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
int bytes=4*sizeof(numtyp)+id_space;
if (_rot)
bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
bytes+=4*sizeof(numtyp);
if (_charge)
bytes+=sizeof(numtyp);
return bytes;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::alloc(const int inum, const int nall) {
bool PairGPUAtomT::alloc(const int nall) {
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
if (_newton)
_max_local=_max_atoms;
else
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
bool success=true;
int ans_elements=4;
if (_rot)
ans_elements+=4;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
@ -107,8 +98,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
success=success && (host_x.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#endif
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
// Buffer for casting only if different precisions
if (_charge)
success=success && (host_q.alloc(_max_atoms,*dev,
@ -120,15 +109,13 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
// --------------------------- Device allocations
_gpu_bytes=0;
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1);
#else
dev_x.view(host_x);
#endif
dev_engv.view(host_engv);
dev_ans.view(host_ans);
if (_rot)
dev_quat.view(host_quat);
if (_charge)
@ -140,49 +127,80 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
_gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
#endif
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_ans.alloc(ans_elements*_max_local,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_q.row_bytes();
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_quat.row_bytes();
gpu_bytes+=dev_quat.row_bytes();
}
}
if (_gpu_nbor) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_tag.row_bytes();
gpu_bytes+=dev_tag.row_bytes();
}
}
_gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
const bool rot, UCL_Device &devi, const bool gpu_nbor,
bool PairGPUAtomT::add_fields(const bool charge, const bool rot,
const bool gpu_nbor, const bool bonds) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (gpu_nbor && _gpu_nbor==false) {
_gpu_nbor=true;
realloc=true;
}
if (bonds && _bonds==false) {
_bonds=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int max_atoms=_max_atoms;
clear_resize();
return alloc(max_atoms);
}
return true;
}
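add_fields() exists so that a second GPU style can reuse a PairGPUAtom another style already initialized: each flag is only ever switched on, never off, and a single reallocation happens if any newly requested field (charges, quaternions, device neighboring, bonds) was missing. A sketch of the intended sequence (assumed usage, not library code):

  // First style: positions only
  atom.init(nall, /*charge=*/false, /*rot=*/false, dev, gpu_nbor, bonds);
  // Second style needs charges; storage is resized once, other fields kept
  atom.add_fields(/*charge=*/true, /*rot=*/false, gpu_nbor, bonds);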
template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const bool gpu_nbor,
const bool bonds) {
clear();
bool success=true;
_x_avail=false;
_q_avail=false;
_quat_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
@ -190,33 +208,25 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
_other=_charge || _rot;
dev=&devi;
_e_fields=1;
if (_charge)
_e_fields++;
_ev_fields=6+_e_fields;
// Initialize atom and nbor data
int ef_inum=inum;
if (ef_inum==0)
ef_inum=1000;
int ef_nall=nall;
if (ef_nall<=ef_inum)
ef_nall=ef_inum*2;
if (ef_nall==0)
ef_nall=2000;
// Initialize timers for the selected device
time_pos.init(*dev);
time_other.init(*dev);
time_answer.init(*dev);
time_q.init(*dev);
time_quat.init(*dev);
time_pos.zero();
time_other.zero();
time_answer.zero();
time_q.zero();
time_quat.zero();
_time_cast=0.0;
#ifdef GPU_CAST
compile_kernels(*dev);
#endif
return success && alloc(ef_inum,ef_nall);
return success && alloc(ef_nall);
}
template <class numtyp, class acctyp>
@ -234,16 +244,12 @@ void PairGPUAtomT::clear_resize() {
dev_quat.clear();
host_quat.clear();
}
dev_ans.clear();
dev_engv.clear();
#ifndef GPU_CAST
host_x.clear();
#else
host_x_cast.clear();
host_type_cast.clear();
#endif
host_ans.clear();
host_engv.clear();
dev_cell_id.clear();
dev_particle_id.clear();
dev_tag.clear();
@ -261,17 +267,14 @@ void PairGPUAtomT::clear_resize() {
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
_gpu_bytes=0;
_max_gpu_bytes=0;
if (!_allocated)
return;
time_pos.clear();
time_other.clear();
time_answer.clear();
time_q.clear();
time_quat.clear();
clear_resize();
_inum=0;
_eflag=false;
_vflag=false;
#ifdef GPU_CAST
if (_compiled) {
@ -289,255 +292,10 @@ double PairGPUAtomT::host_memory_usage() const {
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
int ans_bytes=atom_bytes+_ev_fields;
return _max_atoms*atom_bytes*sizeof(numtyp)+
ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(PairGPUAtom<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom) {
time_answer.start();
_eflag=eflag;
_vflag=vflag;
_ef_atom=ef_atom;
_vf_atom=vf_atom;
int csize=_ev_fields;
if (!eflag)
csize-=_e_fields;
if (!vflag)
csize-=6;
if (csize>0)
ucl_copy(host_engv,dev_engv,_inum*csize,true);
if (_rot)
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
time_answer.stop();
}
template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom,
int *ilist) {
_ilist=ilist;
copy_answers(eflag,vflag,ef_atom,vf_atom);
}
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
double *virial) {
if (_eflag==false && _vflag==false)
return 0.0;
double evdwl=0.0;
if (_gpu_nbor) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
}
evdwl*=0.5;
return evdwl;
}
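The 0.5 factors at the end of energy_virial() reflect that the device kernels accumulate over full neighbor lists, so every i-j pair is counted once from i's list and once from j's. As an illustrative check with a single pair: the kernels add e_pair into both e_i and e_j, so sum_i e_i = 2*e_pair and evdwl = 0.5 * sum_i e_i = e_pair, as expected; the same halving applies to each virial component.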
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
double *virial, double &ecoul) {
if (_eflag==false && _vflag==false) {
ecoul=0.0;
return 0.0;
}
if (_charge==false)
return energy_virial(eatom,vatom,virial);
double evdwl=0.0;
double _ecoul=0.0;
if (_gpu_nbor) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
}
evdwl*=0.5;
ecoul+=_ecoul*0.5;
return evdwl;
}
template <class numtyp, class acctyp>
void PairGPUAtomT::get_answers(double **f, double **tor) {
acctyp *ap=host_ans.begin();
if (_gpu_nbor) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;
f[i][1]+=*ap;
ap++;
f[i][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
tor[i][0]+=*ap;
ap++;
tor[i][1]+=*ap;
ap++;
tor[i][2]+=*ap;
ap+=2;
}
}
} else {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap+=2;
}
}
}
}
// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void PairGPUAtomT::sort_neighbor(const int num_atoms) {
View File
@ -23,7 +23,6 @@
#ifdef USE_OPENCL
#include "geryon/ocl_device.h"
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
@ -32,7 +31,6 @@ using namespace ucl_opencl;
#else
#include "cudpp.h"
#include "geryon/nvd_device.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
@ -40,10 +38,6 @@ using namespace ucl_cudadr;
#endif
#ifndef int2
struct int2 { int x; int y; };
#endif
#include "pair_gpu_precision.h"
template <class numtyp, class acctyp>
@ -56,13 +50,9 @@ class PairGPUAtom {
inline int max_atoms() const { return _max_atoms; }
/// Current number of local+ghost atoms stored
inline int nall() const { return _nall; }
/// Current number of local atoms stored
inline int inum() const { return _inum; }
/// Set number of local+ghost atoms for future copy operations
inline void nall(const int n) { _nall=n; }
/// Set number of local atoms for future copy operations
inline void inum(const int n) { _inum=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
@ -70,21 +60,33 @@ class PairGPUAtom {
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool init(const int inum, const int nall, const bool charge, const bool rot,
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
/// Check if we have enough device storage and realloc if not
inline bool resize(const int inum, const int nall, bool &success) {
_inum=inum;
/** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) {
_nall=nall;
if (inum>_max_local || nall>_max_atoms) {
if (nall>_max_atoms) {
clear_resize();
success = success && alloc(inum,nall);
return true;
success = success && alloc(nall);
_resized=true;
}
return false;
return _resized;
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
const bool bonds);
/// Returns true if GPU is using charges
bool charge() { return _charge; }
/// Returns true if GPU is using quaternions
bool quat() { return _rot; }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
@ -100,28 +102,42 @@ class PairGPUAtom {
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
time_answer.add_to_total();
if (_other)
time_other.add_to_total();
if (_charge)
time_q.add_to_total();
if (_rot)
time_quat.add_to_total();
}
/// Add copy times to timers
inline void zero_timers() {
time_pos.zero();
time_answer.zero();
if (_other)
time_other.zero();
if (_charge)
time_q.zero();
if (_rot)
time_quat.zero();
}
/// Return the total time for host/device data transfer
/** Zeros the total so that the atom times are only included once **/
inline double transfer_time() {
double total=time_pos.total_seconds()+time_answer.total_seconds();
if (_other) total+=time_other.total_seconds();
double total=time_pos.total_seconds();
time_pos.zero_total();
if (_charge) {
total+=time_q.total_seconds();
time_q.zero_total();
}
if (_rot) {
total+=time_quat.total_seconds();
time_quat.zero_total();
}
return total;
}
/// Return the total time for data cast/pack
inline double cast_time() { return _time_cast; }
/** Zeros the time so that atom times are only included once **/
inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; }
/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>
@ -216,43 +232,52 @@ class PairGPUAtom {
// -------------------------COPY TO GPU ----------------------------------
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
/// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
if (_x_avail==false) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
}
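cast_x_data() packs each atom's position and type into one numtyp4, so the kernels get x, y, z and the type in a single fetch; the new _x_avail guard skips the repack when the device copy is still current within a timestep. The write-buffer layout (illustrative):

  // host_x after the cast, one numtyp4 per atom:
  //   [ x0 y0 z0 type0 | x1 y1 z1 type1 | ... ]
  // fetch_pos(i,x_) then returns position and type together, and
  // int itype = ix.w recovers the type in the kernels.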
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
time_pos.start();
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
if (_x_avail==false) {
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
_x_avail=true;
}
time_pos.stop();
}
@ -262,87 +287,68 @@ class PairGPUAtom {
add_x_data(host_ptr,host_type);
}
/// Cast charges to write buffer
// Cast charges to write buffer
template<class cpytyp>
inline void cast_q_data(cpytyp *host_ptr) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
if (_q_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
_time_cast+=MPI_Wtime()-t;
}
/// Copy charges to device asynchronously
// Copy charges to device asynchronously
inline void add_q_data() {
ucl_copy(dev_q,host_q,_nall,true);
if (_q_avail==false) {
ucl_copy(dev_q,host_q,_nall,true);
_q_avail=true;
}
}
/// Cast quaternions to write buffer
// Cast quaternions to write buffer
template<class cpytyp>
inline void cast_quat_data(cpytyp *host_ptr) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
if (_quat_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
_time_cast+=MPI_Wtime()-t;
}
/// Copy quaternions to device
// Copy quaternions to device
/** Copies nall()*4 elements **/
inline void add_quat_data() {
ucl_copy(dev_quat,host_quat,_nall*4,true);
if (_quat_avail==false) {
ucl_copy(dev_quat,host_quat,_nall*4,true);
_quat_avail=true;
}
}
/// Copy data other than pos and data to device
inline void add_other_data() {
time_other.start();
if (_charge)
add_q_data();
if (_rot)
add_quat_data();
time_other.stop();
}
/// Return number of bytes used on device
inline double gpu_bytes() { return _gpu_bytes; }
// -------------------------COPY FROM GPU -------------------------------
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom);
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom, int *ilist);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial,
double &ecoul);
/// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor);
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
// ------------------------------ DATA ----------------------------------
@ -352,10 +358,6 @@ class PairGPUAtom {
UCL_D_Vec<numtyp> dev_q;
/// Quaterions
UCL_D_Vec<numtyp> dev_quat;
/// Force and possibly torque
UCL_D_Vec<acctyp> dev_ans;
/// Energy and virial per-atom storage
UCL_D_Vec<acctyp> dev_engv;
#ifdef GPU_CAST
UCL_D_Vec<double> dev_x_cast;
@ -370,10 +372,6 @@ class PairGPUAtom {
UCL_H_Vec<numtyp> host_q;
/// Buffer for moving quat data to GPU
UCL_H_Vec<numtyp> host_quat;
/// Force and possibly torque data on host
UCL_H_Vec<acctyp> host_ans;
/// Energy/virial data on host
UCL_H_Vec<acctyp> host_engv;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<unsigned> dev_cell_id;
@ -383,7 +381,7 @@ class PairGPUAtom {
UCL_D_Vec<int> dev_tag;
/// Device timers
UCL_Timer time_pos, time_other, time_answer;
UCL_Timer time_pos, time_q, time_quat;
/// Geryon device
UCL_Device *dev;
@ -396,19 +394,19 @@ class PairGPUAtom {
#endif
bool _compiled;
bool alloc(const int inum, const int nall);
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _resized;
bool alloc(const int nall);
bool _allocated, _rot, _charge, _other;
int _max_atoms, _nall;
bool _gpu_nbor, _bonds;
int *_ilist;
double _time_cast;
double _gpu_bytes;
double _max_gpu_bytes;
bool _newton;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
View File
@ -23,7 +23,7 @@
#define _HD_BALANCE_EVERY 25
#define _HD_BALANCE_WEIGHT 0.5
#define _HD_BALANCE_GAP 1.05
#define _HD_BALANCE_GAP 1.10
/// Host/device load balancer
template<class numtyp, class acctyp>
@ -33,7 +33,8 @@ class PairGPUBalance {
inline ~PairGPUBalance() { clear(); }
/// Clear any old data and setup for new LAMMPS run
inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const double split);
inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const bool gpu_nbor,
const double split);
/// Clear all host and device data
inline void clear() {
@ -43,23 +44,25 @@ class PairGPUBalance {
_init_done=false;
}
}
/// Return the timestep since initialization
inline int timestep() { return _timestep; }
/// Get a count of the number of particles host will handle for initial alloc
inline int first_host_count(const int nlocal,const bool gpu_nbor,
const double gpu_split) const {
inline int first_host_count(const int nlocal, const double gpu_split,
const bool gpu_nbor) const {
int host_nlocal=0;
if (gpu_nbor && gpu_split!=1.0) {
if (gpu_split>0)
host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
else
host_nlocal=static_cast<int>(ceil(0.1*nlocal));
host_nlocal=static_cast<int>(ceil(0.05*nlocal));
}
return host_nlocal;
}
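first_host_count() sizes the initial host-side allocation when the device builds the neighbor lists but a fraction of local atoms stays on the host; note the fallback for dynamic balancing dropped from 10% to 5% of nlocal. Worked examples (illustrative numbers):

  // nlocal = 32000, gpu_split = 0.75 -> host_nlocal = ceil(0.25*32000) = 8000
  // nlocal = 32000, gpu_split < 0    -> host_nlocal = ceil(0.05*32000) = 1600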
/// Return the number of particles the device will handle this timestep
inline int get_gpu_count(const int timestep, const int ago,
const int inum_full);
inline int get_gpu_count(const int ago, const int inum_full);
/// Return the average fraction of particles handled by device on all procs
inline double all_avg_split() {
@ -82,10 +85,10 @@ class PairGPUBalance {
if (_measure_this_step) {
_device->gpu->sync();
_device->gpu_barrier();
_device->start_host_timer();
_device_time.start();
_device->gpu->sync();
_device->gpu_barrier();
_device->start_host_timer();
}
}
@ -95,34 +98,34 @@ class PairGPUBalance {
/// Calculate the new host/device split based on the cpu and device times
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
(and first 10) **/
inline void balance(const double cpu_time, const bool gpu_nbor);
inline void balance(const double cpu_time);
/// Calls balance() and then get_gpu_count()
inline int balance(const int timestep, const int ago, const int inum_full,
const double cpu_time, const bool gpu_nbor) {
balance(cpu_time,gpu_nbor);
return get_gpu_count(timestep,ago,inum_full);
inline int balance(const int ago,const int inum_full,const double cpu_time) {
balance(cpu_time);
return get_gpu_count(ago,inum_full);
}
private:
PairGPUDevice<numtyp,acctyp> *_device;
UCL_Timer _device_time;
bool _init_done;
bool _init_done, _gpu_nbor;
bool _load_balance;
double _actual_split, _avg_split, _desired_split, _max_split;
int _avg_count;
bool _measure_this_step;
int _inum, _inum_full;
int _inum, _inum_full, _timestep;
};
#define PairGPUBalanceT PairGPUBalance<numtyp,acctyp>
template <class numtyp, class acctyp>
void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
const double split) {
void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
const bool gpu_nbor, const double split) {
clear();
_gpu_nbor=gpu_nbor;
_init_done=true;
_device=gpu;
@ -130,7 +133,7 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
if (split<0.0) {
_load_balance=true;
_desired_split=0.9;
_desired_split=0.90;
} else {
_load_balance=false;
_desired_split=split;
@ -138,14 +141,14 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
_actual_split=_desired_split;
_avg_split=0.0;
_avg_count=0;
_timestep=0;
}
template <class numtyp, class acctyp>
int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
const int inum_full) {
int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) {
_measure_this_step=false;
if (_load_balance) {
if (_avg_count<11 || timestep%_HD_BALANCE_EVERY==0) {
if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
_measure_this_step=true;
_inum_full=inum_full;
}
@ -156,44 +159,44 @@ int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
}
_inum=static_cast<int>(floor(_actual_split*inum_full));
if (_inum==0) _inum++;
_timestep++;
return _inum;
}
template <class numtyp, class acctyp>
void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) {
void PairGPUBalanceT::balance(const double cpu_time) {
if (_measure_this_step) {
_measure_this_step=false;
double gpu_time=_device_time.seconds();
double max_gpu_time;
MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
_device->gpu_comm());
if (_inum_full==_inum) {
_desired_split=1.0;
return;
}
_measure_this_step=false;
double gpu_time=_device_time.seconds();
double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
double cpu_other_time=_device->host_time()-cpu_time;
int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
cpu_time_per_atom);
double cpu_gpu_time[3], max_times[3];
cpu_gpu_time[0]=cpu_time/(_inum_full-_inum);
cpu_gpu_time[1]=gpu_time/_inum;
cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full;
double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
_desired_split=split*_HD_BALANCE_GAP;
if (_desired_split>1.0)
_desired_split=1.0;
if (_desired_split<0.0)
_desired_split=0.0;
MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX,
_device->gpu_comm());
double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]);
split*=_HD_BALANCE_GAP;
if (split>1.0)
split=1.0;
if (_avg_count<10)
_desired_split=(_desired_split*_avg_count+split)/(_avg_count+1);
else
_desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+
_HD_BALANCE_WEIGHT*split;
if (!gpu_nbor) {
if (!_gpu_nbor) {
if (_desired_split<_max_split)
_actual_split=_desired_split;
else
_actual_split=_max_split;
}
//std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl;
}
_avg_split+=_desired_split;
_avg_count++;
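The rebalance now reduces per-atom times across the device communicator instead of raw GPU seconds: with c the max CPU pair time per host atom, g the max device time per device atom, and o the max remaining host time per owned atom, the new target is split = (c + o)/(c + g), widened by _HD_BALANCE_GAP and clamped at 1. A worked example (illustrative numbers):

  // c = 2.0e-6 s/atom, g = 0.5e-6 s/atom, o = 0.1e-6 s/atom
  // split = (2.0 + 0.1)/(2.0 + 0.5) = 0.84;  * 1.10 gap -> 0.924
  // -> about 92% of local atoms go to the device after this rebalance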
View File
@ -18,7 +18,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> neigh_tex;
#ifdef _DOUBLE_DOUBLE
@ -36,6 +36,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#else
#define fetch_pos(i,y) x_[i]
#define BLOCK_NBOR_BUILD 64
#endif
@ -54,29 +55,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define numtyp4 float4
#endif
#define CELL_BLOCK_SIZE 64
#define BLOCK_2D 8
#define BLOCK_CELL_2D 8
#define SBBITS 30
#define SBBITS 30
__kernel void transpose(int *out, int *in, int columns_in, int rows_in)
{
__local float block[BLOCK_2D][BLOCK_2D+1];
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
unsigned bi=BLOCK_ID_X;
unsigned bj=BLOCK_ID_Y;
unsigned i=bi*BLOCK_2D+ti;
unsigned j=bj*BLOCK_2D+tj;
unsigned i=bi*BLOCK_CELL_2D+ti;
unsigned j=bj*BLOCK_CELL_2D+tj;
if ((i<columns_in) && (j<rows_in))
block[tj][ti]=in[j*columns_in+i];
__syncthreads();
i=bj*BLOCK_2D+ti;
j=bi*BLOCK_2D+tj;
i=bj*BLOCK_CELL_2D+ti;
j=bi*BLOCK_CELL_2D+tj;
if ((i<rows_in) && (j<columns_in))
out[j*rows_in+i] = block[ti][tj];
}
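The renamed tile is declared [BLOCK_CELL_2D][BLOCK_CELL_2D+1]; the +1 column of padding is the usual trick for a shared-memory transpose:

  // Without padding, the column read block[ti][tj] strides by
  // BLOCK_CELL_2D, a multiple of the bank count, so the threads of a
  // half-warp would all hit the same bank. The extra column shifts each
  // row into a different bank, making both the row write and the column
  // read conflict-free.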
@ -141,7 +143,8 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
int *cell_particle_id,
int *cell_counts,
int *nbor_list,
int *host_nbor_list,
int *host_nbor_list,
int *host_numj,
int neigh_bin_size,
numtyp cell_size,
int ncellx, int ncelly, int ncellz,
@ -154,8 +157,8 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
int icell = ix + iy*ncellx + iz*ncellx*ncelly;
__shared__ int cell_list_sh[CELL_BLOCK_SIZE];
__shared__ numtyp4 pos_sh[CELL_BLOCK_SIZE];
__shared__ int cell_list_sh[BLOCK_NBOR_BUILD];
__shared__ numtyp4 pos_sh[BLOCK_NBOR_BUILD];
int icell_begin = cell_counts[icell];
int icell_end = cell_counts[icell+1];
@ -185,9 +188,9 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
neigh_list=neigh_counts+stride;
nbor_list[pid_i]=pid_i;
} else {
stride=nt-inum;
neigh_counts=host_nbor_list+pid_i-inum;
neigh_list=neigh_counts+stride;
stride=1;
neigh_counts=host_numj+pid_i-inum;
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
}
// loop through neighbors
@ -203,13 +206,13 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
int num_atom_cell = jcell_end - jcell_begin;
// load jcell to shared memory
int num_iter = (int)ceil((numtyp)num_atom_cell/CELL_BLOCK_SIZE);
int num_iter = (int)ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
for (int k = 0; k < num_iter; k++) {
int end_idx = min(CELL_BLOCK_SIZE, num_atom_cell-k*CELL_BLOCK_SIZE);
int end_idx = min(BLOCK_NBOR_BUILD, num_atom_cell-k*BLOCK_NBOR_BUILD);
if (tid < end_idx) {
pid_j = cell_particle_id[tid+k*CELL_BLOCK_SIZE+jcell_begin];
pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
cell_list_sh[tid] = pid_j;
atom_j = fetch_pos(pid_j,pos); //[pid_j];
pos_sh[tid].x = atom_j.x;
@ -222,20 +225,18 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
for (int j = 0; j < end_idx; j++) {
int pid_j = cell_list_sh[j]; // gather from shared memory
if (pid_i<inum || pid_j<inum || pid_j>pid_i) {
diff.x = atom_i.x - pos_sh[j].x;
diff.y = atom_i.y - pos_sh[j].y;
diff.z = atom_i.z - pos_sh[j].z;
diff.x = atom_i.x - pos_sh[j].x;
diff.y = atom_i.y - pos_sh[j].y;
diff.z = atom_i.z - pos_sh[j].z;
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
if (r2 < cell_size*cell_size && r2 > 1e-5) {
if (cnt < neigh_bin_size) {
*neigh_list = pid_j;
neigh_list+=stride;
}
cnt++;
}
}
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
if (r2 < cell_size*cell_size && r2 > 1e-5) {
if (cnt < neigh_bin_size) {
*neigh_list = pid_j;
neigh_list+=stride;
}
cnt++;
}
}
}
__syncthreads();
@ -249,9 +250,10 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
}
__kernel void kernel_special(__global int *dev_nbor,
__global int *host_nbor_list, __global int *tag,
__global int *host_nbor_list,
__global int *host_numj, __global int *tag,
__global int *nspecial, __global int *special,
int inum, int nt, int nall) {
int inum, int nt, int nall, int max_nbors) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
@ -263,15 +265,17 @@ __kernel void kernel_special(__global int *dev_nbor,
int n2=nspecial[ii*3+1];
int n3=nspecial[ii*3+2];
int numj;
if (ii < inum) {
stride=inum;
list=dev_nbor+stride+ii;
numj=*list;
list+=stride;
} else {
stride=nt-inum;
list=host_nbor_list+ii-inum;
stride=1;
list=host_nbor_list+(ii-inum)*max_nbors;
numj=host_numj[ii-inum];
}
int numj=*list;
list+=stride;
list_end=list+numj*stride;
for ( ; list<list_end; list+=stride) {
@ -294,4 +298,3 @@ __kernel void kernel_special(__global int *dev_nbor,
}
} // if ii
}
View File
@ -19,13 +19,22 @@
#include "pair_gpu_precision.h"
#include <map>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_OPENCL
#include "pair_gpu_dev_cl.h"
#else
#include "pair_gpu_dev_ptx.h"
#endif
#define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
template <class numtyp, class acctyp>
PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
_gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0) {
_last_device(0), _compiled(false) {
}
template <class numtyp, class acctyp>
@ -34,14 +43,19 @@ PairGPUDeviceT::~PairGPUDevice() {
}
template <class numtyp, class acctyp>
bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads) {
int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads, const int t_per_atom) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
#endif
_threads_per_atom=t_per_atom;
_threads_per_charge=t_per_atom;
if (_device_init)
return true;
return 0;
_device_init=true;
_comm_world=world;
_comm_replica=replica;
@ -96,7 +110,12 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
// set the device ID
_procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
(last_gpu-first_gpu+1)));
int my_gpu=node_rank/_procs_per_gpu;
int my_gpu=node_rank/_procs_per_gpu+first_gpu;
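// Example (illustrative): 8 ranks on a node, first_gpu=0, last_gpu=1:
// procs_per_gpu = ceil(8/2) = 4, so ranks 0-3 share device 0 and
// ranks 4-7 share device 1; adding first_gpu handles the case where
// the first usable device is not device 0.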
// Time on the device only if 1 proc per gpu
_time_device=true;
if (_procs_per_gpu>1)
_time_device=false;
// Set up a per device communicator
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
@ -104,39 +123,109 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
gpu=new UCL_Device();
if (my_gpu>=gpu->num_devices())
return false;
return -2;
gpu->set(my_gpu);
return true;
_long_range_precompute=0;
int flag=compile_kernels();
return flag;
}
template <class numtyp, class acctyp>
bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
const int host_nlocal, const int nall,
const int maxspecial, const bool gpu_nbor,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut) {
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut) {
if (!_device_init)
return false;
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
// Counts of data transfers for timing overhead estimates
_data_in_estimate=0;
_data_out_estimate=1;
// Initial number of local particles
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);
bool gpu_nbor=false;
if (_gpu_mode==GPU_NEIGH)
gpu_nbor=true;
if (_init_count==0) {
// Initialize atom and nbor data
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);
if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
gpu_nbor && maxspecial>0))
return false;
if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor,
gpu_host,pre_cut))
return false;
nbor.cell_size(cell_size);
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
return -3;
_data_in_estimate++;
if (charge)
_data_in_estimate++;
if (rot)
_data_in_estimate++;
} else {
if (cell_size>nbor.cell_size())
nbor.cell_size(cell_size);
if (atom.charge()==false && charge)
_data_in_estimate++;
if (atom.quat()==false && rot)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial))
return -3;
}
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return -3;
if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build))
return -3;
nbor->cell_size(cell_size);
_init_count++;
return true;
return 0;
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
const int nall) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,true,false,*gpu,false,false))
return -3;
} else
if (!atom.add_fields(true,false,false,false))
return -3;
if (!ans.init(nlocal,true,false,*gpu))
return -3;
_init_count++;
return 0;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::set_single_precompute
(PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm) {
_long_range_precompute=1;
pppm_single=pppm;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::set_double_precompute
(PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm) {
_long_range_precompute=2;
pppm_double=pppm;
}
template <class numtyp, class acctyp>
@@ -152,11 +241,17 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"-------------------------------------\n");
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu);
fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
#ifdef _OPENMP
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
#endif
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n");
for (int i=first_gpu; i<=last_gpu; i++) {
int last=last_gpu+1;
if (last>gpu->num_devices())
last=gpu->num_devices();
for (int i=first_gpu; i<last; i++) {
std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
" GHZ (";
@@ -177,32 +272,152 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
const double max_bytes, FILE *screen) {
double single[5], times[5];
void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls,
double &gpu_overhead,
double &gpu_driver_overhead) {
UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
UCL_Timer over_timer(*gpu);
single[0]=atom.transfer_time();
if (_data_in_estimate>0) {
host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
timers_in=new UCL_Timer[_data_in_estimate];
}
if (_data_out_estimate>0) {
host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
timers_out=new UCL_Timer[_data_out_estimate];
}
if (kernel_calls>0) {
kernel_data=new UCL_D_Vec<int>[kernel_calls];
timers_kernel=new UCL_Timer[kernel_calls];
}
for (int i=0; i<_data_in_estimate; i++) {
host_data_in[i].alloc(1,*gpu);
dev_data_in[i].alloc(1,*gpu);
timers_in[i].init(*gpu);
}
for (int i=0; i<_data_out_estimate; i++) {
host_data_out[i].alloc(1,*gpu);
dev_data_out[i].alloc(1,*gpu);
timers_out[i].init(*gpu);
}
for (int i=0; i<kernel_calls; i++) {
kernel_data[i].alloc(1,*gpu);
timers_kernel[i].init(*gpu);
}
gpu_overhead=0.0;
gpu_driver_overhead=0.0;
for (int i=0; i<10; i++) {
gpu->sync();
gpu_barrier();
over_timer.start();
gpu->sync();
gpu_barrier();
double driver_time=MPI_Wtime();
for (int i=0; i<_data_in_estimate; i++) {
timers_in[i].start();
ucl_copy(dev_data_in[i],host_data_in[i],true);
timers_in[i].stop();
}
for (int i=0; i<kernel_calls; i++) {
timers_kernel[i].start();
zero(kernel_data[i],1);
timers_kernel[i].stop();
}
for (int i=0; i<_data_out_estimate; i++) {
timers_out[i].start();
ucl_copy(host_data_out[i],dev_data_out[i],true);
timers_out[i].stop();
}
over_timer.stop();
double time=over_timer.seconds();
driver_time=MPI_Wtime()-driver_time;
if (time_device()) {
for (int i=0; i<_data_in_estimate; i++)
timers_in[i].add_to_total();
for (int i=0; i<kernel_calls; i++)
timers_kernel[i].add_to_total();
for (int i=0; i<_data_out_estimate; i++)
timers_out[i].add_to_total();
}
double mpi_time, mpi_driver_time;
MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
gpu_overhead+=mpi_time;
gpu_driver_overhead+=mpi_driver_time;
}
gpu_overhead/=10.0;
gpu_driver_overhead/=10.0;
if (_data_in_estimate>0) {
delete [] host_data_in;
delete [] dev_data_in;
delete [] timers_in;
}
if (_data_out_estimate>0) {
delete [] host_data_out;
delete [] dev_data_out;
delete [] timers_out;
}
if (kernel_calls>0) {
delete [] kernel_data;
delete [] timers_kernel;
}
}
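// A minimal usage sketch (hypothetical caller, assuming a style with one
// kernel call per timestep); the averages are later passed to
// output_times() for reporting:
//
//   double gpu_overhead, driver_overhead;
//   pair_gpu_device.estimate_gpu_overhead(1,gpu_overhead,driver_overhead);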
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes,
const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen) {
double single[8], times[8];
single[0]=atom.transfer_time()+ans.transfer_time();
single[1]=nbor.time_nbor.total_seconds();
single[2]=nbor.time_kernel.total_seconds();
single[3]=time_pair.total_seconds();
single[4]=atom.cast_time();
single[4]=atom.cast_time()+ans.cast_time();
single[5]=gpu_overhead;
single[6]=driver_overhead;
single[7]=ans.cpu_idle_time();
MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes;
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[3]>0.0) {
if (screen && times[5]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (procs_per_gpu()==1) {
if (time_device()) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
@@ -212,7 +427,71 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
UCL_Timer &time_out,
UCL_Timer &time_map,
UCL_Timer &time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes,
const double cpu_time,
const double idle_time, FILE *screen) {
double single[8], times[8];
single[0]=time_out.total_seconds();
single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time();
single[2]=time_map.total_seconds();
single[3]=time_rho.total_seconds();
single[4]=time_interp.total_seconds();
single[5]=ans.transfer_time()+ans.cast_time();
single[6]=cpu_time;
single[7]=idle_time;
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size);
fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Total rho: %.4f s.\n",
(times[0]+times[2]+times[3])/_replica_size);
fprintf(screen,"Total interp: %.4f s.\n",
(times[1]+times[4])/_replica_size);
fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Total: %.4f s.\n",
(times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
_replica_size);
}
fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"-------------------------------------");
@@ -223,10 +502,17 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
template <class numtyp, class acctyp>
void PairGPUDeviceT::clear() {
if (_init_count>0) {
_long_range_precompute=0;
_init_count--;
if (_init_count==0) {
atom.clear();
nbor.clear();
_nbor_shared.clear();
if (_compiled) {
k_zero.clear();
k_info.clear();
delete dev_program;
_compiled=false;
}
}
}
}
@@ -241,21 +527,80 @@ void PairGPUDeviceT::clear_device() {
}
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::compile_kernels() {
int flag=0;
if (_compiled)
return flag;
std::string flags="-cl-mad-enable";
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str());
if (success!=UCL_SUCCESS)
return -4;
k_zero.set_function(*dev_program,"kernel_zero");
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;
UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
k_info.set_size(1,1);
k_info.run(&d_gpu_lib_data.begin());
ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
#ifndef USE_OPENCL
if (static_cast<double>(h_gpu_lib_data[0])/100.0>gpu->arch())
return -4;
#endif
_num_mem_threads=h_gpu_lib_data[1];
_warp_size=h_gpu_lib_data[2];
if (_threads_per_atom<1)
_threads_per_atom=h_gpu_lib_data[3];
if (_threads_per_charge<1)
_threads_per_charge=h_gpu_lib_data[13];
_pppm_max_spline=h_gpu_lib_data[4];
_pppm_block=h_gpu_lib_data[5];
_block_pair=h_gpu_lib_data[6];
_max_shared_types=h_gpu_lib_data[7];
_block_cell_2d=h_gpu_lib_data[8];
_block_cell_id=h_gpu_lib_data[9];
_block_nbor_build=h_gpu_lib_data[10];
_block_bio_pair=h_gpu_lib_data[11];
_max_bio_shared_types=h_gpu_lib_data[12];
if (static_cast<size_t>(_block_pair)>gpu->group_size())
_block_pair=gpu->group_size();
if (static_cast<size_t>(_block_bio_pair)>gpu->group_size())
_block_bio_pair=gpu->group_size();
if (_threads_per_atom>_warp_size)
_threads_per_atom=_warp_size;
if (_warp_size%_threads_per_atom!=0)
_threads_per_atom=1;
if (_threads_per_charge>_warp_size)
_threads_per_charge=_warp_size;
if (_warp_size%_threads_per_charge!=0)
_threads_per_charge=1;
return flag;
}
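// For reference, the layout of h_gpu_lib_data as read back above:
//   [0]  minimum arch*100      [1]  mem-access threads  [2]  warp size
//   [3]  default threads/atom  [4]  PPPM max spline     [5]  PPPM block
//   [6]  pair block            [7]  max shared types    [8]  cell 2d block
//   [9]  cell id block        [10]  nbor build block   [11]  bio pair block
//  [12]  max bio shared types [13]  default threads/charge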
template <class numtyp, class acctyp>
double PairGPUDeviceT::host_memory_usage() const {
return atom.host_memory_usage()+
nbor.host_memory_usage()+4*sizeof(numtyp)+
return atom.host_memory_usage()+4*sizeof(numtyp)+
sizeof(PairGPUDevice<numtyp,acctyp>);
}
template class PairGPUDevice<PRECISION,ACC_PRECISION>;
PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads) {
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom) {
return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads);
particle_split,nthreads,t_per_atom);
}
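// A hedged host-side sketch (handle_err is a hypothetical error handler;
// codes follow pair_gpu_device.h):
//
//   int err=lmp_init_device(world,replica,0,0,GPU_NEIGH,1.0,1,1);
//   if (err!=0) handle_err(err); // -2: no device, -4: wrong arch/library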
void lmp_clear_device() {
@@ -264,14 +609,5 @@ void lmp_clear_device() {
double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
if (pair_gpu_device.init_count()) {
pair_gpu_device.stop_host_timer();
pair_gpu_device.gpu->sync();
double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
pair_gpu_device.atom.get_answers(f,tor);
return evdw;
}
return 0.0;
return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}
==== pair_gpu_device.h ====
@@ -19,11 +19,17 @@
#define PAIR_GPU_DEVICE_H
#include "pair_gpu_atom.h"
#include "pair_gpu_ans.h"
#include "pair_gpu_nbor.h"
#include "pppm_gpu_memory.h"
#include "mpi.h"
#include <sstream>
#include "stdio.h"
#include <string>
#include <queue>
template <class numtyp, class acctyp,
class grdtyp, class grdtyp4> class PPPMGPUMemory;
template <class numtyp, class acctyp>
class PairGPUDevice {
@@ -33,10 +39,15 @@ class PairGPUDevice {
/// Initialize the device for use by this process
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using **/
bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successful
* - -2 if the GPU is not found
* - -4 if the GPU library was not compiled for this GPU **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads);
const double particle_split, const int nthreads,
const int t_per_atom);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
@@ -50,19 +61,67 @@ class PairGPUDevice {
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if the cutoff test will be performed in a separate
* kernel rather than in the force kernel **/
bool init(const bool charge, const bool rot, const int nlocal,
const int host_nlocal, const int nall, const int maxspecial,
const bool gpu_nbor, const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut);
* kernel rather than in the force kernel
* Returns:
* - 0 if successful
* - -1 if fix gpu not found (device not initialized)
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found (device not initialized)
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);
/// Perform charge assignment asynchronously for PPPM
void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
float,_lgpu_float4> *pppm);
/// Perform charge assignment asynchronously for PPPM
void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
double,_lgpu_double4> *pppm);
/// Estimate the overhead of GPU calls from multiple procs
/** \param kernel_calls Number of kernel calls per timestep used for the
* overhead estimate
* \param gpu_overhead Estimated GPU overhead per timestep (s)
* \param driver_overhead Estimated driver overhead per timestep (s) **/
void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
double &gpu_driver_overhead);
/// Returns true if double precision is supported on card
inline bool double_precision() { return gpu->double_precision(); }
/// Output a message with timing information
void output_times(UCL_Timer &time_pair, const double avg_split,
const double max_bytes, FILE *screen);
void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen);
/// Output a message with timing information
void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
UCL_Timer &time_map, UCL_Timer &time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes, const double cpu_time,
const double cpu_idle_time, FILE *screen);
/// Clear all memory on host and device associated with atom and nbor data
void clear();
@@ -70,11 +129,37 @@ class PairGPUDevice {
/// Clear all memory on host and device
void clear_device();
/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
{ ans_queue.push(ans); }
/// Add "answers" (force,energies,etc.) into LAMMPS structures
inline double fix_gpu(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
atom.data_unavail();
if (ans_queue.empty()==false) {
stop_host_timer();
double evdw=0.0;
while (ans_queue.empty()==false) {
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
ans_queue.pop();
}
return evdw;
}
return 0.0;
}
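// (Hedged call-pattern sketch: each accelerated style pushes its answer
// object via add_ans_object() after launching its kernels; a single
// fix_gpu() call per timestep then drains the queue, copying forces into
// LAMMPS arrays and accumulating evdwl.)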
/// Start timer on host
inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
inline void start_host_timer()
{ _cpu_full=MPI_Wtime(); _host_timer_started=true; }
/// Stop timer on host
inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }
inline void stop_host_timer() {
if (_host_timer_started) {
_cpu_full=MPI_Wtime()-_cpu_full;
_host_timer_started=false;
}
}
/// Return host time
inline double host_time() { return _cpu_full; }
@@ -114,6 +199,42 @@ class PairGPUDevice {
inline double particle_split() const { return _particle_split; }
/// Return the initialization count for the device
inline int init_count() const { return _init_count; }
/// True if device is being timed
inline bool time_device() const { return _time_device; }
/// Return the number of threads accessing memory simultaneously
inline int num_mem_threads() const { return _num_mem_threads; }
/// Return the number of threads per atom for pair styles
inline int threads_per_atom() const { return _threads_per_atom; }
/// Return the number of threads per atom for pair styles using charge
inline int threads_per_charge() const { return _threads_per_charge; }
/// Return the min of the pair block size or the device max block size
inline int pair_block_size() const { return _block_pair; }
/// Return the maximum number of atom types that can be used with shared mem
inline int max_shared_types() const { return _max_shared_types; }
/// Return the maximum order for PPPM splines
inline int pppm_max_spline() const { return _pppm_max_spline; }
/// Return the block size for PPPM kernels
inline int pppm_block() const { return _pppm_block; }
/// Return the block size for neighbor binning
inline int block_cell_2d() const { return _block_cell_2d; }
/// Return the block size for atom mapping for neighbor builds
inline int block_cell_id() const { return _block_cell_id; }
/// Return the block size for neighbor build kernel
inline int block_nbor_build() const { return _block_nbor_build; }
/// Return the block size for "bio" pair styles
inline int block_bio_pair() const { return _block_bio_pair; }
/// Return the maximum number of atom types for shared mem with "bio" styles
inline int max_bio_shared_types() const { return _max_bio_shared_types; }
// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
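// (e.g. numel=1000 with a 128-thread pair block launches ceil(1000/128)=8
// blocks, each zeroing up to _block_pair elements)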
void zero(UCL_D_Vec<int> &mem, const int numel) {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
k_zero.set_size(num_blocks,_block_pair);
k_zero.run(&mem.begin(),&numel);
}
// -------------------------- DEVICE DATA -------------------------
@@ -130,11 +251,30 @@ class PairGPUDevice {
// --------------------------- NBOR DATA ----------------------------
/// Neighbor Data
PairGPUNbor nbor;
PairGPUNborShared _nbor_shared;
// ------------------------ LONG RANGE DATA -------------------------
// Long Range Data
int _long_range_precompute;
PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
/// Precomputations for long range charge assignment (asynchronously)
inline void precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *charge, double *boxlo, double *prd) {
if (_long_range_precompute==1)
pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
else if (_long_range_precompute==2)
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
}
private:
std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
int _init_count;
bool _device_init;
bool _device_init, _host_timer_started, _time_device;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size;
@@ -142,6 +282,19 @@ class PairGPUDevice {
double _particle_split;
double _cpu_full;
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;
int _block_pair, _max_shared_types;
int _block_cell_2d, _block_cell_id, _block_nbor_build;
int _block_bio_pair, _max_bio_shared_types;
UCL_Program *dev_program;
UCL_Kernel k_zero, k_info;
bool _compiled;
int compile_kernels();
int _data_in_estimate, _data_out_estimate;
template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;
==== pair_gpu_nbor.cpp ====
@@ -18,15 +18,9 @@
#include "pair_gpu_precision.h"
#include "pair_gpu_nbor.h"
#include "pair_gpu_device.h"
#include "math.h"
#ifdef USE_OPENCL
#include "pair_gpu_nbor_cl.h"
#else
#include "pair_gpu_nbor_ptx.h"
#include "pair_gpu_build_ptx.h"
#endif
int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
if (_gpu_nbor)
return (max_nbors+2)*sizeof(int);
@@ -36,12 +30,18 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
return (max_nbors+3)*sizeof(int);
}
bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum,
const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &devi,
const bool gpu_nbor, const int gpu_host,
const bool pre_cut) {
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build) {
clear();
_block_cell_2d=block_cell_2d;
_block_cell_id=block_cell_id;
_block_nbor_build=block_nbor_build;
_shared=shared;
dev=&devi;
_gpu_nbor=gpu_nbor;
if (gpu_host==0)
@@ -80,8 +80,11 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
alloc(success);
if (!success)
return false;
if (_use_packing==false)
compile_kernels(devi);
_shared->compile_kernels(devi,gpu_nbor);
return success;
}
@@ -89,13 +92,14 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
void PairGPUNbor::alloc(bool &success) {
dev_nbor.clear();
host_acc.clear();
int nt=_max_atoms+_max_host;
if (_use_packing==false || _gpu_nbor)
success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
else
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev,
success=success && (host_acc.alloc(nt*2,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
_c_bytes=dev_nbor.row_bytes();
@@ -108,11 +112,31 @@ void PairGPUNbor::alloc(bool &success) {
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev,
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host,
success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
_c_bytes+=dev_host_nbor.row_bytes();
success=success && (dev_host_numj.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
for (int i=0; i<nt; i++)
host_ilist[i]=i;
success=success && (host_jlist.alloc(_max_host,*dev,
UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
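// Each host_jlist entry points at that particle's row of _max_nbors ints
// inside the host_nbor buffer.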
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
}
if (_maxspecial>0) {
dev_nspecial.clear();
@@ -145,6 +169,9 @@ void PairGPUNbor::clear() {
dev_host_nbor.clear();
dev_packed.clear();
host_nbor.clear();
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
@@ -152,27 +179,13 @@ void PairGPUNbor::clear() {
time_kernel.clear();
time_nbor.clear();
}
if (_compiled) {
if (_gpu_nbor) {
k_cell_id.clear();
k_cell_counts.clear();
k_build_nbor.clear();
k_transpose.clear();
k_special.clear();
delete build_program;
} else {
k_nbor.clear();
delete nbor_program;
}
_compiled=false;
}
}
double PairGPUNbor::host_memory_usage() const {
if (_gpu_nbor) {
if (_gpu_host)
return host_nbor.row_bytes()*host_nbor.rows();
return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
host_jlist.row_bytes();
else
return 0;
} else
@@ -186,7 +199,7 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,true);
ucl_copy(dev_nbor,ilist_view,false);
UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;
@@ -238,46 +251,20 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)/block_size));
k_nbor.set_size(GX,block_size);
k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
time_kernel.stop();
}
}
void PairGPUNbor::compile_kernels(UCL_Device &dev) {
std::string flags="-cl-fast-relaxed-math -cl-mad-enable";
if (_gpu_nbor==false) {
nbor_program=new UCL_Program(dev);
nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());
k_nbor.set_function(*nbor_program,"kernel_unpack");
} else {
build_program=new UCL_Program(dev);
#ifdef USE_OPENCL
std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
exit(1);
#else
build_program->load_string(pair_gpu_build_kernel,flags.c_str());
#endif
k_cell_id.set_function(*build_program,"calc_cell_id");
k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
k_transpose.set_function(*build_program,"transpose");
k_special.set_function(*build_program,"kernel_special");
neigh_tex.get_texture(*build_program,"neigh_tex");
}
_compiled=true;
}
template <class numtyp, class acctyp>
void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
const int nall,
PairGPUAtom<numtyp,acctyp> &atom,
double *boxlo, double *boxhi, int *tag,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, bool &success,
int &mn) {
const int nt=inum+host_inum;
if (_maxspecial>0) {
time_nbor.start();
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
@@ -290,25 +277,25 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
time_nbor.stop();
time_nbor.add_to_total();
time_kernel.start();
const int b2x=8;
const int b2y=8;
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
k_transpose.set_size(g2x,g2y,b2x,b2y);
k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial,
&nt);
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
} else
time_kernel.start();
_nbor_pitch=inum;
neigh_tex.bind_float(atom.dev_x,4);
_shared->neigh_tex.bind_float(atom.dev_x,4);
int ncellx, ncelly, ncellz, ncell_3d;
ncellx = static_cast<int>(ceil(((boxhi[0] - boxlo[0]) +
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
2.0*_cell_size)/_cell_size));
ncelly = static_cast<int>(ceil(((boxhi[1] - boxlo[1]) +
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
2.0*_cell_size)/_cell_size));
ncellz = static_cast<int>(ceil(((boxhi[2] - boxlo[2]) +
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
2.0*_cell_size)/_cell_size));
ncell_3d = ncellx * ncelly * ncellz;
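// (e.g. a 20 x 20 x 20 subdomain with cell_size 2.5 gives
//  ceil((20+5)/2.5)=10 cells per side, so ncell_3d=1000; the two extra
//  cell widths cover ghost atoms on either side)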
UCL_D_Vec<int> cell_counts;
@@ -316,35 +303,36 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
_cell_bytes=cell_counts.row_bytes();
/* build cell list on GPU */
const int neigh_block=128;
const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block);
const numtyp boxlo0=static_cast<numtyp>(boxlo[0]);
const numtyp boxlo1=static_cast<numtyp>(boxlo[1]);
const numtyp boxlo2=static_cast<numtyp>(boxlo[2]);
const numtyp boxhi0=static_cast<numtyp>(boxhi[0]);
const numtyp boxhi1=static_cast<numtyp>(boxhi[1]);
const numtyp boxhi2=static_cast<numtyp>(boxhi[2]);
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
const numtyp subhi0=static_cast<numtyp>(subhi[0]);
const numtyp subhi1=static_cast<numtyp>(subhi[1]);
const numtyp subhi2=static_cast<numtyp>(subhi[2]);
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
k_cell_id.set_size(GX,neigh_block);
k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&boxlo0, &boxlo1, &boxlo2, &boxhi0, &boxhi1,
&boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
&subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
atom.sort_neighbor(nall);
/* calculate cell count */
k_cell_counts.set_size(GX,neigh_block);
k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall,
&ncell_3d);
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(),
&nall, &ncell_3d);
/* build the neighbor list */
const int cell_block=64;
k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &_max_nbors, &cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall);
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
_shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &dev_host_numj.begin(),
&_max_nbors,&cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall);
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;
@@ -353,7 +341,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_host_nbor,nt-inum,false);
ucl_copy(host_offset,dev_host_numj,nt-inum,false);
}
mn=host_acc[0];
for (int i=1; i<nt; i++)
@@ -368,10 +356,15 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor,
success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc((mn+1)*_max_host,
success=success && (dev_host_nbor.alloc(mn*_max_host,
dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
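// Re-point host_jlist at the reallocated buffer; the row stride is now mn.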
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
}
if (_alloc_packed) {
@@ -385,28 +378,29 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
_max_nbors=mn;
time_kernel.stop();
time_kernel.add_to_total();
build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial,
build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
special, success, mn);
return;
}
if (_maxspecial>0) {
const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
k_special.set_size(GX2,cell_block);
k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&atom.dev_tag.begin(), &dev_nspecial.begin(),
&dev_special.begin(), &inum, &nt, &nall);
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&dev_host_numj.begin(), &atom.dev_tag.begin(),
&dev_nspecial.begin(), &dev_special.begin(),
&inum, &nt, &nall, &_max_nbors);
}
time_kernel.stop();
time_nbor.start();
if (_gpu_host)
ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);
ucl_copy(host_nbor,dev_host_nbor,false);
time_nbor.stop();
}
template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>
(const int inum, const int host_inum, const int nall,
PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *boxlo, double *boxhi,
(const int inum, const int host_inum, const int nall,
PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
int *, int **, int **, bool &success, int &mn);
==== pair_gpu_nbor.h ====
@@ -19,32 +19,27 @@
#define PAIR_GPU_NBOR_H
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor_shared.h"
#define IJ_SIZE 131072
#ifdef USE_OPENCL
#include "geryon/ocl_device.h"
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
#include "geryon/ocl_texture.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_device.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"
using namespace ucl_cudadr;
#endif
class PairGPUNbor {
public:
PairGPUNbor() : _allocated(false), _use_packing(false), _compiled(false) {}
PairGPUNbor() : _allocated(false), _use_packing(false) {}
~PairGPUNbor() { clear(); }
/// Determine whether neighbor unpacking should be used
@@ -62,9 +57,11 @@ class PairGPUNbor {
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param pre_cut True if the cutoff test will be performed in a separate
* kernel rather than in the force kernel **/
bool init(const int inum, const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &dev, const bool gpu_nbor,
const int gpu_host, const bool pre_cut);
bool init(PairGPUNborShared *shared, const int inum, const int host_inum,
const int max_nbors, const int maxspecial, UCL_Device &dev,
const bool gpu_nbor, const int gpu_host, const bool pre_cut,
const int block_cell_2d, const int block_cell_id,
const int block_nbor_build);
/// Set the size of the cutoff+skin
inline void cell_size(const double size) { _cell_size=size; }
@@ -131,18 +128,18 @@ class PairGPUNbor {
inline int max_nbors() const { return _max_nbors; }
/// Loop through neighbor count array and return maximum nbors for a particle
inline int max_nbor_loop(const int inum, int *numj) const {
inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
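// numj is indexed by atom index, so map through ilist for each list entry.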
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[i]);
mn=std::max(mn,numj[ilist[i]]);
return mn;
}
/// Build nbor list on the device
template <class numtyp, class acctyp>
void build_nbor_list(const int inum, const int host_inum, const int nall,
PairGPUAtom<numtyp,acctyp> &atom, double *boxlo,
double *boxhi, int *tag, int **nspecial, int **special,
PairGPUAtom<numtyp,acctyp> &atom, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
bool &success, int &max_nbors);
/// Return the number of bytes used on device
@@ -176,31 +173,31 @@ class PairGPUNbor {
UCL_H_Vec<int> host_nbor;
/// Device storage for neighbor list matrix that will be copied to host
/** - 1st row is numj
* - Remaining rows are nbors **/
* - Remaining rows are by atom, columns are nbors **/
UCL_D_Vec<int> dev_host_nbor;
UCL_D_Vec<int> dev_host_numj;
UCL_H_Vec<int> host_ilist;
UCL_H_Vec<int*> host_jlist;
/// Device storage for special neighbor counts
UCL_D_Vec<int> dev_nspecial;
/// Device storage for special neighbors
UCL_D_Vec<int> dev_special, dev_special_t;
/// Texture for cached position/type access with CUDA
UCL_Texture neigh_tex;
/// Device timers
UCL_Timer time_nbor, time_kernel;
private:
PairGPUNborShared *_shared;
UCL_Device *dev;
UCL_Program *nbor_program, *build_program;
UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
UCL_Kernel k_transpose, k_special;
bool _allocated, _use_packing, _compiled;
void compile_kernels(UCL_Device &dev);
bool _allocated, _use_packing;
int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
bool _gpu_nbor, _gpu_host, _alloc_packed;
double _cell_size;
double _gpu_bytes, _c_bytes, _cell_bytes;
void alloc(bool &success);
int _block_cell_2d, _block_cell_id, _block_nbor_build;
};
#endif
==== pair_gpu_precision.h ====
@@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
#define acctyp4 _lgpu_float4
#endif
#define MAX_SHARED_TYPES 8
#define MAX_BIO_SHARED_TYPES 128
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#endif