Changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5277 f3b2605a-c512-4ea7-a41b-209d697bcdaa
pscrozi 2010-11-23 00:40:35 +00:00
parent ae536ce7d0
commit 5a82c99485
130 changed files with 24967 additions and 4802 deletions

@@ -1,72 +0,0 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = /cygdrive/c/CUDA/bin/nvcc -I/cygdrive/c/CUDA/include -O3 -DWINDLL -DUNIX -Xptxas -v --use_fast_math
CUDA_ARCH = -arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/cygdrive/c/CUDA/lib -lcudart $(CUDA_LIB)
CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)
CUDA_LIB = $(OBJ_DIR)/gpu.dll
# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
# Dependencies for the Texture Tar
TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu
ALL_H = $(NVC_H) $(PAIR_H)
EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.obj $(OBJ_DIR)/pair_gpu_nbor.obj \
$(OBJ_DIR)/pair_tex_tar.obj $(OBJ_DIR)/pair_gpu_cell.obj
all: $(CUDA_LIB) $(EXECS)
$(OBJ_DIR)/nvc_device.obj : nvc_device.cu $(NVC_H)
$(CUDA) -o $@ -c nvc_device.cu
$(OBJ_DIR)/pair_gpu_nbor.obj: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
$(CUDA) -o $@ -c pair_gpu_nbor.cu
$(OBJ_DIR)/pair_tex_tar.obj: $(TAR_H)
$(CUDA) -o $@ -c pair_tex_tar.cu
$(OBJ_DIR)/pair_gpu_cell.obj: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
$(CUDA) -o $@ -c pair_gpu_cell.cu
$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.obj
$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.obj
$(CUDA_LIB): $(OBJS) $(TAR_H)
$(CUDA) -o $@ -shared $(OBJS)
clean:
rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.exe *.exp *.lib *.dll *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo

lib/gpu/Makefile.fermi Normal file
@@ -0,0 +1,39 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = $(HOME)/cuda
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_DOUBLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Nvidia.makefile

lib/gpu/Makefile.lens Normal file
@@ -0,0 +1,39 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/
NVCC = nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON
CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Nvidia.makefile

lib/gpu/Makefile.lincoln Normal file
@@ -0,0 +1,36 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Makefile for NCSA's lincoln GPU cluster. Tested with "soft +cuda-2.3"
# ------------------------------------------------------------------------- */
CUDA_HOME = /usr/local/cuda-2.3
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON
CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
include Nvidia.makefile

lib/gpu/Makefile.linux Normal file
@@ -0,0 +1,39 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /usr/local/cuda
NVCC = nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Nvidia.makefile

@@ -0,0 +1,31 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_SINGLE
BIN_DIR = ./
OBJ_DIR = ./ocl_obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Opencl.makefile

lib/gpu/Makefile.longhorn Normal file
@@ -0,0 +1,35 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Makefile for the TACC longhorn cluster. Use "module load cuda".
# ------------------------------------------------------------------------- */
CUDA_HOME = $(TACC_CUDA_DIR)
NVCC = nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
include Nvidia.makefile

lib/gpu/Makefile.mac Normal file
@@ -0,0 +1,39 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /usr/local/cuda
NVCC = nvcc
CUDA_ARCH = -arch=sm_11
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
CUDR_CPP = mpic++
CUDR_OPTS = -O2 -m32 -g
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Nvidia.makefile

@@ -0,0 +1,31 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
OCL_LINK = -framework OpenCL
OCL_PREC = -D_SINGLE_SINGLE
BIN_DIR = ./
OBJ_DIR = ./ocl_obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Opencl.makefile

@@ -1,72 +0,0 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
CUDA_ARCH = -arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB)
CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)
CUDA_LIB = $(OBJ_DIR)/libgpu.a
# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
# Dependencies for the Texture Tar
TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu
ALL_H = $(NVC_H) $(PAIR_H)
EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/pair_gpu_nbor.cu_o \
$(OBJ_DIR)/pair_tex_tar.cu_o $(OBJ_DIR)/pair_gpu_cell.cu_o
all: $(CUDA_LIB) $(EXECS)
$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H)
$(CUDA) -o $@ -c nvc_device.cu
$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
$(CUDA) -o $@ -c pair_gpu_nbor.cu
$(OBJ_DIR)/pair_tex_tar.cu_o: $(TAR_H)
$(CUDA) -o $@ -c pair_tex_tar.cu
$(OBJ_DIR)/pair_gpu_cell.cu_o: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
$(CUDA) -o $@ -c pair_gpu_cell.cu
$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o
$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o
$(CUDA_LIB): $(OBJS)
$(AR) -crusv $(CUDA_LIB) $(OBJS)
clean:
rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo

lib/gpu/Nvidia.makefile Normal file
@@ -0,0 +1,218 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
$(CUDA_PRECISION)
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
-Icudpp_mini
CUDA_LINK = $(CUDA_LIB) -lcudart
GPU_LIB = $(LIB_DIR)/libgpu.a
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H)
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
pair_gpu_device.h pair_gpu_balance.h
ALL_H = $(NVD_H) $(PAIR_H)
EXECS = $(BIN_DIR)/nvc_get_devices
CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
$(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
$(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
$(OBJ_DIR)/charge_gpu_memory.o \
$(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
$(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
$(CUDPP)
PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
$(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
$(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
$(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
$(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
all: $(GPU_LIB) $(EXECS)
$(OBJ_DIR)/cudpp.o: cudpp_mini/cudpp.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp.cpp -Icudpp_mini
$(OBJ_DIR)/cudpp_plan.o: cudpp_mini/cudpp_plan.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp_plan.cpp -Icudpp_mini
$(OBJ_DIR)/cudpp_maximal_launch.o: cudpp_mini/cudpp_maximal_launch.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp_maximal_launch.cpp -Icudpp_mini
$(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
$(OBJ_DIR)/pair_gpu_atom_kernel.ptx: pair_gpu_atom_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_atom_kernel.cu
$(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h
$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
$(OBJ_DIR)/pair_gpu_nbor_ptx.h: $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h
$(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_build_kernel.cu
$(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
$(CUDR) -o $@ -c atomic_gpu_memory.cpp
$(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
$(CUDR) -o $@ -c charge_gpu_memory.cpp
$(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
$(OBJ_DIR)/gb_gpu_kernel_lj.ptx: gb_gpu_kernel_lj.cu pair_gpu_precision.h gb_gpu_extra.h
$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_lj.cu
$(OBJ_DIR)/gb_gpu_kernel_nbor.ptx: gb_gpu_kernel_nbor.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_nbor.cu
$(OBJ_DIR)/gb_gpu_ptx.h: $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h
$(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_ptx.h
$(CUDR) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
$(CUDR) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu_kernel.ptx: lj_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lj_cut_gpu_kernel.cu
$(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h
$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ ljc_cut_gpu_kernel.cu
$(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h
$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ ljcl_cut_gpu_kernel.cu
$(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h
$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu
$(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h
$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu
$(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h
$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_long_gpu_kernel.cu
$(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK)
$(GPU_LIB): $(OBJS)
$(AR) -crusv $(GPU_LIB) $(OBJS)
clean:
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo

lib/gpu/Opencl.makefile Normal file
@@ -0,0 +1,155 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL = $(OCL_CPP) $(OCL_PREC) -DUSE_OPENCL
OCL_LIB = $(LIB_DIR)/libgpu.a
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H)
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
pair_gpu_device.h pair_gpu_balance.h
ALL_H = $(OCL_H) $(PAIR_H)
EXECS = $(BIN_DIR)/ocl_get_devices
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
$(OBJ_DIR)/charge_gpu_memory.o \
$(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
$(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
$(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
$(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
$(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
$(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
all: $(OCL_LIB) $(EXECS)
$(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom_cl.h
$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
$(OCL) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
$(OCL) -o $@ -c atomic_gpu_memory.cpp
$(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
$(OCL) -o $@ -c charge_gpu_memory.cpp
$(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
$(OBJ_DIR)/gb_gpu_cl.h: gb_gpu_kernel.cu gb_gpu_kernel_lj.cu gb_gpu_extra.h
cat gb_gpu_extra.h gb_gpu_kernel.cu > $(OBJ_DIR)/gb_gpu_kernel.tar; \
cat gb_gpu_extra.h gb_gpu_kernel_lj.cu > $(OBJ_DIR)/gb_gpu_kernel_lj.tar; \
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar $(OBJ_DIR)/gb_gpu_cl.h; \
rm -f $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar
$(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h
$(OCL) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
$(OCL) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_cl.h;
$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_cl.h;
$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_cl.h;
$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h;
$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_cl.h;
$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
$(OCL_LIB): $(OBJS) $(PTXS)
$(AR) -crusv $(OCL_LIB) $(OBJS)
opencl: $(OCL_EXECS)
clean:
rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo

@@ -12,7 +12,7 @@
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
@@ -20,57 +20,91 @@
GENERAL NOTES
This library, libgpu.a, provides routines for GPU acceleration
of LAMMPS pair styles. Currently, only CUDA enabled GPUs are
supported. Compilation of this library requires installing the CUDA
GPU driver and CUDA toolkit for your operating system. In addition to
the LAMMPS library, the binary nvc_get_devices will also be
built. This can be used to query the names and properties of GPU
devices on your system.
of LAMMPS pair styles. Compilation of this library requires
installing the CUDA GPU driver and CUDA toolkit for your operating
system. In addition to the LAMMPS library, the binary nvc_get_devices
will also be built. This can be used to query the names and
properties of GPU devices on your system. A Makefile for OpenCL
compilation is provided, but OpenCL use is not currently supported
by the developers.
NOTE: Installation of the CUDA SDK is not required.
Current pair styles supporting GPU acceleration:
1. lj/cut/gpu
2. gayberne/gpu
2. lj/cut/coul/cut/gpu
3. lj/cut/coul/long/gpu
4. lj96/cut/gpu
5. gayberne/gpu
6. cmm/cg/gpu
7. cmm/cg/coul/long/gpu
MULTIPLE LAMMPS PROCESSES
When using GPU acceleration, you are restricted to one physical GPU
per LAMMPS process. This can be multiple GPUs on a single node or
across multiple nodes. Intructions on GPU assignment can be found in
the LAMMPS documentation.
SPEEDUPS
The speedups that can be obtained using this library are highly
dependent on the GPU architecture and the computational expense of the
pair potential. When comparing a single precision Tesla C1060 run to a
serial Intel Xeon 5140 2.33 GHz run, the speedup is ~4.42x for
lj/cut with a cutoff of 2.5. For gayberne with a cutoff of 7, the
speedup is >103x for 8000 particles. The speedup will improve with an
increase in the number of particles or an increase in the cutoff.
Multiple LAMMPS MPI processes can share GPUs on the system, but multiple
GPUs cannot be utilized by a single MPI process. In many cases, the
best performance will be obtained by running as many MPI processes as
there are CPU cores available, with the condition that the number of MPI
processes is an integer multiple of the number of GPUs being used. See the
LAMMPS user manual for details on running with GPU acceleration.
BUILDING AND PRECISION MODES
To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK variables for
your machine. Type make. Additionally, the GPU package must be installed and
compiled for LAMMPS. The library supports 3 precision modes as determined by
the CUDA_PREC variable:
To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME, NVCC, CUDA_INCLUDE,
CUDA_LIB and CUDA_OPTS variables in one of the Makefiles. CUDA_ARCH should
be set based on the compute capability of your GPU. This can be verified by
running the nvc_get_devices executable after the build is complete.
Additionally, the GPU package must be installed and compiled for LAMMPS.
This may require editing the gpu_SYSPATH variable in the LAMMPS makefile.
Please note that the GPU library accesses the CUDA driver library directly,
so it needs to be linked not only to the CUDA runtime library (libcudart.so)
that ships with the CUDA toolkit, but also to the CUDA driver library
(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS
on the head node of a GPU cluster, this library may not be installed,
so you may need to copy it over from one of the compute nodes (best into
this directory).
The gpu library supports 3 precision modes as determined by
the CUDA_PRECISION variable:
CUDA_PRECISION = -D_SINGLE_SINGLE # Single precision for all calculations
CUDA_PRECISION = -D_DOUBLE_DOUBLE # Double precision for all calculations
CUDA_PRECISION = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
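For orientation, these defines select the coordinate/force type and the
accumulator type that the library sources below refer to as PRECISION and
ACC_PRECISION (see pair_gpu_precision.h, listed in PAIR_H). A minimal sketch
of one plausible mapping is shown here; the typedef form and exact contents
of that header are assumptions, since it is not part of this listing:

  /* sketch only: the real definitions live in pair_gpu_precision.h */
  #ifdef _SINGLE_SINGLE
  typedef float  PRECISION;      /* positions, forces          */
  typedef float  ACC_PRECISION;  /* energy/virial accumulation */
  #elif defined(_SINGLE_DOUBLE)
  typedef float  PRECISION;
  typedef double ACC_PRECISION;
  #elif defined(_DOUBLE_DOUBLE)
  typedef double PRECISION;
  typedef double ACC_PRECISION;
  #endif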
NOTE: For the lj/cut pair style, only single precision will be used, even
if double precision is specified.
NOTE: Double precision is only supported on certain GPUS (with
NOTE: Double precision is only supported on certain GPUs (with
compute capability>=1.3).
NOTE: For Tesla and other graphics cards with compute capability>=1.3,
make sure that -arch=sm_13 is set on the CUDA_ARCH line.
NOTE: For Fermi, make sure that -arch=sm_20 is set on the CUDA_ARCH line.
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
package has been installed before installing the GPU package in LAMMPS.
NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be
installed if the USER-CG-CMM package has been installed before
installing the GPU package in LAMMPS.
NOTE: The lj/cut/coul/long/gpu and cg/cmm/coul/long/gpu styles will only be
installed if the KSPACE package has been installed before installing
the GPU package in LAMMPS.
EXAMPLE BUILD PROCESS
cd ~/lammps/lib/gpu
emacs Makefile.linux
make -f Makefile.linux
./nvc_get_devices
cd ../../src
emacs ./MAKE/Makefile.linux
make yes-asphere
make yes-kspace
make yes-gpu
make linux
------------------------------------------------------------------------
Last merge with gpulammps: r561 on 2010-11-12
------------------------------------------------------------------------

@@ -0,0 +1,262 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "atomic_gpu_memory.h"
#define AtomicGPUMemoryT AtomicGPUMemory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
}
template <class numtyp, class acctyp>
AtomicGPUMemoryT::~AtomicGPUMemory() {
}
template <class numtyp, class acctyp>
int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
return false;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
return true;
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *boxlo,
double *boxhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success) {
acc_timers();
if (inum_full==0) {
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
acc_timers();
if (inum_full==0) {
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
atom->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
boxlo, boxhi, tag, nspecial, special, success);
if (!success)
return NULL;
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
hd_balancer.stop_timer();
return device->nbor.host_nbor.begin();
}
template <class numtyp, class acctyp>
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(AtomicGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
_compiled=true;
}
template class AtomicGPUMemory<PRECISION,ACC_PRECISION>;

lib/gpu/atomic_gpu_memory.h Normal file
@@ -0,0 +1,180 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef ATOMIC_GPU_MEMORY_H
#define ATOMIC_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
template <class numtyp, class acctyp>
class AtomicGPUMemory {
public:
AtomicGPUMemory();
virtual ~AtomicGPUMemory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success))
pos_tex.bind_float(atom->dev_x,4);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int timestep, const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
protected:
bool _compiled;
int _block_size;
double _max_bytes, _max_an_bytes;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
#endif
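The lj_cut, lj96_cut, ljc_cut, ljcl_cut, cmm_cut, and cmmc_long "memory" classes
added in this commit depend on atomic_gpu_memory.o or charge_gpu_memory.o in the
Makefiles above and appear to build on this class or the analogous
ChargeGPUMemory class that follows. As a rough illustrative sketch only (the
class name, kernel-string name, and empty loop body below are hypothetical, not
code from this commit), a derived pair style passes its kernel source string to
init_atomic() and implements the pure virtual loop():

  #include "atomic_gpu_memory.h"

  // Hypothetical kernel source string, e.g. from a generated *_ptx.h or
  // *_cl.h header produced by geryon/file_to_cstr.sh.
  extern const char *example_pair_ptx;

  template <class numtyp, class acctyp>
  class ExamplePairGPU : public AtomicGPUMemory<numtyp,acctyp> {
   public:
    bool init(const int nlocal, const int nall, const int max_nbors,
              const int maxspecial, const double cell_size,
              const double gpu_split, FILE *screen) {
      // The base class sets up the device, atom/neighbor storage, the
      // host-device load balancer, and compiles the pair kernels.
      return this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
                               gpu_split,screen,example_pair_ptx);
    }
   protected:
    // Required by the base class: launch k_pair / k_pair_fast for one
    // force/energy evaluation using the current atom and neighbor data.
    void loop(const bool eflag, const bool vflag) { /* kernel launch here */ }
  };

The per-style wrapper files (lj_cut_gpu.cpp and friends) then presumably expose
plain functions that construct such an object and forward the LAMMPS pair style
calls to its init() and compute() members.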

@@ -0,0 +1,270 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "charge_gpu_memory.h"
#define ChargeGPUMemoryT ChargeGPUMemory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
}
template <class numtyp, class acctyp>
ChargeGPUMemoryT::~ChargeGPUMemory() {
}
template <class numtyp, class acctyp>
int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
return false;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
return true;
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *boxlo,
double *boxhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q) {
acc_timers();
if (inum_full==0) {
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
host_start=inum;
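// Reneighbor step: copy the neighbor list built on the host to the device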
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_other_data();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double *host_q) {
acc_timers();
if (inum_full==0) {
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
atom->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
boxlo, boxhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_q_data(host_q);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_other_data();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
hd_balancer.stop_timer();
return device->nbor.host_nbor.begin();
}
template <class numtyp, class acctyp>
double ChargeGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(ChargeGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
_compiled=true;
}
template class ChargeGPUMemory<PRECISION,ACC_PRECISION>;

183
lib/gpu/charge_gpu_memory.h Normal file

@ -0,0 +1,183 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CHARGE_GPU_MEMORY_H
#define CHARGE_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
template <class numtyp, class acctyp>
class ChargeGPUMemory {
public:
ChargeGPUMemory();
virtual ~ChargeGPUMemory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int timestep, const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double *charge);
/// Pair loop with device neighboring
int * compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled;
int _block_size;
double _max_bytes, _max_an_bytes;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
#endif
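For orientation, a minimal sketch (not part of this commit) of how a derived charge pair style is expected to reuse this class; the class name MyChargePair, the 300-row initial neighbor guess, and maxspecial=0 are illustrative assumptions only.
// Hypothetical sketch: a derived style supplies its kernel source and a
// loop() implementation, then reuses the generic plumbing in this class.
#include <cstdio>
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class MyChargePair : public ChargeGPUMemory<numtyp,acctyp> {
 public:
  bool init(const int nlocal, const int nall, const double cell_size,
            const double gpu_split, FILE *screen, const char *kernel_src) {
    // 300 is just an initial guess for rows in the neighbor matrix;
    // maxspecial=0 means no special-bond exclusions handled on the device.
    return this->init_atomic(nlocal,nall,300,0,cell_size,gpu_split,
                             screen,kernel_src);
  }
 private:
  void loop(const bool eflag, const bool vflag) {
    // A real style enqueues k_pair / k_pair_fast here with its own
    // coefficient arrays, as CMML_GPU_Memory::loop does later in this commit.
  }
};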

124
lib/gpu/cmm_cut_gpu.cpp Normal file

@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmm_cut_gpu_memory.h"
using namespace std;
static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
CMMMF.clear();
gpu_mode=CMMMF.device->gpu_mode();
double gpu_split=CMMMF.device->particle_split();
int first_gpu=CMMMF.device->first_device();
int last_gpu=CMMMF.device->last_device();
int world_me=CMMMF.device->world_me();
int gpu_rank=CMMMF.device->gpu_rank();
int procs_per_gpu=CMMMF.device->procs_per_gpu();
CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
MPI_Barrier(CMMMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void cmm_gpu_clear() {
CMMMF.clear();
}
int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
}
void cmm_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double cmm_gpu_bytes() {
return CMMMF.host_memory_usage();
}
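A rough caller-side sketch, again an assumption for illustration: the owning pair style initializes once through cmm_gpu_init, then calls one of the two compute entry points each step depending on whether neighboring is done on the device or on the host.
// Hypothetical caller-side sketch (not part of this commit). All values come
// from the owning LAMMPS pair style; only the call order matters here.
void cmm_gpu_step(bool device_neighboring, int timestep, int ago,
                  int inum_full, int nall, double **x, int *type,
                  double *boxlo, double *boxhi, int *tag, int **nspecial,
                  int **special, int *ilist, int *numj, int **firstneigh,
                  bool eflag, bool vflag, double cpu_time) {
  int host_start;
  bool success;
  if (device_neighboring) {
    // Neighbor list is built on the GPU and returned for the host to reuse
    int *list = cmm_gpu_compute_n(timestep, ago, inum_full, nall, x, type,
                                  boxlo, boxhi, tag, nspecial, special,
                                  eflag, vflag, false, false, host_start,
                                  cpu_time, success);
    (void)list;
  } else {
    // Neighbor list built on the host is copied to the GPU as needed
    cmm_gpu_compute(timestep, ago, inum_full, nall, x, type, ilist, numj,
                    firstneigh, eflag, vflag, false, false, host_start,
                    cpu_time, success);
  }
}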


@ -0,0 +1,296 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMM_GPU_KERNEL
#define CMM_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes this thread's local atom in the neighbor list
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].x) {
r2inv=(numtyp)1.0/r2inv;
numtyp inv1,inv2;
if (lj1[mtype].y == 2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == 1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes this thread's local atom in the neighbor list
int ii=THREAD_ID_X;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (eflag>0)
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].x) {
r2inv=(numtyp)1.0/r2inv;
numtyp inv1,inv2;
if (lj1[mtype].y == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmm_cut_gpu_cl.h"
#else
#include "cmm_cut_gpu_ptx.h"
#endif
#include "cmm_cut_gpu_memory.h"
#include <cassert>
#define CMM_GPU_MemoryT CMM_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMM_GPU_MemoryT::CMM_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
CMM_GPU_MemoryT::~CMM_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmm_cut_gpu_kernel);
// If atom type constants fit in shared memory use fast kernel
int cmm_types=ntypes;
shared_types=false;
if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
cmm_types=MAX_SHARED_TYPES;
shared_types=true;
}
_cmm_types=cmm_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<cmm_types*cmm_types; i++)
host_write[i]=0.0;
lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
host_cg_type,host_lj1,host_lj2);
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
}
template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMM_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMM_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int ainum=this->atom->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();
}
template class CMM_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMM_GPU_MEMORY_H
#define CMM_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
CMM_GPU_Memory();
~CMM_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y=cg_type, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _cmm_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif
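To make the packed coefficient layout documented above concrete, here is a plain CPU illustration under the assumption that coefficients are stored row-major per type pair; it is not the actual type_pack4 routine, and the struct name coeff4 is only a stand-in for the device numtyp4 type.
// Illustration only: pack per-type-pair tables into one vec4 entry per (i,j).
struct coeff4 { float x, y, z, w; };
void pack_lj1(int ntypes, int row_types, double **cutsq, int **cg_type,
              double **lj1, double **lj2, coeff4 *out) {
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++) {
      coeff4 &c = out[i * row_types + j];
      c.x = static_cast<float>(cutsq[i][j]);    // squared cutoff
      c.y = static_cast<float>(cg_type[i][j]);  // CG/CMM exponent flag
      c.z = static_cast<float>(lj1[i][j]);      // force prefactor 1
      c.w = static_cast<float>(lj2[i][j]);      // force prefactor 2
    }
}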

130
lib/gpu/cmmc_long_gpu.cpp Normal file

@ -0,0 +1,130 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmmc_long_gpu_memory.h"
using namespace std;
static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
CMMLMF.clear();
gpu_mode=CMMLMF.device->gpu_mode();
double gpu_split=CMMLMF.device->particle_split();
int first_gpu=CMMLMF.device->first_device();
int last_gpu=CMMLMF.device->last_device();
int world_me=CMMLMF.device->world_me();
int gpu_rank=CMMLMF.device->gpu_rank();
int procs_per_gpu=CMMLMF.device->procs_per_gpu();
CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e,g_ewald);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
if (!init_ok)
return false;
}
MPI_Barrier(CMMLMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void cmml_gpu_clear() {
CMMLMF.clear();
}
int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
}
void cmml_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
}
double cmml_gpu_bytes() {
return CMMLMF.host_memory_usage();
}


@ -0,0 +1,378 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMML_GPU_KERNEL
#define CMML_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes this thread's local atom in the neighbor list
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp e_coul=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
numtyp factor_lj, factor_coul;
if (j < nall) {
factor_lj = (numtyp)1.0;
factor_coul = (numtyp)0.0;
} else {
factor_lj = sp_lj[j/nall];
factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes this thread's local atom in the neighbor list
int ii=THREAD_ID_X;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp e_coul=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
numtyp factor_lj, factor_coul;
if (j < nall) {
factor_lj = (numtyp)1.0;
factor_coul = (numtyp)0.0;
} else {
factor_lj = sp_lj[j/nall];
factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
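Both kernels above evaluate erfc through the polynomial fit behind EWALD_P and A1..A5. A small host-side check, an assumption written for illustration with std::erfc as the reference, shows the same formula and how closely it tracks the library function.
// Host-side sketch (illustration only): the erfc approximation used in the
// kernels above, checked against std::erfc. Constants match the #defines.
#include <cmath>
#include <cstdio>
static double erfc_approx(double x) {
  const double EWALD_P = 0.3275911;
  const double A1 = 0.254829592, A2 = -0.284496736, A3 = 1.421413741;
  const double A4 = -1.453152027, A5 = 1.061405429;
  double t = 1.0 / (1.0 + EWALD_P * x);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * std::exp(-x * x);
}
int main() {
  for (double x = 0.1; x < 3.0; x += 0.5)
    printf("x=%.1f  approx=%.7f  erfc=%.7f\n", x, erfc_approx(x), std::erfc(x));
  return 0;
}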


@ -0,0 +1,164 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmmc_long_gpu_cl.h"
#else
#include "cmmc_long_gpu_ptx.h"
#endif
#include "cmmc_long_gpu_memory.h"
#include <cassert>
#define CMML_GPU_MemoryT CMML_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMML_GPU_MemoryT::CMML_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CMML_GPU_MemoryT::~CMML_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_long_gpu_kernel);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq,
host_cut_ljsq,host_lj1,host_lj2);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3,
host_lj4,host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
}
template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMML_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMML_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int ainum=this->atom->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
}
this->time_pair.stop();
}
template class CMML_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMML_GPU_MEMORY_H
#define CMML_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CMML_GPU_Memory();
~CMML_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, int ** cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2,
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -0,0 +1,5 @@
This is a stripped-down and customized version
of the CUDPP (CUDA Data Parallel Primitives) library
for use with the GPU package in LAMMPS.
Don't use it for anything else; get the real thing
from http://code.google.com/p/cudpp/ instead!


@ -0,0 +1,337 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include <cudpp_globals.h>
#include "cudpp_radixsort.h"
#include "cta/scan_cta.cu"
#include <cudpp.h>
#include <stdio.h>
#include <cudpp_util.h>
#include <math.h>
#include "sharedmem.h"
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif
/**
* @file
* sort_cta.cu
*
* @brief CUDPP CTA-level sort routines
*/
/** \addtogroup cudpp_cta
* @{
*/
/** @name Radix Sort Functions
* @{
*/
typedef unsigned int uint;
/**
* @brief Flips bits of single-precision floating-point number (parameterized by doFlip)
*
* flip a float for sorting
* finds SIGN of fp number.
* if it's 1 (negative float), it flips all bits
* if it's 0 (positive float), it flips the sign only
* @param[in] f floating-point input (passed as unsigned int)
* @see floatUnflip
**/
template <bool doFlip>
__device__ uint floatFlip(uint f)
{
if (doFlip)
{
uint mask = -int(f >> 31) | 0x80000000;
return f ^ mask;
}
else
return f;
}
/**
* @brief Reverses bit-flip of single-precision floating-point number (parameterized by doFlip)
*
* flip a float back (invert FloatFlip)
* signed was flipped from above, so:
* if sign is 1 (negative), it flips the sign bit back
* if sign is 0 (positive), it flips all bits back
* @param[in] f floating-point input (passed as unsigned int)
* @see floatFlip
**/
template <bool doFlip>
__device__ uint floatUnflip(uint f)
{
if (doFlip)
{
uint mask = ((f >> 31) - 1) | 0x80000000;
return f ^ mask;
}
else
return f;
}
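A host-side sketch (an illustrative assumption, not library code) of why the flip works: after floatFlip, plain unsigned comparison orders IEEE-754 floats correctly, negatives included, and floatUnflip restores the original bits.
// Host-side sketch (illustration only) of the float flip used above.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
static uint32_t float_flip(uint32_t f) {
  uint32_t mask = -(int32_t)(f >> 31) | 0x80000000u;   // all bits if negative
  return f ^ mask;
}
static uint32_t float_unflip(uint32_t f) {
  uint32_t mask = ((f >> 31) - 1u) | 0x80000000u;      // inverse of the above
  return f ^ mask;
}
int main() {
  float vals[4] = {-2.5f, -0.5f, 0.25f, 3.0f};
  uint32_t keys[4];
  for (int i = 0; i < 4; i++) {
    memcpy(&keys[i], &vals[i], sizeof(float));
    keys[i] = float_flip(keys[i]);
  }
  std::sort(keys, keys + 4);                 // plain unsigned sort
  for (int i = 0; i < 4; i++) {
    uint32_t u = float_unflip(keys[i]);
    float f; memcpy(&f, &u, sizeof(float));
    printf("%g\n", f);                       // prints -2.5 -0.5 0.25 3
  }
  return 0;
}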
/**
* @brief Scans one warp quickly, optimized for 32-element warps, using shared memory
*
* Scans each warp in parallel ("warp-scan"), one element per thread.
* Uses 2 elements of shared memory per thread (64 elements per warp).
*
* @param[in] val This thread's input value
* @param[in,out] sData Shared memory workspace (2 * WARP_SIZE entries per warp)
**/
template<class T, int maxlevel>
__device__ T scanwarp(T val, volatile T* sData)
{
// The following is the same as 2 * WARP_SIZE * warpId + threadInWarp =
// 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE - 1))
int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
sData[idx] = 0;
idx += WARP_SIZE;
T t = sData[idx] = val; __EMUSYNC;
#ifdef __DEVICE_EMULATION__
t = sData[idx - 1]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
t = sData[idx - 2]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
t = sData[idx - 4]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
t = sData[idx - 8]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
t = sData[idx - 16]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
#else
if (0 <= maxlevel) { sData[idx] = t = t + sData[idx - 1]; } __EMUSYNC;
if (1 <= maxlevel) { sData[idx] = t = t + sData[idx - 2]; } __EMUSYNC;
if (2 <= maxlevel) { sData[idx] = t = t + sData[idx - 4]; } __EMUSYNC;
if (3 <= maxlevel) { sData[idx] = t = t + sData[idx - 8]; } __EMUSYNC;
if (4 <= maxlevel) { sData[idx] = t = t + sData[idx -16]; } __EMUSYNC;
#endif
return sData[idx] - val; // convert inclusive -> exclusive
}
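For reference, the warp scan above returns an exclusive prefix sum over the WARP_SIZE values held by one warp; a serial sketch (illustration only, assuming a 32-wide warp and a sum scan) of the same result:
// CPU reference (illustration only) of the exclusive warp scan above.
static const int WARP_SIZE = 32;  // assumed warp width, as in the kernel
void exclusive_warp_scan(const unsigned int *in, unsigned int *out) {
  unsigned int running = 0;
  for (int i = 0; i < WARP_SIZE; i++) {
    out[i] = running;      // value before adding in[i]: exclusive result
    running += in[i];
  }
}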
/**
* @brief Scans 4*CTA_SIZE unsigned ints in a block
*
* scan4 scans 4*CTA_SIZE elements in a block (4 per
* thread), using a warp-scan algorithm
*
* @param[in] idata 4-vector of integers to scan
**/
__device__ uint4 scan4(uint4 idata)
{
extern __shared__ uint ptr[];
uint idx = threadIdx.x;
uint4 val4 = idata;
uint sum[3];
sum[0] = val4.x;
sum[1] = val4.y + sum[0];
sum[2] = val4.z + sum[1];
uint val = val4.w + sum[2];
val = scanwarp<uint, 4>(val, ptr);
__syncthreads();
if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
{
ptr[idx >> 5] = val + val4.w + sum[2];
}
__syncthreads();
#ifndef __DEVICE_EMULATION__
if (idx < WARP_SIZE)
#endif
{
ptr[idx] = scanwarp<uint, 2>(ptr[idx], ptr);
}
__syncthreads();
val += ptr[idx >> 5];
val4.x = val;
val4.y = val + sum[0];
val4.z = val + sum[1];
val4.w = val + sum[2];
return val4;
}
/**
* @brief Computes output position for each thread given predicate; trues come first then falses
*
* Rank is the core of the radix sort loop. Given a predicate, it
* computes the output position for each thread in an ordering where all
* True threads come first, followed by all False threads.
* This version handles 4 predicates per thread; hence, "rank4".
*
* @param[in] preds true/false values for each of the 4 elements in this thread
*
* @todo is the description of "preds" correct?
**/
template <int ctasize>
__device__ uint4 rank4(uint4 preds)
{
uint4 address = scan4(preds);
__shared__ uint numtrue;
if (threadIdx.x == ctasize-1)
{
numtrue = address.w + preds.w;
}
__syncthreads();
uint4 rank;
uint idx = threadIdx.x << 2;
rank.x = (preds.x) ? address.x : numtrue + idx - address.x;
rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
return rank;
}
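Written out serially (illustration only), the rank is a stable split: true elements are packed first in their original order, false elements follow, which is exactly what the per-bit radix pass below relies on.
// CPU reference (illustration only) of the rank computation above: stable
// split of elements by predicate, trues first, falses after.
#include <cstddef>
#include <vector>
std::vector<unsigned int> rank(const std::vector<bool> &pred) {
  unsigned int numtrue = 0;
  for (bool p : pred) if (p) numtrue++;
  std::vector<unsigned int> r(pred.size());
  unsigned int t = 0, f = 0;
  for (std::size_t i = 0; i < pred.size(); i++)
    r[i] = pred[i] ? t++ : numtrue + f++;
  return r;
}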
/**
* @brief Sorts one block
*
* Uses rank to sort one bit at a time: Sorts a block according
* to bits startbit -> nbits + startbit
* @param[in,out] key
* @param[in,out] value
**/
template<uint nbits, uint startbit>
__device__ void radixSortBlock(uint4 &key, uint4 &value)
{
extern __shared__ uint sMem1[];
for(uint shift = startbit; shift < (startbit + nbits); ++shift)
{
uint4 lsb;
lsb.x = !((key.x >> shift) & 0x1);
lsb.y = !((key.y >> shift) & 0x1);
lsb.z = !((key.z >> shift) & 0x1);
lsb.w = !((key.w >> shift) & 0x1);
uint4 r = rank4<256>(lsb);
#if 1
// This arithmetic strides the ranks across 4 SORT_CTA_SIZE regions
sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x;
sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y;
sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z;
sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w;
__syncthreads();
// The above allows us to read without 4-way bank conflicts:
key.x = sMem1[threadIdx.x];
key.y = sMem1[threadIdx.x + SORT_CTA_SIZE];
key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE];
key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE];
__syncthreads();
sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = value.x;
sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = value.y;
sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = value.z;
sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = value.w;
__syncthreads();
value.x = sMem1[threadIdx.x];
value.y = sMem1[threadIdx.x + SORT_CTA_SIZE];
value.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE];
value.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE];
#else
sMem1[r.x] = key.x;
sMem1[r.y] = key.y;
sMem1[r.z] = key.z;
sMem1[r.w] = key.w;
__syncthreads();
// This access has 4-way bank conflicts
key = sMem[threadIdx.x];
__syncthreads();
sMem1[r.x] = value.x;
sMem1[r.y] = value.y;
sMem1[r.z] = value.z;
sMem1[r.w] = value.w;
__syncthreads();
value = sMem[threadIdx.x];
#endif
__syncthreads();
}
}
/**
* @brief Sorts one block. Key-only version.
*
* Uses rank to sort one bit at a time: Sorts a block according
* to bits startbit -> nbits + startbit
* @param[in,out] key
**/
template<uint nbits, uint startbit>
__device__ void radixSortBlockKeysOnly(uint4 &key)
{
extern __shared__ uint sMem1[];
for(uint shift = startbit; shift < (startbit + nbits); ++shift)
{
uint4 lsb;
lsb.x = !((key.x >> shift) & 0x1);
lsb.y = !((key.y >> shift) & 0x1);
lsb.z = !((key.z >> shift) & 0x1);
lsb.w = !((key.w >> shift) & 0x1);
uint4 r = rank4<256>(lsb);
#if 1
// This arithmetic strides the ranks across 4 CTA_SIZE regions
sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x;
sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y;
sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z;
sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w;
__syncthreads();
// The above allows us to read without 4-way bank conflicts:
key.x = sMem1[threadIdx.x];
key.y = sMem1[threadIdx.x + SORT_CTA_SIZE];
key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE];
key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE];
#else
sMem1[r.x] = key.x;
sMem1[r.y] = key.y;
sMem1[r.z] = key.z;
sMem1[r.w] = key.w;
__syncthreads();
// This access has 4-way bank conflicts
key = sMem[threadIdx.x];
#endif
__syncthreads();
}
}
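A serial sketch of the same block-level idea (illustration only): process bits startbit..startbit+nbits-1 one at a time, stably splitting the keys so that zero bits come first, which yields an ascending order over the selected bit range.
// CPU reference (illustration only) of the per-bit block sort above.
#include <vector>
void radix_sort_bits(std::vector<unsigned int> &keys,
                     unsigned int startbit, unsigned int nbits) {
  for (unsigned int shift = startbit; shift < startbit + nbits; shift++) {
    std::vector<unsigned int> zeros, ones;
    for (unsigned int k : keys)
      ((k >> shift) & 1u) ? ones.push_back(k) : zeros.push_back(k);
    keys = zeros;                                 // zero bits first (stable)
    keys.insert(keys.end(), ones.begin(), ones.end());
  }
}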
/** @} */ // end radix sort functions
/** @} */ // end cudpp_cta


@ -0,0 +1,619 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* scan_cta.cu
*
* @brief CUDPP CTA-level scan routines
*/
/** \defgroup cudpp_cta CUDPP CTA-Level API
* The CUDPP CTA-Level API contains functions that run on the GPU
* device. These are CUDA \c __device__ functions that are called
* from within other CUDA device functions (typically
* \link cudpp_kernel CUDPP Kernel-Level API\endlink functions).
* They are called CTA-level functions because they typically process
* data "owned" by each CTA within shared memory, and are agnostic of
* any other CTAs that may be running (or how many CTAs are running),
* other than to compute appropriate global memory addresses.
* @{
*/
/** @name Scan Functions
* @{
*/
#include <cudpp_globals.h>
#include <cudpp_util.h>
#include <math.h>
#include <cudpp.h>
/**
* @brief Macro to insert necessary __syncthreads() in device emulation mode
*/
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif
/**
* @brief Template class containing compile-time parameters to the scan functions
*
* ScanTraits is passed as a template parameter to all scan functions. By
* using these compile-time functions we can enable generic code while
* maintaining the highest performance. This is crucial for the performance
* of low-level workhorse algorithms like scan.
*
* @param T The datatype of the scan
* @param oper The ::CUDPPOperator to use for the scan (add, max, etc.)
* @param multiRow True if this is a multi-row scan
* @param unroll True if scan inner loops should be unrolled
* @param sums True if each block should write its sum to the d_blockSums array (false for single-block scans)
* @param backward True if this is a backward scan
* @param fullBlock True if all blocks in this scan are full (CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements)
* @param exclusive True for exclusive scans, false for inclusive scans
*/
template <class T, CUDPPOperator oper, bool backward, bool exclusive,
bool multiRow, bool sums, bool fullBlock>
class ScanTraits
{
public:
//! Returns true if this is a backward scan
static __device__ bool isBackward() { return backward; };
//! Returns true if this is an exclusive scan
static __device__ bool isExclusive() { return exclusive; };
//! Returns true if this a multi-row scan.
static __device__ bool isMultiRow() { return multiRow; };
//! Returns true if this scan writes the sum of each block to the d_blockSums array (multi-block scans)
static __device__ bool writeSums() { return sums; };
//! Returns true if this is a full scan -- all blocks process CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements
static __device__ bool isFullBlock() { return fullBlock; };
//! The operator function used for the scan
static __device__ T op(const T a, const T b)
{
return Operator<T, oper>::op(a, b);
}
//! The identity value used by the scan
static __device__ T identity() { return Operator<T, oper>::identity(); }
};
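// Illustrative note (not from the CUDPP sources): for example, a forward,
// exclusive, single-row float sum-scan whose blocks write their totals to
// d_blockSums and always process full blocks would be instantiated as
//
//     ScanTraits<float, CUDPP_ADD, false, true, false, true, true>
//
// (template order: T, oper, backward, exclusive, multiRow, sums, fullBlock)
// and passed as the "traits" parameter of the CTA-level routines below.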
//! This is used to insert syncthreads to avoid perf loss caused by 128-bit
//! load overlap that happens on G80. This gives about a 15% boost on scans on
//! G80.
//! @todo Parameterize this in case this perf detail changes on future GPUs.
#define DISALLOW_LOADSTORE_OVERLAP 1
/**
* @brief Handles loading input data from global memory to shared memory
* (vec4 version)
*
* Load a chunk of 8*blockDim.x elements from global memory into a
* shared memory array. Each thread loads two T4 elements (where
* T4 is, e.g. int4 or float4), computes the scan of those two vec4s in
* thread local arrays (in registers), and writes the two total sums of the
* vec4s into shared memory, where they will be cooperatively scanned with
* the other partial sums by all threads in the CTA.
*
* @param[out] s_out The output (shared) memory array
* @param[out] threadScan0 Intermediate per-thread partial sums array 1
* @param[out] threadScan1 Intermediate per-thread partial sums array 2
* @param[in] d_in The input (device) memory array
* @param[in] numElements The number of elements in the array being scanned
* @param[in] iDataOffset the offset of the input array in global memory for this
* thread block
* @param[out] ai The shared memory address for the thread's first element
* (returned for reuse)
* @param[out] bi The shared memory address for the thread's second element
* (returned for reuse)
* @param[out] aiDev The device memory address for this thread's first element
* (returned for reuse)
* @param[out] biDev The device memory address for this thread's second element
* (returned for reuse)
*/
template <class T, class traits>
__device__ void loadSharedChunkFromMem4(T *s_out,
T threadScan0[4],
T threadScan1[4],
const T *d_in,
int numElements,
int iDataOffset,
int &ai,
int &bi,
int &aiDev,
int &biDev)
{
int thid = threadIdx.x;
aiDev = iDataOffset + thid;
biDev = aiDev + blockDim.x;
// convert to 4-vector
typename typeToVector<T,4>::Result tempData;
typename typeToVector<T,4>::Result* inData = (typename typeToVector<T,4>::Result*)d_in;
ai = thid;
bi = thid + blockDim.x;
// read into tempData;
if (traits::isBackward())
{
int i = aiDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
tempData = inData[aiDev];
threadScan0[3] = tempData.w;
threadScan0[2] = traits::op(tempData.z, threadScan0[3]);
threadScan0[1] = traits::op(tempData.y, threadScan0[2]);
threadScan0[0] = s_out[ai]
= traits::op(tempData.x, threadScan0[1]);
}
else
{
threadScan0[3] = traits::identity();
threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[3]);
threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[2]);
threadScan0[0] = s_out[ai]
= traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan0[1]);
}
#ifdef DISALLOW_LOADSTORE_OVERLAP
__syncthreads();
#endif
i = biDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
tempData = inData[biDev];
threadScan1[3] = tempData.w;
threadScan1[2] = traits::op(tempData.z, threadScan1[3]);
threadScan1[1] = traits::op(tempData.y, threadScan1[2]);
threadScan1[0] = s_out[bi]
= traits::op(tempData.x, threadScan1[1]);
}
else
{
threadScan1[3] = traits::identity();
threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[3]);
threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[2]);
threadScan1[0] = s_out[bi]
= traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan1[1]);
}
__syncthreads();
// reverse s_data in shared memory
if (ai < CTA_SIZE)
{
unsigned int leftIdx = ai;
unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;
if (leftIdx < rightIdx)
{
T tmp = s_out[leftIdx];
s_out[leftIdx] = s_out[rightIdx];
s_out[rightIdx] = tmp;
}
}
__syncthreads();
}
else
{
int i = aiDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
tempData = inData[aiDev];
threadScan0[0] = tempData.x;
threadScan0[1] = traits::op(tempData.y, threadScan0[0]);
threadScan0[2] = traits::op(tempData.z, threadScan0[1]);
threadScan0[3] = s_out[ai]
= traits::op(tempData.w, threadScan0[2]);
}
else
{
threadScan0[0] = (i < numElements) ? d_in[i] : traits::identity();
threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[0]);
threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[1]);
threadScan0[3] = s_out[ai]
= traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan0[2]);
}
#ifdef DISALLOW_LOADSTORE_OVERLAP
__syncthreads();
#endif
i = biDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
tempData = inData[biDev];
threadScan1[0] = tempData.x;
threadScan1[1] = traits::op(tempData.y, threadScan1[0]);
threadScan1[2] = traits::op(tempData.z, threadScan1[1]);
threadScan1[3] = s_out[bi]
= traits::op(tempData.w, threadScan1[2]);
}
else
{
threadScan1[0] = (i < numElements) ? d_in[i] : traits::identity();
threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[0]);
threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[1]);
threadScan1[3] = s_out[bi]
= traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan1[2]);
}
__syncthreads();
}
}
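/* Illustrative example (not from the CUDPP sources): for a forward ADD scan
   with tempData = (1, 2, 3, 4), the per-thread serial scan above produces
       threadScan0 = { 1, 3, 6, 10 }
   and writes the vec4 total, 10, to s_out[ai]; the block-wide scan that
   follows then only has to combine one partial sum per vec4 rather than
   every element. */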
/**
* @brief Handles storing result data from shared memory to global memory
* (vec4 version)
*
* Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory
* into a device memory array. Each thread reads two elements from shared
* memory, adds them to the intermediate sums computed in
* loadSharedChunkFromMem4(), and writes two T4 elements (where
* T4 is, e.g. int4 or float4) to global memory.
*
* @param[out] d_out The output (device) memory array
* @param[in] threadScan0 Intermediate per-thread partial sums array 1
* (contents computed in loadSharedChunkFromMem4())
* @param[in] threadScan1 Intermediate per-thread partial sums array 2
* (contents computed in loadSharedChunkFromMem4())
* @param[in] s_in The input (shared) memory array
* @param[in] numElements The number of elements in the array being scanned
* @param[in] oDataOffset the offset of the output array in global memory
* for this thread block
* @param[in] ai The shared memory address for the thread's first element
* (computed in loadSharedChunkFromMem4())
* @param[in] bi The shared memory address for the thread's second element
* (computed in loadSharedChunkFromMem4())
* @param[in] aiDev The device memory address for this thread's first element
* (computed in loadSharedChunkFromMem4())
* @param[in] biDev The device memory address for this thread's second element
* (computed in loadSharedChunkFromMem4())
*/
template <class T, class traits>
__device__ void storeSharedChunkToMem4(T *d_out,
T threadScan0[4],
T threadScan1[4],
T *s_in,
int numElements,
int oDataOffset,
int ai,
int bi,
int aiDev,
int biDev)
{
// Convert to 4-vector
typename typeToVector<T,4>::Result tempData;
typename typeToVector<T,4>::Result* outData = (typename typeToVector<T,4>::Result*)d_out;
// write results to global memory
if (traits::isBackward())
{
if (ai < CTA_SIZE)
{
unsigned int leftIdx = ai;
unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;
if (leftIdx < rightIdx)
{
T tmp = s_in[leftIdx];
s_in[leftIdx] = s_in[rightIdx];
s_in[rightIdx] = tmp;
}
}
__syncthreads();
T temp = s_in[ai];
if (traits::isExclusive())
{
tempData.w = temp;
tempData.z = traits::op(temp, threadScan0[3]);
tempData.y = traits::op(temp, threadScan0[2]);
tempData.x = traits::op(temp, threadScan0[1]);
}
else
{
tempData.w = traits::op(temp, threadScan0[3]);
tempData.z = traits::op(temp, threadScan0[2]);
tempData.y = traits::op(temp, threadScan0[1]);
tempData.x = traits::op(temp, threadScan0[0]);
}
int i = aiDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
outData[aiDev] = tempData;
}
else
{
if (i < numElements) { d_out[i] = tempData.x;
if (i+1 < numElements) { d_out[i+1] = tempData.y;
if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
}
#ifdef DISALLOW_LOADSTORE_OVERLAP
__syncthreads();
#endif
temp = s_in[bi];
if (traits::isExclusive())
{
tempData.w = temp;
tempData.z = traits::op(temp, threadScan1[3]);
tempData.y = traits::op(temp, threadScan1[2]);
tempData.x = traits::op(temp, threadScan1[1]);
}
else
{
tempData.w = traits::op(temp, threadScan1[3]);
tempData.z = traits::op(temp, threadScan1[2]);
tempData.y = traits::op(temp, threadScan1[1]);
tempData.x = traits::op(temp, threadScan1[0]);
}
i = biDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
outData[biDev] = tempData;
}
else
{
if (i < numElements) { d_out[i] = tempData.x;
if (i+1 < numElements) { d_out[i+1] = tempData.y;
if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
}
}
else
{
T temp;
temp = s_in[ai];
if (traits::isExclusive())
{
tempData.x = temp;
tempData.y = traits::op(temp, threadScan0[0]);
tempData.z = traits::op(temp, threadScan0[1]);
tempData.w = traits::op(temp, threadScan0[2]);
}
else
{
tempData.x = traits::op(temp, threadScan0[0]);
tempData.y = traits::op(temp, threadScan0[1]);
tempData.z = traits::op(temp, threadScan0[2]);
tempData.w = traits::op(temp, threadScan0[3]);
}
int i = aiDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
outData[aiDev] = tempData;
}
else
{
// we can't use vec4 because the original array isn't a multiple of
// 4 elements
if ( i < numElements) { d_out[i] = tempData.x;
if ((i+1) < numElements) { d_out[i+1] = tempData.y;
if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
}
#ifdef DISALLOW_LOADSTORE_OVERLAP
__syncthreads();
#endif
temp = s_in[bi];
if (traits::isExclusive())
{
tempData.x = temp;
tempData.y = traits::op(temp, threadScan1[0]);
tempData.z = traits::op(temp, threadScan1[1]);
tempData.w = traits::op(temp, threadScan1[2]);
}
else
{
tempData.x = traits::op(temp, threadScan1[0]);
tempData.y = traits::op(temp, threadScan1[1]);
tempData.z = traits::op(temp, threadScan1[2]);
tempData.w = traits::op(temp, threadScan1[3]);
}
i = biDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
outData[biDev] = tempData;
}
else
{
// we can't use vec4 because the original array isn't a multiple of
// 4 elements
if ( i < numElements) { d_out[i] = tempData.x;
if ((i+1) < numElements) { d_out[i+1] = tempData.y;
if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
}
}
}
/** @brief Scan all warps of a CTA without synchronization
*
* The warp-scan algorithm breaks a block of data into warp-sized chunks, and
* scans the chunks independently with a warp of threads each. Because warps
* execute instructions in SIMD fashion, there is no need to synchronize in
* order to share data within a warp (only across warps). Also, in SIMD the
* most efficient algorithm is a step-efficient algorithm. Therefore, within
* each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps
* to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the
* work-efficient tree-based algorithm described by Guy Blelloch [1990] that
* takes 2 * log(N) steps and is in general more complex to implement.
* Previous versions of CUDPP used the Blelloch algorithm. For current GPUs,
* the warp size is 32, so this takes five steps per warp.
*
* Each thread is responsible for a single element of the array to be scanned.
* Each thread inputs a single value to the scan via \a val and returns
* its own scanned result element. The threads of each warp cooperate
* via the shared memory array \a s_data to scan WARP_SIZE elements.
*
* Template parameter \a maxlevel allows this warpscan to be performed on
* partial warps. For example, if only the first 8 elements of each warp need
* to be scanned, then warpscan only performs log2(8)=3 steps rather than 5.
*
* The computation uses 2 * WARP_SIZE elements of shared memory per warp to
* enable warps to offset beyond their input data and receive the identity
* element without using any branch instructions.
*
* \note s_data is declared volatile here to prevent the compiler from
* optimizing away writes to shared memory, and ensure correct intrawarp
* communication in the absence of __syncthreads.
*
* @return The result of the warp scan for the current thread
* @param[in] val The current thread's input to the scan
* @param[in,out] s_data A pointer to a temporary shared array of 2*CTA_SIZE
* elements used to compute the warp scans
*/
template<class T, class traits,int maxlevel>
__device__ T warpscan(T val, volatile T* s_data)
{
// The following is the same as 2 * 32 * warpId + threadInWarp =
// 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE-1))
int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE-1));
s_data[idx] = traits::identity();
idx += WARP_SIZE;
T t = s_data[idx] = val; __EMUSYNC;
// This code is needed because the warp size of device emulation
// is only 1 thread, so sync-less cooperation within a warp doesn't
// work.
#ifdef __DEVICE_EMULATION__
t = s_data[idx - 1]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
t = s_data[idx - 2]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
t = s_data[idx - 4]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
t = s_data[idx - 8]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
t = s_data[idx - 16]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
#else
if (0 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 1]); }
if (1 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 2]); }
if (2 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 4]); }
if (3 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 8]); }
if (4 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -16]); }
#endif
return s_data[idx-1]; // convert inclusive -> exclusive
}
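/* Illustrative trace (not from the CUDPP sources): an 8-element sub-warp
   sum-scanned with three steps (offsets 1, 2, 4):
       input      : 3  1  7  0  4  1  6  3
       offset 1   : 3  4  8  7  4  5  7  9
       offset 2   : 3  4 11 11 12 12 11 14
       offset 4   : 3  4 11 11 15 16 22 25   (inclusive result)
   warpscan() returns s_data[idx - 1], i.e. the exclusive scan
       output     : 0  3  4 11 11 15 16 22
   where the leading 0 is the identity written into the lower half of s_data. */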
/** @brief Perform a full CTA scan using the warp-scan algorithm
*
* As described in the comment for warpscan(), the warp-scan algorithm breaks
* a block of data into warp-sized chunks, and scans the chunks independently
* with a warp of threads each. To complete the scan, each warp <i>j</i> then
* writes its last element to element <i>j</i> of a temporary shared array.
* Then a single warp exclusive-scans these "warp sums". Finally, each thread
* adds the result of the warp sum scan to the result of the scan from the
* first pass.
*
* Because we scan 2*CTA_SIZE elements per thread, we have to call warpscan
* twice.
*
* @param x The first input value for the current thread
* @param y The second input value for the current thread
* @param s_data Temporary shared memory space of 2*CTA_SIZE elements for
* performing the scan
*/
template <class T, class traits>
__device__ void scanWarps(T x, T y,
T *s_data)
{
T val = warpscan<T, traits, 4>(x, s_data);
__syncthreads();
T val2 = warpscan<T, traits, 4>(y, s_data);
int idx = threadIdx.x;
if ((idx & 31)==31)
{
s_data[idx >> 5] = traits::op(val, x);
s_data[(idx + blockDim.x) >> 5] = traits::op(val2, y);
}
__syncthreads();
#ifndef __DEVICE_EMULATION__
if (idx < 32)
#endif
{
s_data[idx] = warpscan<T,traits,(LOG_CTA_SIZE-LOG_WARP_SIZE+1)>(s_data[idx], s_data);
}
__syncthreads();
val = traits::op(val, s_data[idx >> 5]);
val2 = traits::op(val2, s_data[(idx + blockDim.x) >> 5]);
__syncthreads();
s_data[idx] = val;
s_data[idx+blockDim.x] = val2;
}
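/* Illustrative note (not from the CUDPP sources): with CTA_SIZE = 128 and
   WARP_SIZE = 32, scanWarps() covers 2 * 128 = 256 elements as 8 warp-sized
   chunks. The 8 warp totals land in s_data[0..7], the first warp
   exclusive-scans them, and each thread then offsets its warp-local result by
   its warp's scanned total before the final values are written back. */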
/**
* @brief CTA-level scan routine; scans s_data in shared memory in each thread block
*
* This function is the main CTA-level scan function. It may be called by other
* CUDA __global__ or __device__ functions. This function scans 2 * CTA_SIZE elements.
* Each thread is responsible for one element in each half of the input array.
* \note This code is intended to be run on a CTA of 128 threads. Other sizes are
* untested.
*
* @param[in] s_data The array to be scanned in shared memory
* @param[out] d_blockSums Array of per-block sums
* @param[in] blockSumIndex Location in \a d_blockSums to which to write this block's sum
*/
template <class T, class traits>
__device__ void scanCTA(T *s_data,
T *d_blockSums,
unsigned int blockSumIndex)
{
T val = s_data[threadIdx.x];
T val2 = s_data[threadIdx.x + blockDim.x];
__syncthreads();
scanWarps<T,traits>(val, val2, s_data);
__syncthreads();
if (traits::writeSums() && threadIdx.x == blockDim.x - 1)
{
d_blockSums[blockSumIndex] = traits::op(val2, s_data[threadIdx.x + blockDim.x]);
}
#ifdef __DEVICE_EMULATION__
// must sync in emulation mode when doing backward scans, because otherwise the
// shared memory array will get reversed before the block sums are read!
if (traits::isBackward())
__syncthreads();
#endif
}
/** @} */ // end scan functions
/** @} */ // end cudpp_cta


@@ -0,0 +1,417 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp.cpp
*
* @brief Main library source file. Implements wrappers for public
* interface.
*
* Main library source file. Implements wrappers for public
* interface. These wrappers call application-level operators.
* As this grows we may decide to partition into multiple source
* files.
*/
/**
* \defgroup publicInterface CUDPP Public Interface
* The CUDA public interface comprises the functions, structs, and enums
* defined in cudpp.h. Public interface functions call functions in the
* \link cudpp_app Application-Level\endlink interface. The public
* interface functions include Plan Interface functions and Algorithm
* Interface functions. Plan Interface functions are used for creating
* CUDPP Plan objects which contain configuration details, intermediate
* storage space, and in the case of cudppSparseMatrix(), data. The
* Algorithm Interface is the set of functions that do the real work
* of CUDPP, such as cudppScan() and cudppSparseMatrixVectorMultiply().
*
* @{
*/
/** @name Algorithm Interface
* @{
*/
#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"
//#include "cudpp_rand.h"
/**
* @brief Performs a scan operation of numElements on its input in
* GPU memory (d_in) and places the output in GPU memory
* (d_out), with the scan parameters specified in the plan pointed to by
* planHandle.
* The input to a scan operation is an input array, a binary associative
* operator (like + or max), and an identity element for that operator
* (+'s identity is 0). The output of scan is the same size as its input.
* Informally, the output at each element is the result of operator
* applied to each input that comes before it. For instance, the
* output of sum-scan at each element is the sum of all the input
* elements before that input.
*
* More formally, for associative operator
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly,
* <var>out<sub>i</sub></var> = <var>in<sub>0</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>1</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>i-1</sub></var>.
*
* CUDPP supports "exclusive" and "inclusive" scans. For the ADD operator,
* an exclusive scan computes the sum of all input elements before the
* current element, while an inclusive scan computes the sum of all input
* elements up to and including the current element.
*
* Before calling scan, create an internal plan using cudppPlan().
*
* After you are finished with the scan plan, clean up with cudppDestroyPlan().
*
* @param[in] planHandle Handle to plan for this scan
* @param[out] d_out output of scan, in GPU memory
* @param[in] d_in input to scan, in GPU memory
* @param[in] numElements number of elements to scan
*
* @see cudppPlan, cudppDestroyPlan
*/
CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements)
{
CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppScanDispatch(d_out, d_in, numElements, 1, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
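/* Illustrative host-side sketch (not from the CUDPP sources; assumes d_in and
   d_out are device arrays of numElements floats, and omits error handling):

       CUDPPConfiguration config;
       config.algorithm = CUDPP_SCAN;
       config.op        = CUDPP_ADD;
       config.datatype  = CUDPP_FLOAT;
       config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;

       CUDPPHandle scanPlan;
       if (cudppPlan(&scanPlan, config, numElements, 1, 0) == CUDPP_SUCCESS)
       {
           cudppScan(scanPlan, d_out, d_in, numElements);
           cudppDestroyPlan(scanPlan);
       }
*/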
/**
* @brief Performs a segmented scan operation of numElements on its input in
* GPU memory (d_idata) and places the output in GPU memory
* (d_out), with the scan parameters specified in the plan pointed to by
* planHandle.
* The input to a segmented scan operation is an input array of data,
* an input array of flags which demarcate segments, a binary associative
* operator (like + or max), and an identity element for that operator
* (+'s identity is 0). The array of flags is the same length as the input
* with 1 marking the first element of a segment and 0 otherwise. The
* output of segmented scan is the same size as its input. Informally, the
* output at each element is the result of operator applied to each input
* that comes before it in that segment. For instance, the output of
* segmented sum-scan at each element is the sum of all the input elements
* before that input in that segment.
*
* More formally, for associative operator
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly,
* <var>out<sub>i</sub></var> = <var>in<sub>k</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>k+1</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>i-1</sub></var>.
* <i>k</i> is the index of the first element of the segment in which <i>i</i> lies
*
* We support both "exclusive" and "inclusive" variants. For a segmented sum-scan,
* the exclusive variant computes the sum of all input elements before the
* current element in that segment, while the inclusive variant computes the
* sum of all input elements up to and including the current element, in
* that segment.
*
* Before calling segmented scan, create an internal plan using cudppPlan().
*
* After you are finished with the scan plan, clean up with cudppDestroyPlan().
* @param[in] planHandle Handle to plan for this scan
* @param[out] d_out output of segmented scan, in GPU memory
* @param[in] d_idata input data to segmented scan, in GPU memory
* @param[in] d_iflags input flags to segmented scan, in GPU memory
* @param[in] numElements number of elements to perform segmented scan on
*
* @see cudppPlan, cudppDestroyPlan
CUDPP_DLL
CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle,
void *d_out,
const void *d_idata,
const unsigned int *d_iflags,
size_t numElements)
{
CUDPPSegmentedScanPlan *plan =
(CUDPPSegmentedScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppSegmentedScanDispatch(d_out, d_idata, d_iflags, numElements, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
*/
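/* Illustrative example of the segmented scan described above (not from the
   CUDPP sources; the wrapper itself is disabled in this cudpp_mini build):

       d_idata  = [ 1  2  3  4  5  6 ]
       d_iflags = [ 1  0  0  1  0  0 ]
       inclusive segmented sum-scan -> [ 1  3  6  4  9 15 ]
       exclusive segmented sum-scan -> [ 0  1  3  0  4  9 ]
*/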
/**
* @brief Performs numRows parallel scan operations of numElements
* each on its input (d_in) and places the output in d_out,
* with the scan parameters set by config. Exactly like cudppScan
* except that it runs on multiple rows in parallel.
*
* Note that to achieve good performance with cudppMultiScan one should
* allocate the device arrays passed to it so that all rows are aligned
* to the correct boundaries for the architecture the app is running on.
* The easy way to do this is to use cudaMallocPitch() to allocate a
* 2D array on the device. Use the \a rowPitch parameter to cudppPlan()
* to specify this pitch. The easiest way is to pass the device pitch
* returned by cudaMallocPitch to cudppPlan() via \a rowPitch.
*
* @param[in] planHandle handle to CUDPPScanPlan
* @param[out] d_out output of scan, in GPU memory
* @param[in] d_in input to scan, in GPU memory
* @param[in] numElements number of elements (per row) to scan
* @param[in] numRows number of rows to scan in parallel
*
* @see cudppScan, cudppPlan
CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements,
size_t numRows)
{
CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppScanDispatch(d_out, d_in, numElements, numRows, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
*/
/**
* @brief Given an array \a d_in and an array of 1/0 flags in \a
* deviceValid, returns a compacted array in \a d_out containing only
* the "valid" values from \a d_in.
*
* Takes as input an array of elements in GPU memory
* (\a d_in) and an equal-sized unsigned int array in GPU memory
* (\a deviceValid) that indicate which of those input elements are
* valid. The output is a packed array, in GPU memory, of only those
* elements marked as valid.
*
* Internally, uses cudppScan.
*
* Example:
* \code
* d_in = [ a b c d e f ]
* deviceValid = [ 1 0 1 1 0 1 ]
* d_out = [ a c d f ]
* \endcode
*
* @todo [MJH] We need to evaluate whether cudppCompact should be a core member
* of the public interface. It's not clear to me that what the user always
* wants is a final compacted array. Often one just wants the array of indices
* to which each input element should go in the output. The split() routine used
* in radix sort might make more sense to expose.
*
* @param[in] planHandle handle to CUDPPCompactPlan
* @param[out] d_out compacted output
* @param[out] d_numValidElements set during cudppCompact to the number of
* elements of d_in whose flags in the d_isValid input array are valid (nonzero)
* @param[in] d_in input to compact
* @param[in] d_isValid which elements in d_in are valid
* @param[in] numElements number of elements in d_in
CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle planHandle,
void *d_out,
size_t *d_numValidElements,
const void *d_in,
const unsigned int *d_isValid,
size_t numElements)
{
CUDPPCompactPlan *plan = (CUDPPCompactPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppCompactDispatch(d_out, d_numValidElements, d_in, d_isValid,
numElements, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
*/
/**
* @brief Sorts key-value pairs or keys only
*
* Takes as input an array of keys in GPU memory
* (d_keys) and an optional array of corresponding values,
* and outputs sorted arrays of keys and (optionally) values in place.
* Key-value and key-only sort is selected through the configuration of
* the plan, using the options CUDPP_OPTION_KEYS_ONLY and
* CUDPP_OPTION_KEY_VALUE_PAIRS.
*
* Supported key types are CUDPP_FLOAT and CUDPP_UINT. Values can be
* any 32-bit type (internally, values are treated only as a payload
* and cast to unsigned int).
*
* @todo Determine if we need to provide an "out of place" sort interface.
*
* @param[in] planHandle handle to CUDPPSortPlan
* @param[out] d_keys keys by which key-value pairs will be sorted
* @param[in] d_values values to be sorted
* @param[in] keyBits the number of least significant bits in each element
* of d_keys to sort by
* @param[in] numElements number of elements in d_keys and d_values
*
* @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
*/
CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
void *d_keys,
void *d_values,
int keyBits,
size_t numElements)
{
CUDPPRadixSortPlan *plan = (CUDPPRadixSortPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppRadixSortDispatch(d_keys, d_values, numElements, keyBits, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
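/* Illustrative host-side sketch (not from the CUDPP sources; assumes d_keys
   and d_values are device arrays of numElements unsigned ints):

       CUDPPConfiguration config;
       config.algorithm = CUDPP_SORT_RADIX;
       config.datatype  = CUDPP_UINT;
       config.op        = CUDPP_ADD;   // operator is presumably ignored by the sort
       config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;

       CUDPPHandle sortPlan;
       if (cudppPlan(&sortPlan, config, numElements, 1, 0) == CUDPP_SUCCESS)
       {
           cudppSort(sortPlan, d_keys, d_values, 32, numElements); // sort on all 32 key bits
           cudppDestroyPlan(sortPlan);
       }
*/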
/** @brief Perform matrix-vector multiply y = A*x for arbitrary sparse matrix A and vector x
*
* Given a matrix object handle (which has been initialized using cudppSparseMatrix()),
* This function multiplies the input vector \a d_x by the matrix referred to by
* \a sparseMatrixHandle, returning the result in \a d_y.
*
* @param sparseMatrixHandle Handle to a sparse matrix object created with cudppSparseMatrix()
* @param d_y The output vector, y
* @param d_x The input vector, x
*
* @see cudppSparseMatrix, cudppDestroySparseMatrix
CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
void *d_y,
const void *d_x)
{
CUDPPSparseMatrixVectorMultiplyPlan *plan =
(CUDPPSparseMatrixVectorMultiplyPlan*)CUDPPPlanManager::GetPlan(sparseMatrixHandle);
if (plan != NULL)
{
cudppSparseMatrixVectorMultiplyDispatch(d_y, d_x, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
*/
/**
* @brief Rand puts \a numElements random 32-bit elements into \a d_out
*
* Outputs \a numElements random values to \a d_out. \a d_out must be of
* type unsigned int, allocated in device memory.
*
* The algorithm used for the random number generation is stored in \a planHandle.
* Depending on the specification of the pseudo-random number generator (PRNG),
* the generator may have one or more seeds. To set the seed, use cudppRandSeed().
*
* @todo Currently only MD5 PRNG is supported. We may provide more rand routines in
* the future.
*
* @param[in] planHandle Handle to plan for rand
* @param[in] numElements number of elements in d_out.
* @param[out] d_out output of rand, in GPU memory. Should be an array of unsigned integers.
*
* @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle,void * d_out, size_t numElements)
{
CUDPPRandPlan * plan = (CUDPPRandPlan *) CUDPPPlanManager::GetPlan(planHandle);
if(plan != NULL)
{
//dispatch the rand algorithm here
cudppRandDispatch(d_out, numElements, plan);
return CUDPP_SUCCESS;
}
else
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
*/
/**@brief Sets the seed used for rand
*
* The seed is crucial to any random number generator as it allows a
* sequence of random numbers to be replicated. Since there may be
* multiple different rand algorithms in CUDPP, cudppRandSeed
* uses \a planHandle to determine which seed to set. Each rand
* algorithm has its own unique set of seeds depending on what
* the algorithm needs.
*
* @param[in] planHandle the handle to the plan which specifies which rand seed to set
* @param[in] seed the value which the internal cudpp seed will be set to
CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed)
{
CUDPPRandPlan * plan = (CUDPPRandPlan *) CUDPPPlanManager::GetPlan(planHandle);
//switch on the plan to figure out which seed to update
switch(plan->m_config.algorithm)
{
case CUDPP_RAND_MD5:
plan->m_seed = seed;
break;
default:
break;
}
return CUDPP_SUCCESS;
}//end cudppRandSeed
*/
/** @} */ // end Algorithm Interface
/** @} */ // end of publicInterface group
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:

525
lib/gpu/cudpp_mini/cudpp.h Normal file

@@ -0,0 +1,525 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp.h
*
* @brief Main library header file. Defines public interface.
*
* The CUDPP public interface is a C-only interface to enable
* linking with code written in other languages (e.g. C, C++,
* and Fortran). While the internals of CUDPP are not limited
* to C (C++ features are used), the public interface is
* entirely C (thus it is declared "extern C").
*/
/**
* \mainpage
*
* \section introduction Introduction
*
* CUDPP is the CUDA Data Parallel Primitives Library. CUDPP is a
* library of data-parallel algorithm primitives such as
* parallel-prefix-sum ("scan"), parallel sort and parallel reduction.
* Primitives such as these are important building blocks for a wide
* variety of data-parallel algorithms, including sorting, stream
* compaction, and building data structures such as trees and
* summed-area tables.
*
* \section overview Overview Presentation
*
* A brief set of slides that describe the features, design principles,
* applications and impact of CUDPP is available here:
* <a href="http://cudpp.googlecode.com/svn/trunk/cudpp/doc/CUDPP_slides.pdf">CUDPP Presentation</a>.
*
* \section homepage Homepage
* Homepage for CUDPP: http://code.google.com/p/cudpp
*
* Announcements and discussion of CUDPP are hosted on the
* <a href="http://groups.google.com/group/cudpp?hl=en">CUDPP Google Group</a>.
*
* \section getting-started Getting Started with CUDPP
*
* You may want to start by browsing the \link publicInterface CUDPP Public
* Interface\endlink. For information on building CUDPP, see
* \ref building-cudpp "Building CUDPP".
*
* The "apps" subdirectory included with CUDPP has a few source code samples
* that use CUDPP:
* - \ref example_simpleCUDPP "simpleCUDPP", a simple example of using
* cudppScan()
* - satGL, an example of using cudppMultiScan() to generate a summed-area
* table (SAT) of a scene rendered in real time. The SAT is then used to simulate
* depth of field blur.
* - cudpp_testrig, a comprehensive test application for all the functionality
* of CUDPP
*
* We have also provided a code walkthrough of the
* \ref example_simpleCUDPP "simpleCUDPP" example.
*
* \section getting-help Getting Help and Reporting Problems
*
* To get help using CUDPP, please use the
* <a href="http://groups.google.com/group/cudpp?hl=en">CUDPP Google Group</a>.
*
* To report CUDPP bugs or request features, you may use either the above
* CUDPP Google Group, or you can file an issue directly using
* <a href="http://code.google.com/p/cudpp/issues/list">Google Code</a>.
*
* \section release-notes Release Notes
*
* For specific release details see the \ref changelog "Change Log".
*
* This release (1.1.1) is a bugfix release to CUDPP 1.1 that includes
* fixes to support CUDA 3.0 and the new NVIDIA Fermi architecture,
* including GeForce 400 series and Tesla 20 series GPUs. It also has
* bug fixes for 64-bit OSes.
*
* \section opSys Operating System Support
*
* This release (1.1.1) has been thoroughly tested on the following OSes.
* - Windows XP (32-bit) (CUDA 2.2, 3.0)
* - Windows 7 (64-bit) (CUDA 3.0)
* - Redhat Enterprise Linux 5 (64-bit) (CUDA 3.0)
* - and Mac OS X 10.6 (Snow Leopard, 64-bit) (CUDA 3.0)
*
* We expect CUDPP to build and run correctly on other flavors of Linux
* and Windows, but these are not actively tested by the developers at
* this time.
*
* Notes: CUDPP is not compatible with CUDA 2.1. A compiler bug in 2.1
* causes the compiler to crash. Also, starting with CUDPP 1.1.1, we are
* no longer testing CUDA device emulation, because it is deprecated in
* CUDA 3.0 and will be removed from future CUDA versions.
*
* \section cuda CUDA
* CUDPP is implemented in
* <a href="http://developer.nvidia.com/cuda">CUDA C/C++</a>. It requires the
* CUDA Toolkit version 2.2 or later. Please see the NVIDIA
* <a href="http://developer.nvidia.com/cuda">CUDA</a> homepage to download
* CUDA as well as the CUDA Programming Guide and CUDA SDK, which includes many
* CUDA code examples. Some of the samples in the CUDA SDK (including
* "marchingCubes", "lineOfSight", and radixSort) also use CUDPP.
*
* \section design-goals Design Goals
* Design goals for CUDPP include:
*
* - Performance. We aim to provide best-of-class performance for our
* primitives. We welcome suggestions and contributions that will improve
* CUDPP performance. We also want to provide primitives that can be easily
* benchmarked, and compared against other implementations on GPUs and other
* processors.
* - Modularity. We want our primitives to be easily included in other
* applications. To that end we have made the following design decisions:
* - CUDPP is provided as a library that can link against other applications.
* - CUDPP calls run on the GPU on GPU data. Thus they can be used
* as standalone calls on the GPU (on GPU data initialized by the
* calling application) and, more importantly, as GPU components in larger
* CPU/GPU applications.
* - CUDPP is implemented as 4 layers:
* -# The \link publicInterface Public Interface\endlink is the external
* library interface, which is the intended entry point for most
* applications. The public interface calls into the
* \link cudpp_app Application-Level API\endlink.
* -# The \link cudpp_app Application-Level API\endlink comprises functions
* callable from CPU code. These functions execute code jointly on the
* CPU (host) and the GPU by calling into the
* \link cudpp_kernel Kernel-Level API\endlink below them.
* -# The \link cudpp_kernel Kernel-Level API\endlink comprises functions
* that run entirely on the GPU across an entire grid of thread blocks.
* These functions may call into the \link cudpp_cta CTA-Level API\endlink
* below them.
* -# The \link cudpp_cta CTA-Level API\endlink comprises functions that run
* entirely on the GPU within a single Cooperative Thread Array (CTA,
* aka thread block). These are low-level functions that implement core
* data-parallel algorithms, typically by processing data within shared
* (CUDA \c __shared__) memory.
*
* Programmers may use any of the lower three CUDPP layers in their own
* programs by building the source directly into their application. However,
* the typical usage of CUDPP is to link to the library and invoke functions in
* the CUDPP \link publicInterface Public Interface\endlink, as in the
* \ref example_simpleCUDPP "simpleCUDPP", satGL, and cudpp_testrig application
* examples included in the CUDPP distribution.
*
* In the future, if and when CUDA supports building device-level libraries, we
* hope to enhance CUDPP to ease the use of CUDPP internal algorithms at all
* levels.
*
* \subsection uses Use Cases
* We expect the normal use of CUDPP will be in one of two ways:
* -# Linking the CUDPP library against another application.
* -# Running our "test" application, cudpp_testrig, that exercises
* CUDPP functionality.
*
* \section references References
* The following publications describe work incorporated in CUDPP.
*
* - Mark Harris, Shubhabrata Sengupta, and John D. Owens. "Parallel Prefix Sum (Scan) with CUDA". In Hubert Nguyen, editor, <i>GPU Gems 3</i>, chapter 39, pages 851&ndash;876. Addison Wesley, August 2007. http://graphics.idav.ucdavis.edu/publications/print_pub?pub_id=916
* - Shubhabrata Sengupta, Mark Harris, Yao Zhang, and John D. Owens. "Scan Primitives for GPU Computing". In <i>Graphics Hardware 2007</i>, pages 97&ndash;106, August 2007. http://graphics.idav.ucdavis.edu/publications/print_pub?pub_id=915
* - Shubhabrata Sengupta, Mark Harris, and Michael Garland. "Efficient parallel scan algorithms for GPUs". NVIDIA Technical Report NVR-2008-003, December 2008. http://mgarland.org/papers.html#segscan-tr
* - Nadathur Satish, Mark Harris, and Michael Garland. "Designing Efficient Sorting Algorithms for Manycore GPUs". In <i>Proceedings of the 23rd IEEE International Parallel & Distributed Processing Symposium</i>, May 2009. http://mgarland.org/papers.html#gpusort
* - Stanley Tzeng, Li-Yi Wei. "Parallel White Noise Generation on a GPU via Cryptographic Hash". In <i>Proceedings of the 2008 Symposium on Interactive 3D Graphics and Games</i>, pages 79&ndash;87, February 2008. http://research.microsoft.com/apps/pubs/default.aspx?id=70502
*
* Many researchers are using CUDPP in their work, and there are many publications
* that have used it \ref cudpp_refs "(references)". If your work uses CUDPP, please
* let us know by sending us a reference (preferably in BibTeX format) to your work.
*
* \section citing Citing CUDPP
*
* If you make use of CUDPP primitives in your work and want to cite
* CUDPP (thanks!), we would prefer for you to cite the appropriate
* papers above, since they form the core of CUDPP. To be more specific,
* the GPU Gems paper describes (unsegmented) scan, multi-scan for
* summed-area tables, and stream compaction. The NVIDIA technical report
* describes the current scan and segmented scan algorithms used in the
* library, and the Graphics Hardware paper describes an earlier
* implementation of segmented scan, quicksort, and sparse matrix-vector
* multiply. The IPDPS paper describes the radix sort used in CUDPP, and
* the I3D paper describes the random number generation algorithm.
*
* \section credits Credits
* \subsection developers CUDPP Developers
* - <a href="http://www.markmark.net">Mark Harris</a>, NVIDIA Corporation
* - <a href="http://www.ece.ucdavis.edu/~jowens/">John D. Owens</a>, University of California, Davis
* - <a href="http://graphics.cs.ucdavis.edu/~shubho/">Shubho Sengupta</a>, University of California, Davis
* - Stanley Tzeng, University of California, Davis
* - <a href="http://www.ece.ucdavis.edu/~yaozhang/">Yao Zhang</a>, University of California, Davis
* - <a href="http://www.ece.ucdavis.edu/~aaldavid/">Andrew Davidson</a>, University of California, Davis (formerly Louisiana State University)
*
* \subsection contributors Other CUDPP Contributors
* - <a href="http://www.eecs.berkeley.edu/~nrsatish/">Nadatur Satish</a>, University of California, Berkeley
*
* \subsection acknowledgments Acknowledgments
*
* Thanks to Jim Ahrens, Timo Aila, Nathan Bell, Ian Buck, Guy Blelloch,
* Jeff Bolz, Michael Garland, Jeff Inman, Eric Lengyel, Samuli Laine,
* David Luebke, Pat McCormick, and Richard Vuduc for their contributions
* during the development of this library.
*
* CUDPP Developers from UC Davis thank their funding agencies:
* - Department of Energy Early Career Principal Investigator Award
* DE-FG02-04ER25609
* - SciDAC Institute for Ultrascale Visualization (http://www.iusv.org/)
* - Los Alamos National Laboratory
* - National Science Foundation (grant 0541448)
* - Generous hardware donations from NVIDIA
*
* \section license-overview CUDPP Copyright and Software License
* CUDPP is copyright The Regents of the University of California, Davis campus
* and NVIDIA Corporation. The library, examples, and all source code are
* released under the BSD license, designed to encourage reuse of this software
* in other projects, both commercial and non-commercial. For details, please
* see the \ref license page.
*
* Note that prior to release 1.1 of CUDPP, the license used was a modified
* BSD license. With release 1.1, this license was replaced with the pure BSD
* license to facilitate the use of open source hosting of the code.
*/
/**
* @page license CUDPP License
*
* \section licenseBSD CUDPP License
*
* CUDPP is released under the
* <a href="http://www.opensource.org/licenses/bsd-license.php">BSD license</a>.
*
* @include license.txt
*
*/
/**
* @page changelog CUDPP Change Log
*
* @include changelog.txt
*/
/**
* @page cudpp_refs Publications that use CUDPP
*
* @htmlinclude doc/bib/cudpp_refs.html
*/
/**
* @page cudpp_refs_bib Bibliography for publications that use CUDPP
*
* @htmlinclude doc/bib/cudpp_refs_bib.html
*/
/**
* @page building-cudpp Building CUDPP
*
* CUDPP has currently been tested in Windows XP, Windows Vista, Mac OS X
* and Linux. See \ref release-notes for release specific platform support.
*
* \section build-win32 Building CUDPP on Windows XP
*
* CUDPP can be built using either MSVC 8 (2005) or MSVC 9 (2008). To
* build, open cudpp/cudpp.sln. Then you can build the library
* using the "build" command as you would with any other workspace. There are
* four configurations: debug, release, emudebug, and emurelease. The first
* two are self-explanatory. The second two are built to use CUDA device
* emulation, meaning they will be run (slowly) on the CPU.
*
* \section build-linux Building CUDPP on Linux and Mac OS X
*
* CUDPP can be built using standard g++ and Make tools on Linux, by typing
* "make" in the "cudpp/" subdirectory. Before building CUDPP, you should
* first build the CUDA Utility Library (libcutil) by typing "make; make dbg=1"
* in the "common/" subdirectory. This will generate libcutil.a and
* libcutilD.a.
*
* The makefile for CUDPP and all sample applications take the optional
* arguments "emu=1" and "dbg=1". The former builds CUDPP for device emulation,
* and the latter for debugging. The two flags can be combined. "verbose=1"
* can be used to see all compiler output.
*
* \section build-apps Building CUDPP Sample Applications
*
* The sample applications in the "apps/" subdirectory can be built exactly
* as CUDPP is: either by opening the appropriate .sln/.vcproj file in MSVC
* on Windows, or by running "make" on Linux.
*
* On some Linux installations you will get linker errors relating to "-lXi"
* and "-lXmu". To fix this, you will need to install libXi and libXmu. On
* Debian and Ubuntu, for example, you can simply run
* "sudo apt-get install libxi-dev", and
* "sudo apt-get install libxmu-dev"
*
*/
#ifndef __CUDPP_H__
#define __CUDPP_H__
#include <stdlib.h> // for size_t
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief CUDPP Result codes returned by CUDPP API functions.
*/
enum CUDPPResult
{
CUDPP_SUCCESS = 0, /**< No error. */
CUDPP_ERROR_INVALID_HANDLE, /**< Specified handle (for example,
to a plan) is invalid. **/
CUDPP_ERROR_ILLEGAL_CONFIGURATION, /**< Specified configuration is
illegal. For example, an
invalid or illogical
combination of options. */
CUDPP_ERROR_UNKNOWN = 9999 /**< Unknown or untraceable error. */
};
/**
* @brief Options for configuring CUDPP algorithms.
*
* @see CUDPPConfiguration, cudppPlan, CUDPPAlgorithm
*/
enum CUDPPOption
{
CUDPP_OPTION_FORWARD = 0x1, /**< Algorithms operate forward:
* from start to end of input
* array */
CUDPP_OPTION_BACKWARD = 0x2, /**< Algorithms operate backward:
* from end to start of array */
CUDPP_OPTION_EXCLUSIVE = 0x4, /**< Exclusive (for scans) - scan
* includes all elements up to (but
* not including) the current
* element */
CUDPP_OPTION_INCLUSIVE = 0x8, /**< Inclusive (for scans) - scan
* includes all elements up to and
* including the current element */
CUDPP_OPTION_CTA_LOCAL = 0x10, /**< Algorithm performed only on
* the CTAs (blocks) with no
* communication between blocks.
* @todo Currently ignored. */
CUDPP_OPTION_KEYS_ONLY = 0x20, /**< No associated value to a key
* (for global radix sort) */
CUDPP_OPTION_KEY_VALUE_PAIRS = 0x40, /**< Each key has an associated value */
};
/**
* @brief Datatypes supported by CUDPP algorithms.
*
* @see CUDPPConfiguration, cudppPlan
*/
enum CUDPPDatatype
{
CUDPP_CHAR, //!< Character type (C char)
CUDPP_UCHAR, //!< Unsigned character (byte) type (C unsigned char)
CUDPP_INT, //!< Integer type (C int)
CUDPP_UINT, //!< Unsigned integer type (C unsigned int)
CUDPP_FLOAT //!< Float type (C float)
};
/**
* @brief Operators supported by CUDPP algorithms (currently scan and
* segmented scan).
*
* These are all binary associative operators.
*
* @see CUDPPConfiguration, cudppPlan
*/
enum CUDPPOperator
{
CUDPP_ADD, //!< Addition of two operands
CUDPP_MULTIPLY, //!< Multiplication of two operands
CUDPP_MIN, //!< Minimum of two operands
CUDPP_MAX //!< Maximum of two operands
};
/**
* @brief Algorithms supported by CUDPP. Used to create appropriate plans using
* cudppPlan.
*
* @see CUDPPConfiguration, cudppPlan
*/
enum CUDPPAlgorithm
{
CUDPP_SCAN, //!< Scan or prefix-sum
CUDPP_SEGMENTED_SCAN, //!< Segmented scan
CUDPP_COMPACT, //!< Stream compact
CUDPP_REDUCE, //!< Parallel reduction (NOTE: currently unimplemented)
CUDPP_SORT_RADIX, //!< Radix sort
CUDPP_SPMVMULT, //!< Sparse matrix-dense vector multiplication
CUDPP_RAND_MD5, //!< PseudoRandom Number Generator using MD5 hash algorithm
CUDPP_ALGORITHM_INVALID, //!< Placeholder at end of enum
};
/**
* @brief Configuration struct used to specify algorithm, datatype,
* operator, and options when creating a plan for CUDPP algorithms.
*
* @see cudppPlan
*/
struct CUDPPConfiguration
{
CUDPPAlgorithm algorithm; //!< The algorithm to be used
CUDPPOperator op; //!< The numerical operator to be applied
CUDPPDatatype datatype; //!< The datatype of the input arrays
unsigned int options; //!< Options to configure the algorithm
};
#define CUDPP_INVALID_HANDLE 0xC0DABAD1
typedef size_t CUDPPHandle;
/* To use CUDPP as a static library, #define CUDPP_STATIC_LIB before
* including cudpp.h
*/
#define CUDPP_STATIC_LIB
#ifndef CUDPP_DLL
#ifdef _WIN32
#ifdef CUDPP_STATIC_LIB
#define CUDPP_DLL
#else
#ifdef BUILD_DLL
#define CUDPP_DLL __declspec(dllexport)
#else
#define CUDPP_DLL __declspec(dllimport)
#endif
#endif
#else
#define CUDPP_DLL
#endif
#endif
// Plan allocation (for scan, sort, and compact)
CUDPP_DLL
CUDPPResult cudppPlan(CUDPPHandle *planHandle,
CUDPPConfiguration config,
size_t n,
size_t rows,
size_t rowPitch);
CUDPP_DLL
CUDPPResult cudppDestroyPlan(CUDPPHandle plan);
// Scan and sort algorithms
CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements);
CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements,
size_t numRows);
CUDPP_DLL
CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle,
void *d_out,
const void *d_idata,
const unsigned int *d_iflags,
size_t numElements);
CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle planHandle,
void *d_out,
size_t *d_numValidElements,
const void *d_in,
const unsigned int *d_isValid,
size_t numElements);
CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
void *d_keys,
void *d_values,
int keybits,
size_t numElements);
// Sparse matrix allocation
CUDPP_DLL
CUDPPResult cudppSparseMatrix(CUDPPHandle *sparseMatrixHandle,
CUDPPConfiguration config,
size_t n,
size_t rows,
const void *A,
const unsigned int *h_rowIndices,
const unsigned int *h_indices);
CUDPP_DLL
CUDPPResult cudppDestroySparseMatrix(CUDPPHandle sparseMatrixHandle);
// Sparse matrix-vector algorithms
CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
void *d_y,
const void *d_x);
// random number generation algorithms
CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle,void * d_out, size_t numElements);
CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed);
#ifdef __cplusplus
}
#endif
#endif
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:


@@ -0,0 +1,66 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp_globals.h
*
* @brief Global declarations defining machine characteristics of GPU target
* These are currently set for best performance on G8X GPUs. The optimal
* parameters may change on future GPUs. In the future, we hope to make
* CUDPP a self-tuning library.
*/
#ifndef __CUDPP_GLOBALS_H__
#define __CUDPP_GLOBALS_H__
const int NUM_BANKS = 16; /**< Number of shared memory banks */
const int LOG_NUM_BANKS = 4; /**< log_2(NUM_BANKS) */
const int CTA_SIZE = 128; /**< Number of threads in a CTA */
const int WARP_SIZE = 32; /**< Number of threads in a warp */
const int LOG_CTA_SIZE = 7; /**< log_2(CTA_SIZE) */
const int LOG_WARP_SIZE = 5; /**< log_2(WARP_SIZE) */
const int LOG_SIZEOF_FLOAT = 2; /**< log_2(sizeof(float)) */
const int SCAN_ELTS_PER_THREAD = 8; /**< Number of elements per scan thread */
const int SEGSCAN_ELTS_PER_THREAD = 8; /**< Number of elements per segmented scan thread */
const int maxSharedMemoryPerBlock = 16384; /**< Number of bytes of shared
memory in each block */
const int maxThreadsPerBlock = CTA_SIZE; /**< Maximum number of
* threads in a CTA */
/**
* @brief Macro to insert necessary __syncthreads() in device emulation mode
*/
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif
#define AVOID_BANK_CONFLICTS /**< Set if by default, we want our
* shared memory allocation to perform
* additional computation to avoid bank
* conflicts */
#ifdef AVOID_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#else
#define CONFLICT_FREE_OFFSET(index) (0)
#endif
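/* Illustrative example (not from the CUDPP sources): with NUM_BANKS = 16,
   CONFLICT_FREE_OFFSET pads a shared memory index by index >> 4, so indices
   16..31 become 17..32 once the offset is added; threads whose raw indices
   would land in the same bank are thereby spread across different banks. */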
#endif // __CUDPP_GLOBALS_H__
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:


@@ -0,0 +1,94 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_maximal_launch.h"
inline size_t min(size_t x, size_t y)
{
return (x <= y) ? x : y;
}
inline size_t max(size_t x, size_t y)
{
return (x >= y) ? x : y;
}
// computes the number of f-sized groups needed to cover x, i.e. ceil(x / f)
inline size_t multiple(size_t x, size_t f)
{
return ((x + (f-1)) / f);
}
// MS Excel-style CEIL() function
// Rounds x up to nearest multiple of f
inline size_t ceiling(size_t x, size_t f)
{
return multiple(x, f) * f;
}
extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs,
cudaDeviceProp &devprop,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock)
{
// Determine the maximum number of CTAs that can be run simultaneously for each kernel
// This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
const unsigned int warpAllocationMultiple = 2;
const unsigned int smemAllocationUnit = 512; // in bytes
const unsigned int maxThreadsPerSM = (devprop.major < 2 && devprop.minor < 2) ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
const unsigned int maxBlocksPerSM = 8;
// Number of warps (round up to nearest whole multiple of warp size)
size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
// Round up to warp allocation multiple
numWarps = ceiling(numWarps, warpAllocationMultiple);
// Number of regs is regs per thread times number of warps times warp size
size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
// Round up to multiple of register allocation unit size
regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);
size_t smemBytes = attribs.sharedSizeBytes + bytesDynamicSharedMem;
size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);
size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;
return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
}
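// Worked example with illustrative (hypothetical) numbers: a kernel using 16
// registers per thread and 2048 bytes of static shared memory, launched with
// 128 threads per block on a compute capability 1.0 device (warpSize 32,
// 8192 registers and 16384 bytes of shared memory per SM, 768 threads per SM):
//   numWarps   = ceil(128/32) = 4, rounded up to a multiple of 2      -> 4
//   regsPerCTA = 16 * 32 * 4 = 2048, rounded up to a multiple of 256  -> 2048
//   smemPerCTA = 2048, rounded up to a multiple of 512                -> 2048
//   limits: regs 8192/2048 = 4, smem 16384/2048 = 8, threads 768/128 = 6, cap 8
//   => min = 4 blocks per SM, multiplied by multiProcessorCount.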
extern "C"
size_t maxBlocksFromPointer(void* kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock)
{
cudaDeviceProp devprop;
int deviceID = -1;
cudaError_t err = cudaGetDevice(&deviceID);
if (err == cudaSuccess)
{
err = cudaGetDeviceProperties(&devprop, deviceID);
if (err != cudaSuccess)
return -1;
cudaFuncAttributes attr;
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
if (err != cudaSuccess)
return -1;
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
}
return -1;
}

View File

@@ -0,0 +1,37 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef _MAXIMAL_LAUNCH_H_
#define _MAXIMAL_LAUNCH_H_
#include "cuda_runtime.h"
extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs,
cudaDeviceProp &devprop,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock);
extern "C"
size_t maxBlocksFromPointer(void* kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock);
#ifdef __cplusplus
template <typename T>
size_t maxBlocks(T kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock)
{
return maxBlocksFromPointer((void*)kernel, bytesDynamicSharedMem, threadsPerBlock);
}
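// Illustrative usage sketch (hypothetical kernel name): query how many thread
// blocks of a kernel can be resident at once, e.g. for 128-thread blocks with
// no dynamic shared memory:
//
//   size_t nBlocks = maxBlocks(myScanKernel, 0, 128);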
#endif
#endif // _MAXIMAL_LAUNCH_H_

View File

@@ -0,0 +1,459 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date: 2007-11-19 13:58:06 +0000 (Mon, 19 Nov 2007) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"
#include <assert.h>
CUDPPPlanManager* CUDPPPlanManager::m_instance = NULL;
CUDPPResult validateOptions(CUDPPConfiguration config, size_t /*numElements*/, size_t numRows, size_t /*rowPitch*/)
{
CUDPPResult ret = CUDPP_SUCCESS;
if ((config.options & CUDPP_OPTION_BACKWARD) && (config.options & CUDPP_OPTION_FORWARD))
ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION;
if ((config.options & CUDPP_OPTION_EXCLUSIVE) && (config.options & CUDPP_OPTION_INCLUSIVE))
ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION;
if (config.algorithm == CUDPP_COMPACT && numRows > 1)
ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION; //!< @todo: add support for multi-row cudppCompact
return ret;
}
/** @addtogroup publicInterface
* @{
*/
/** @name Plan Interface
* @{
*/
/** @brief Create a CUDPP plan
*
* A plan is a data structure containing state and intermediate storage space
* that CUDPP uses to execute algorithms on data. A plan is created by
* passing to cudppPlan() a CUDPPConfiguration that specifies the algorithm,
* operator, datatype, and options. The size of the data must also be passed
* to cudppPlan(), in the \a numElements, \a numRows, and \a rowPitch
* arguments. These sizes are used to allocate internal storage space at the
* time the plan is created. The CUDPP planner may use the sizes, options,
* and information about the present hardware to choose optimal settings.
*
* Note that \a numElements is the maximum size of the array to be processed
* with this plan. That means that a plan may be re-used to process (for
* example, to sort or scan) smaller arrays.
*
* @param[out] planHandle A pointer to an opaque handle to the internal plan
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numElements The maximum number of elements to be processed
* @param[in] numRows The number of rows (for 2D operations) to be processed
* @param[in] rowPitch The pitch of the rows of input data, in elements
*/
CUDPP_DLL
CUDPPResult cudppPlan(CUDPPHandle *planHandle,
CUDPPConfiguration config,
size_t numElements,
size_t numRows,
size_t rowPitch)
{
CUDPPResult result = CUDPP_SUCCESS;
CUDPPPlan *plan;
result = validateOptions(config, numElements, numRows, rowPitch);
if (result != CUDPP_SUCCESS)
{
*planHandle = CUDPP_INVALID_HANDLE;
return result;
}
switch (config.algorithm)
{
case CUDPP_SCAN:
{
plan = new CUDPPScanPlan(config, numElements, numRows, rowPitch);
break;
}
// case CUDPP_COMPACT:
// {
// plan = new CUDPPCompactPlan(config, numElements, numRows, rowPitch);
// break;
// }
case CUDPP_SORT_RADIX:
//case CUDPP_SORT_RADIX_GLOBAL:
{
plan = new CUDPPRadixSortPlan(config, numElements);
break;
}
/* case CUDPP_SEGMENTED_SCAN:
{
plan = new CUDPPSegmentedScanPlan(config, numElements);
break;
}
//new rand plan
case CUDPP_RAND_MD5:
{
plan = new CUDPPRandPlan(config, numElements);
break;
}
case CUDPP_REDUCE:*/
default:
//! @todo: implement cudppReduce()
return CUDPP_ERROR_ILLEGAL_CONFIGURATION;
break;
}
*planHandle = CUDPPPlanManager::AddPlan(plan);
if (CUDPP_INVALID_HANDLE == *planHandle)
return CUDPP_ERROR_UNKNOWN;
else
return CUDPP_SUCCESS;
}
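/* Minimal usage sketch (illustrative only, not part of this file): create and
 * destroy a plan for forward exclusive scans of unsigned ints on up to
 * 1,048,576 elements. The initializer follows the field order used by the
 * aggregate initializers elsewhere in this file; error handling is reduced to
 * a single check.
 *
 *   #include "cudpp.h"
 *
 *   CUDPPConfiguration config = { CUDPP_SCAN, CUDPP_ADD, CUDPP_UINT,
 *                                 CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE };
 *   CUDPPHandle scanPlan;
 *   if (cudppPlan(&scanPlan, config, 1048576, 1, 0) == CUDPP_SUCCESS)
 *   {
 *       // ... execute scans of up to 1,048,576 elements using scanPlan ...
 *       cudppDestroyPlan(scanPlan);
 *   }
 */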
/** @brief Destroy a CUDPP Plan
*
* Deletes the plan referred to by \a planHandle and all associated internal
* storage.
*
* @param[in] planHandle The CUDPPHandle to the plan to be destroyed
*/
CUDPP_DLL
CUDPPResult cudppDestroyPlan(CUDPPHandle planHandle)
{
if (CUDPPPlanManager::RemovePlan(planHandle) == false)
return CUDPP_ERROR_INVALID_HANDLE;
else
return CUDPP_SUCCESS;
}
/** @brief Create a CUDPP Sparse Matrix Object
*
* The sparse matrix plan is a data structure containing state and intermediate storage space
* that CUDPP uses to perform sparse matrix dense vector multiply. This plan is created by
* passing to CUDPPSparseMatrixVectorMultiplyPlan() a CUDPPConfiguration that specifies the
* algorithm (sparse matrix-dense vector multiply) and datatype, along with the sparse matrix
* itself in CSR format. The number of non-zero elements in the sparse matrix must also be passed
* as \a numNonZeroElements. This is used to allocate internal storage space at the time the
* sparse matrix plan is created.
*
* @param[out] sparseMatrixHandle A pointer to an opaque handle to the sparse matrix object
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numNonZeroElements The number of non zero elements in the sparse matrix
* @param[in] numRows This is the number of rows in y, x and A for y = A * x
* @param[in] A The matrix data
* @param[in] h_rowIndices An array containing the index of the start of each row in \a A
* @param[in] h_indices An array containing the index of each nonzero element in \a A
CUDPP_DLL
CUDPPResult cudppSparseMatrix(CUDPPHandle *sparseMatrixHandle,
CUDPPConfiguration config,
size_t numNonZeroElements,
size_t numRows,
const void *A,
const unsigned int *h_rowIndices,
const unsigned int *h_indices)
{
CUDPPResult result = CUDPP_SUCCESS;
CUDPPPlan *sparseMatrix;
if ((config.algorithm != CUDPP_SPMVMULT) ||
(numNonZeroElements <= 0) || (numRows <= 0))
{
result = CUDPP_ERROR_ILLEGAL_CONFIGURATION;
}
if (result != CUDPP_SUCCESS)
{
*sparseMatrixHandle = CUDPP_INVALID_HANDLE;
return result;
}
sparseMatrix =
new CUDPPSparseMatrixVectorMultiplyPlan(config, numNonZeroElements, A,
h_rowIndices, h_indices, numRows);
*sparseMatrixHandle = CUDPPPlanManager::AddPlan(sparseMatrix);
if (CUDPP_INVALID_HANDLE == *sparseMatrixHandle)
return CUDPP_ERROR_UNKNOWN;
else
return CUDPP_SUCCESS;
}
*/
/** @brief Destroy a CUDPP Sparse Matrix Object
*
* Deletes the sparse matrix data and plan referred to by \a sparseMatrixHandle
* and all associated internal storage.
*
* @param[in] sparseMatrixHandle The CUDPPHandle to the matrix object to be destroyed
CUDPP_DLL
CUDPPResult cudppDestroySparseMatrix(CUDPPHandle sparseMatrixHandle)
{
return cudppDestroyPlan(sparseMatrixHandle);
}
*/
/** @} */ // end Plan Interface
/** @} */ // end publicInterface
/** @brief Plan base class constructor
*
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numElements The maximum number of elements to be processed
* @param[in] numRows The number of rows (for 2D operations) to be processed
* @param[in] rowPitch The pitch of the rows of input data, in elements
*/
CUDPPPlan::CUDPPPlan(CUDPPConfiguration config,
size_t numElements,
size_t numRows,
size_t rowPitch)
: m_config(config),
m_numElements(numElements),
m_numRows(numRows),
m_rowPitch(rowPitch)
{
}
/** @brief Scan Plan constructor
*
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numElements The maximum number of elements to be scanned
* @param[in] numRows The maximum number of rows (for 2D operations) to be scanned
* @param[in] rowPitch The pitch of the rows of input data, in elements
*/
CUDPPScanPlan::CUDPPScanPlan(CUDPPConfiguration config,
size_t numElements,
size_t numRows,
size_t rowPitch)
: CUDPPPlan(config, numElements, numRows, rowPitch),
m_blockSums(0),
m_rowPitches(0),
m_numEltsAllocated(0),
m_numRowsAllocated(0),
m_numLevelsAllocated(0)
{
allocScanStorage(this);
}
/** @brief CUDPP scan plan destructor */
CUDPPScanPlan::~CUDPPScanPlan()
{
freeScanStorage(this);
}
/** @brief SegmentedScan Plan constructor
*
* @param[in] config The configuration struct specifying options
* @param[in] numElements The maximum number of elements to be scanned
CUDPPSegmentedScanPlan::CUDPPSegmentedScanPlan(CUDPPConfiguration config,
size_t numElements)
: CUDPPPlan(config, numElements, 1, 0),
m_blockSums(0),
m_blockFlags(0),
m_blockIndices(0),
m_numEltsAllocated(0),
m_numLevelsAllocated(0)
{
allocSegmentedScanStorage(this);
}
*/
/** @brief SegmentedScan plan destructor
CUDPPSegmentedScanPlan::~CUDPPSegmentedScanPlan()
{
freeSegmentedScanStorage(this);
}
*/
/** @brief Compact Plan constructor
*
* @param[in] config The configuration struct specifying options
* @param[in] numElements The maximum number of elements to be compacted
* @param[in] numRows The number of rows (for 2D operations) to be compacted
* @param[in] rowPitch The pitch of the rows of input data, in elements
CUDPPCompactPlan::CUDPPCompactPlan(CUDPPConfiguration config,
size_t numElements,
size_t numRows,
size_t rowPitch)
: CUDPPPlan(config, numElements, numRows, rowPitch),
m_d_outputIndices(0)
{
assert(numRows == 1); //!< @todo Add support for multirow compaction
CUDPPConfiguration scanConfig =
{
CUDPP_SCAN,
CUDPP_ADD,
CUDPP_UINT,
(config.options & CUDPP_OPTION_BACKWARD) ?
CUDPP_OPTION_BACKWARD | CUDPP_OPTION_EXCLUSIVE :
CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE
};
m_scanPlan = new CUDPPScanPlan(scanConfig, numElements, numRows, rowPitch);
allocCompactStorage(this);
}
*/
/** @brief Compact plan destructor
CUDPPCompactPlan::~CUDPPCompactPlan()
{
delete m_scanPlan;
freeCompactStorage(this);
}
*/
/** @brief Sort Plan constructor
*
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numElements The maximum number of elements to be sorted
*/
/*CUDPPSortPlan::CUDPPSortPlan(CUDPPConfiguration config, size_t numElements)
: CUDPPPlan(config, numElements, 1, 0),
m_scanPlan(0),
m_d_temp(0),
m_d_tempAddress(0)
{
CUDPPConfiguration scanConfig =
{
CUDPP_SCAN,
CUDPP_ADD,
CUDPP_UINT,
CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE
};
//if (config.algorithm == CUDPP_SORT_RADIX_GLOBAL)
{
m_scanPlan = new CUDPPScanPlan(scanConfig, numElements, 1, 0);
}
allocSortStorage(this);
}*/
/** @brief Sort plan destructor */
/*CUDPPSortPlan::~CUDPPSortPlan()
{
delete m_scanPlan;
freeSortStorage(this);
}*/
CUDPPRadixSortPlan::CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements)
: CUDPPPlan(config, numElements, 1, 0),
m_scanPlan(0),
m_tempKeys(0),
m_tempValues(0),
m_counters(0),
m_countersSum(0),
m_blockOffsets(0)
{
size_t numBlocks2 = ((numElements % (SORT_CTA_SIZE * 2)) == 0) ?
(numElements / (SORT_CTA_SIZE * 2)) : (numElements / (SORT_CTA_SIZE * 2) + 1);
CUDPPConfiguration scanConfig =
{
CUDPP_SCAN,
CUDPP_ADD,
CUDPP_UINT,
CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE
};
if(m_config.options == CUDPP_OPTION_KEYS_ONLY)
m_bKeysOnly = true;
else
m_bKeysOnly = false;
m_scanPlan = new CUDPPScanPlan(scanConfig, numBlocks2*16, 1, 0);
allocRadixSortStorage(this);
}
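// Sizing note (illustrative arithmetic, not part of the original source): each
// block processes SORT_CTA_SIZE * 2 = 512 keys and keeps 16 per-radix counters,
// so for, e.g., numElements = 1,000,000 the constructor uses
// numBlocks2 = ceil(1000000 / 512) = 1954 and builds a scan plan over
// 1954 * 16 = 31264 counters.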
CUDPPRadixSortPlan::~CUDPPRadixSortPlan()
{
delete m_scanPlan;
freeRadixSortStorage(this);
}
/** @brief SparseMatrixVectorMultiply Plan constructor
*
* @param[in] config The configuration struct specifying options
* @param[in] numNonZeroElements The number of non-zero elements in sparse matrix
* @param[in] A Array of non-zero matrix elements
* @param[in] rowIndex Array of indices of the first element of each row
* in the "flattened" version of the sparse matrix
* @param[in] index Array of indices of non-zero elements in the matrix
* @param[in] numRows The number of rows in the sparse matrix
CUDPPSparseMatrixVectorMultiplyPlan::CUDPPSparseMatrixVectorMultiplyPlan(
CUDPPConfiguration config,
size_t numNonZeroElements,
const void *A,
const unsigned int *rowIndex,
const unsigned int *index,
size_t numRows
)
: CUDPPPlan(config, numNonZeroElements, 1, 0),
m_segmentedScanPlan(0),
m_d_prod(0),
m_d_flags(0),
m_d_rowFinalIndex(0),
m_rowFinalIndex(0),
m_numRows(numRows),
m_numNonZeroElements(numNonZeroElements)
{
CUDPPConfiguration segScanConfig =
{
CUDPP_SEGMENTED_SCAN,
CUDPP_ADD,
config.datatype,
(CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE)
};
m_segmentedScanPlan = new CUDPPSegmentedScanPlan(segScanConfig, m_numNonZeroElements);
// Generate an array of the indices of the last element of each row
// in the "flattened" version of the sparse matrix
m_rowFinalIndex = new unsigned int [m_numRows];
for (unsigned int i=0; i < m_numRows; ++i)
{
if (i < m_numRows-1)
m_rowFinalIndex[i] = rowIndex[i+1];
else
m_rowFinalIndex[i] = (unsigned int)numNonZeroElements;
}
allocSparseMatrixVectorMultiplyStorage(this, A, rowIndex, index);
}
*/
/** @brief Sparse matrix-vector plan destructor
CUDPPSparseMatrixVectorMultiplyPlan::~CUDPPSparseMatrixVectorMultiplyPlan()
{
freeSparseMatrixVectorMultiplyStorage(this);
delete m_segmentedScanPlan;
delete [] m_rowFinalIndex;
}
*/
/** @brief CUDPP Rand Plan Constructor
* @param[in] config The configuration struct specifying options
* @param[in] num_elements The number of elements to generate random bits for
CUDPPRandPlan::CUDPPRandPlan(CUDPPConfiguration config, size_t num_elements)
: CUDPPPlan(config, num_elements, 1, 0),
m_seed(0)
{
}
*/

View File

@@ -0,0 +1,158 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef __CUDPP_PLAN_H__
#define __CUDPP_PLAN_H__
typedef void* KernelPointer;
extern "C" size_t getNumCTAs(KernelPointer kernel);
extern "C" void compNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock);
template <typename T>
size_t numCTAs(T kernel)
{
return getNumCTAs((KernelPointer)kernel);
}
template <typename T>
void computeNumCTAs(T kernel, unsigned int bytesDynamicSharedMem, size_t threadsPerBlock)
{
compNumCTAs((KernelPointer)kernel, bytesDynamicSharedMem, threadsPerBlock);
}
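// Illustrative usage sketch (hypothetical kernel and byte-count names): the CTA
// count for a kernel is computed once and cached, then looked up at dispatch time:
//
//   computeNumCTAs(scanKernel, sharedMemBytes, CTA_SIZE);  // e.g. at plan creation
//   size_t nBlocks = numCTAs(scanKernel);                  // e.g. at kernel launch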
/** @brief Base class for CUDPP Plan data structures
*
* CUDPPPlan and its subclasses provide the internal (i.e. not visible to the
* library user) infrastructure for planning algorithm execution. They
* own intermediate storage for CUDPP algorithms as well as, in some cases,
* information about optimal execution configuration for the present hardware.
*
*/
class CUDPPPlan
{
public:
CUDPPPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
virtual ~CUDPPPlan() {}
// Note anything passed to functions compiled by NVCC must be public
CUDPPConfiguration m_config; //!< @internal Options structure
size_t m_numElements; //!< @internal Maximum number of input elements
size_t m_numRows; //!< @internal Maximum number of input rows
size_t m_rowPitch; //!< @internal Pitch of input rows in elements
};
/** @brief Plan class for scan algorithm
*
*/
class CUDPPScanPlan : public CUDPPPlan
{
public:
CUDPPScanPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
virtual ~CUDPPScanPlan();
void **m_blockSums; //!< @internal Intermediate block sums array
size_t *m_rowPitches; //!< @internal Pitch of each row in elements (for cudppMultiScan())
size_t m_numEltsAllocated; //!< @internal Number of elements allocated (maximum scan size)
size_t m_numRowsAllocated; //!< @internal Number of rows allocated (for cudppMultiScan())
size_t m_numLevelsAllocated; //!< @internal Number of levels allocated (in _scanBlockSums)
};
/** @brief Plan class for segmented scan algorithm
*
*/
class CUDPPSegmentedScanPlan : public CUDPPPlan
{
public:
CUDPPSegmentedScanPlan(CUDPPConfiguration config, size_t numElements);
virtual ~CUDPPSegmentedScanPlan();
void **m_blockSums; //!< @internal Intermediate block sums array
unsigned int **m_blockFlags; //!< @internal Intermediate block flags array
unsigned int **m_blockIndices; //!< @internal Intermediate block indices array
size_t m_numEltsAllocated; //!< @internal Number of elements allocated (maximum scan size)
size_t m_numLevelsAllocated; //!< @internal Number of levels allocated (in _scanBlockSums)
};
/** @brief Plan class for compact algorithm
*
*/
class CUDPPCompactPlan : public CUDPPPlan
{
public:
CUDPPCompactPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
virtual ~CUDPPCompactPlan();
CUDPPScanPlan *m_scanPlan; //!< @internal Compact performs a scan of type unsigned int using this plan
unsigned int* m_d_outputIndices; //!< @internal Output address of compacted elements; this is the result of scan
};
class CUDPPRadixSortPlan : public CUDPPPlan
{
public:
CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements);
virtual ~CUDPPRadixSortPlan();
bool m_bKeysOnly;
bool m_bManualCoalesce;
bool m_bUsePersistentCTAs;
unsigned int m_persistentCTAThreshold[2];
unsigned int m_persistentCTAThresholdFullBlocks[2];
CUDPPScanPlan *m_scanPlan; //!< @internal Sort performs a scan of type unsigned int using this plan
unsigned int m_keyBits;
mutable void *m_tempKeys; //!< @internal Intermediate storage for keys
mutable void *m_tempValues; //!< @internal Intermediate storage for values
unsigned int *m_counters; //!< @internal Counter for each radix
unsigned int *m_countersSum; //!< @internal Prefix sum of radix counters
unsigned int *m_blockOffsets; //!< @internal Global offsets of each radix in each block
};
/** @brief Plan class for sparse-matrix dense-vector multiply
*
*/
class CUDPPSparseMatrixVectorMultiplyPlan : public CUDPPPlan
{
public:
CUDPPSparseMatrixVectorMultiplyPlan(CUDPPConfiguration config, size_t numNZElts,
const void *A,
const unsigned int *rowindx,
const unsigned int *indx, size_t numRows);
virtual ~CUDPPSparseMatrixVectorMultiplyPlan();
CUDPPSegmentedScanPlan *m_segmentedScanPlan; //!< @internal Performs a segmented scan of type T using this plan
void *m_d_prod; //!< @internal Vector of products (of an element in A and its corresponding (that is,
//! belonging to the same row) element in x; this is the input and output of
//! segmented scan
unsigned int *m_d_flags; //!< @internal Vector of flags where a flag is set if an element of A is the first element
//! of its row; this is the flags vector for segmented scan
unsigned int *m_d_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
//! which is the last element of that row. Resides in GPU memory.
unsigned int *m_d_rowIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
//! which is the first element of that row. Resides in GPU memory.
unsigned int *m_d_index; //!< @internal Vector of column numbers, one for each element in A
void *m_d_A; //!< @internal The A matrix
unsigned int *m_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
//! which is the last element of that row. Resides in CPU memory.
size_t m_numRows; //!< Number of rows
size_t m_numNonZeroElements; //!<Number of non-zero elements
};
/** @brief Plan class for random number generator
*
*/
class CUDPPRandPlan : public CUDPPPlan
{
public:
CUDPPRandPlan(CUDPPConfiguration config, size_t num_elements);
unsigned int m_seed; //!< @internal the seed for the random number generator
};
#endif // __CUDPP_PLAN_H__

View File

@@ -0,0 +1,155 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date: 2007-11-19 13:58:06 +0000 (Mon, 19 Nov 2007) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp.h"
#include "cudpp_plan.h"
#include "cudpp_plan_manager.h"
#include "cudpp_maximal_launch.h"
typedef void* KernelPointer;
extern "C" size_t getNumCTAs(KernelPointer kernel)
{
return CUDPPPlanManager::numCTAs(kernel);
}
extern "C" void compNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock)
{
CUDPPPlanManager::computeNumCTAs(kernel, bytesDynamicSharedMem, threadsPerBlock);
}
//! @internal Instantiate the plan manager singleton object
void CUDPPPlanManager::Instantiate()
{
if (NULL == m_instance)
m_instance = new CUDPPPlanManager;
}
//! @internal Destroy the plan manager singleton object
void CUDPPPlanManager::Destroy()
{
if (NULL != m_instance)
{
delete m_instance;
m_instance = NULL;
}
}
/** @brief Plan Manager destructor
* Destroys all plans as well as the plan manager.
*/
CUDPPPlanManager::~CUDPPPlanManager()
{
std::map<CUDPPHandle,CUDPPPlan*>::iterator it;
for (it = m_instance->plans.begin(); it != m_instance->plans.end(); it++)
{
CUDPPPlan* plan = it->second;
delete plan;
plan = NULL;
}
m_instance->plans.clear();
m_instance->numCTAsTable.clear();
}
/** @brief Add a plan to the plan manager
*
* @returns a valid CUDPPHandle if the plan was successfully added, or
* CUDPP_INVALID_HANDLE otherwise
* @param[in] plan The plan to add
*/
CUDPPHandle CUDPPPlanManager::AddPlan(CUDPPPlan* plan)
{
Instantiate();
std::pair<std::map<CUDPPHandle, CUDPPPlan*>::iterator, bool> ret;
CUDPPHandle handle = (CUDPPHandle)m_instance->plans.size();
ret = m_instance->plans.insert(std::pair<CUDPPHandle,CUDPPPlan*>(handle, plan));
if (ret.second == true)
return handle;
else
return CUDPP_INVALID_HANDLE;
}
/** @brief Remove a plan from the plan manager
*
* @returns true if the plan was successfully removed, false otherwise
* @param[in] handle The handle to the plan to remove
*/
bool CUDPPPlanManager::RemovePlan(CUDPPHandle handle)
{
if (m_instance == NULL)
{
return false;
}
std::map<CUDPPHandle,CUDPPPlan*>::iterator it;
it = m_instance->plans.find(handle);
if (it != m_instance->plans.end())
{
CUDPPPlan* plan = it->second;
delete plan;
plan = NULL;
m_instance->plans.erase(it);
if (0 == m_instance->plans.size())
{
Destroy();
}
return true;
}
else
{
return false;
}
}
/** @brief Get a plan from the plan manager by handle
*
* @returns A pointer to the plan if found, or NULL otherwise
* @param handle The handle to the requested plan
*/
CUDPPPlan* CUDPPPlanManager::GetPlan(CUDPPHandle handle)
{
if (m_instance == NULL)
{
return NULL;
}
std::map<CUDPPHandle, CUDPPPlan*>::iterator it;
it = m_instance->plans.find(handle);
if (it != m_instance->plans.end())
{
return it->second;
}
else
{
return NULL;
}
}
size_t CUDPPPlanManager::numCTAs(KernelPointer kernel)
{
if (m_instance == NULL)
{
return 0;
}
return m_instance->numCTAsTable[kernel];
}
void CUDPPPlanManager::computeNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock)
{
Instantiate();
m_instance->numCTAsTable[kernel] = maxBlocks(kernel, bytesDynamicSharedMem, threadsPerBlock);
}

View File

@@ -0,0 +1,56 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef __CUDPP_PLAN_MANAGER_H__
#define __CUDPP_PLAN_MANAGER_H__
#include <map>
class CUDPPPlan;
typedef void* KernelPointer;
/** @brief Singleton manager class for CUDPPPlan objects
*
* This class manages all active plans in CUDPP. It is a singleton class,
* meaning that only one instance can exist. It is created automatically the
* first time AddPlan() is called, and destroyed when the last plan is removed
* using RemovePlan().
*/
class CUDPPPlanManager
{
public:
static CUDPPHandle AddPlan(CUDPPPlan* plan);
static bool RemovePlan(CUDPPHandle handle);
static CUDPPPlan* GetPlan(CUDPPHandle handle);
static size_t numCTAs(KernelPointer kernel);
static void computeNumCTAs(KernelPointer kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock);
protected:
static CUDPPPlanManager* m_instance;
std::map<CUDPPHandle, CUDPPPlan*> plans;
std::map<void*, size_t> numCTAsTable;
private:
//! @internal Instantiate the plan manager singleton object
static void Instantiate();
//! @internal Destroy the plan manager singleton object
static void Destroy();
private:
CUDPPPlanManager() {}
CUDPPPlanManager(const CUDPPPlanManager&) {}
~CUDPPPlanManager();
};
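// Illustrative lifecycle sketch (not part of the original header): plans are
// registered with the singleton and referred to by opaque handles, e.g.
//
//   CUDPPHandle h = CUDPPPlanManager::AddPlan(somePlan); // instantiates the manager on first use
//   CUDPPPlan  *p = CUDPPPlanManager::GetPlan(h);        // look the plan up later by handle
//   CUDPPPlanManager::RemovePlan(h);                     // deletes the plan; the manager
//                                                        // destroys itself when empty
// (somePlan is a hypothetical CUDPPPlan* created elsewhere.)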
#endif // __CUDPP_PLAN_MANAGER_H__

View File

@@ -0,0 +1,34 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef __RADIXSORT_H__
#define __RADIXSORT_H__
#define SORT_CTA_SIZE 256 //This CTA_SIZE must equal 16 * number of radices
#include "cudpp_globals.h"
#include "cudpp.h"
#include "cudpp_plan.h"
extern "C"
void allocRadixSortStorage(CUDPPRadixSortPlan* plan);
extern "C"
void freeRadixSortStorage(CUDPPRadixSortPlan* plan);
extern "C"
void cudppRadixSortDispatch(void *keys,
void *values,
size_t numElements,
int keyBits,
const CUDPPRadixSortPlan *plan);
#endif // __RADIXSORT_H__

View File

@@ -0,0 +1,36 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp_scan.h
*
* @brief Scan functionality header file - contains CUDPP interface (not public)
*/
#ifndef _CUDPP_SCAN_H_
#define _CUDPP_SCAN_H_
class CUDPPScanPlan;
extern "C"
void allocScanStorage(CUDPPScanPlan *plan);
extern "C"
void freeScanStorage(CUDPPScanPlan *plan);
extern "C"
void cudppScanDispatch(void *d_out,
const void *d_in,
size_t numElements,
size_t numRows,
const CUDPPScanPlan *plan);
#endif // _CUDPP_SCAN_H_

View File

@@ -0,0 +1,363 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp_util.h
*
* @brief C++ utility functions and classes used internally to cuDPP
*/
#ifndef __CUDPP_UTIL_H__
#define __CUDPP_UTIL_H__
#ifdef WIN32
#include <windows.h>
#endif
#include <cuda.h>
#include <cudpp.h>
#include <limits.h>
#include <float.h>
#if (CUDA_VERSION >= 3000)
#define LAUNCH_BOUNDS(x) __launch_bounds__((x))
#define LAUNCH_BOUNDS_MINBLOCKS(x, y) __launch_bounds__((x),(y))
#else
#define LAUNCH_BOUNDS(x)
#define LAUNCH_BOUNDS_MINBLOCKS(x, y)
#endif
/** @brief Determine if \a n is a power of two.
* @param n Value to be checked to see if it is a power of two
* @returns True if \a n is a power of two, false otherwise
*/
inline bool
isPowerOfTwo(int n)
{
return ((n&(n-1))==0) ;
}
/** @brief Determine if an integer \a n is a multiple of an integer \a f.
* @param n Multiple
* @param f Factor
* @returns True if \a n is a multiple of \a f, false otherwise
*/
inline bool
isMultiple(int n, int f)
{
if (isPowerOfTwo(f))
return ((n&(f-1))==0);
else
return (n%f==0);
}
/** @brief Compute the smallest power of two greater than or equal to \a n.
* @param n Input value
* @returns The smallest power of two greater than or equal to \a n
*/
inline int
ceilPow2(int n)
{
double log2n = log2((double)n);
if (isPowerOfTwo(n))
return n;
else
return 1 << (int)ceil(log2n);
}
/** @brief Compute the largest power of two less than or equal to \a n.
* @param n Input value
* @returns The largest power of two less than or equal to \a n.
*/
inline int
floorPow2(int n)
{
#ifdef WIN32
// method 2
return 1 << (int)_logb((float)n);
#else
// method 3
int exp;
frexp((float)n, &exp);
return 1 << (exp - 1);
#endif
}
/** @brief Returns the maximum value for type \a T.
*
* Implemented using template specialization on \a T.
*/
template <class T>
__host__ __device__ inline T getMax() { return 0; }
/** @brief Returns the minimum value for type \a T.
*
* Implemented using template specialization on \a T.
*/
template <class T>
__host__ __device__ inline T getMin() { return 0; }
// type specializations for the above
// getMax
template <> __host__ __device__ inline int getMax() { return INT_MAX; }
template <> __host__ __device__ inline unsigned int getMax() { return INT_MAX; }
template <> __host__ __device__ inline float getMax() { return FLT_MAX; }
template <> __host__ __device__ inline char getMax() { return (char)INT_MAX; }
template <> __host__ __device__ inline unsigned char getMax() { return (unsigned char)INT_MAX; }
// getMin
template <> __host__ __device__ inline int getMin() { return INT_MIN; }
template <> __host__ __device__ inline unsigned int getMin() { return 0; }
template <> __host__ __device__ inline float getMin() { return -FLT_MAX; }
template <> __host__ __device__ inline char getMin() { return (char)INT_MIN; }
template <> __host__ __device__ inline unsigned char getMin() { return (unsigned char)0; }
/** @brief Returns the maximum of three values.
* @param a First value.
* @param b Second value.
* @param c Third value.
* @returns The maximum of \a a, \a b and \a c.
*/
template<class T>
inline T max3(T a, T b, T c)
{
return (a > b) ? ((a > c)? a : c) : ((b > c) ? b : c);
}
/** @brief Utility template struct for generating small vector types from scalar types
*
* Given a base scalar type (\c int, \c float, etc.) and a vector length (1 through 4) as
* template parameters, this struct defines a vector type (\c float3, \c int4, etc.) of the
* specified length and base type. For example:
* \code
* template <class T>
* __device__ void myKernel(T *data)
* {
* typename typeToVector<T,4>::Result myVec4; // create a vec4 of type T
* myVec4 = ((typename typeToVector<T,4>::Result*)data)[0]; // load first element of data as a vec4
* }
* \endcode
*
* This functionality is implemented using template specialization. Currently specializations
* for int, float, and unsigned int vectors of lengths 2-4 are defined. Note that this results
* in types being generated at compile time -- there is no runtime cost. typeToVector is used by
* the optimized scan \c __device__ functions in scan_cta.cu.
*/
template <typename T, int N>
struct typeToVector
{
typedef T Result;
};
template<>
struct typeToVector<int, 4>
{
typedef int4 Result;
};
template<>
struct typeToVector<unsigned int, 4>
{
typedef uint4 Result;
};
template<>
struct typeToVector<float, 4>
{
typedef float4 Result;
};
template<>
struct typeToVector<int, 3>
{
typedef int3 Result;
};
template<>
struct typeToVector<unsigned int, 3>
{
typedef uint3 Result;
};
template<>
struct typeToVector<float, 3>
{
typedef float3 Result;
};
template<>
struct typeToVector<int, 2>
{
typedef int2 Result;
};
template<>
struct typeToVector<unsigned int, 2>
{
typedef uint2 Result;
};
template<>
struct typeToVector<float, 2>
{
typedef float2 Result;
};
/** @brief Templatized operator class used by scan and segmented scan
*
* This Operator class is used to allow generic support of binary
* associative operators in scan. It defines two member functions,
* op() and identity(), that are used in place of + and 0 (for
* example) in the scan and segmented scan code. Because this is
* template code, all decisions in the code are made at compile
* time, resulting in optimal operator code. Currently the operators
* CUDPP_ADD, CUDPP_MULTIPLY, CUDPP_MIN, and CUDPP_MAX are supported.
* Operator is implemented using template specialization for the
* types \c int, \c unsigned int, and \c float.
*/
template <typename T, CUDPPOperator oper>
class Operator
{
public:
/** Applies the operator to operands \a a and \a b.
* @param a First operand
* @param b Second operand
* @returns a OP b, where OP is defined by ::CUDPPOperator \a oper.
*/
static __device__ T op(const T a, const T b)
{
switch (oper)
{
case CUDPP_ADD:
return a + b;
case CUDPP_MULTIPLY:
return a * b;
case CUDPP_MIN:
return min(a, b);
case CUDPP_MAX:
return max(a, b);
}
}
/** Returns the identity element defined for type \a T */
static __device__ T identity() { return 0; }
};
// specializations for different types
template <CUDPPOperator oper>
class Operator <int, oper>
{
public:
static __device__ int op(const int a, const int b)
{
switch (oper)
{
default:
case CUDPP_ADD:
return a + b;
case CUDPP_MULTIPLY:
return a * b;
case CUDPP_MIN:
return min(a, b);
case CUDPP_MAX:
return max(a, b);
}
}
static __device__ int identity()
{
switch (oper)
{
default:
case CUDPP_ADD:
return 0;
case CUDPP_MULTIPLY:
return 1;
case CUDPP_MIN:
return INT_MAX;
case CUDPP_MAX:
return INT_MIN;
}
}
};
template <CUDPPOperator oper>
class Operator <unsigned int, oper>
{
public:
static __device__ unsigned int op(const unsigned int a, const unsigned int b)
{
switch (oper)
{
default:
case CUDPP_ADD:
return a + b;
case CUDPP_MULTIPLY:
return a * b;
case CUDPP_MIN:
return min(a, b);
case CUDPP_MAX:
return max(a, b);
}
}
static __device__ unsigned int identity()
{
switch (oper)
{
default:
case CUDPP_ADD:
return 0;
case CUDPP_MULTIPLY:
return 1;
case CUDPP_MIN:
return UINT_MAX;
case CUDPP_MAX:
return 0;
}
}
};
template <CUDPPOperator oper>
class Operator <float, oper>
{
public:
static __device__ float op(const float a, const float b)
{
switch (oper)
{
default:
case CUDPP_ADD:
return a + b;
case CUDPP_MULTIPLY:
return a * b;
case CUDPP_MIN:
return min(a, b);
case CUDPP_MAX:
return max(a, b);
}
}
static __device__ float identity()
{
switch (oper)
{
default:
case CUDPP_ADD:
return 0.0f;
case CUDPP_MULTIPLY:
return 1.0f;
case CUDPP_MIN:
return FLT_MAX;
case CUDPP_MAX:
return -FLT_MAX;
}
}
};
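/* Illustrative sketch (not part of the original header; reducePair is a
 * hypothetical helper): because op() and identity() are resolved at compile
 * time, device code can be written once against the generic interface and
 * instantiated per type and operator:
 * \code
 * template <typename T, CUDPPOperator oper>
 * __device__ T reducePair(T a, T b)
 * {
 *     // e.g. Operator<float, CUDPP_MAX>::op(a, b) compiles to max(a, b),
 *     //      Operator<int, CUDPP_ADD>::identity() compiles to 0
 *     return Operator<T, oper>::op(a, b);
 * }
 * \endcode
 */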
#endif // __CUDPP_UTIL_H__
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:

879
lib/gpu/cudpp_mini/cutil.h Normal file
View File

@@ -0,0 +1,879 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* CUda UTility Library */
#ifndef _CUTIL_H_
#define _CUTIL_H_
#include <cuda_runtime.h>
#ifdef _WIN32
# pragma warning( disable : 4996 ) // disable deprecated warning
#endif
#ifdef __cplusplus
extern "C" {
#endif
// helper typedefs for building DLL
#ifdef _WIN32
# ifdef BUILD_DLL
# define DLL_MAPPING __declspec(dllexport)
# else
# define DLL_MAPPING __declspec(dllimport)
# endif
#else
# define DLL_MAPPING
#endif
#ifdef _WIN32
#define CUTIL_API __stdcall
#else
#define CUTIL_API
#endif
////////////////////////////////////////////////////////////////////////////
//! CUT bool type
////////////////////////////////////////////////////////////////////////////
enum CUTBoolean
{
CUTFalse = 0,
CUTTrue = 1
};
////////////////////////////////////////////////////////////////////////////
//! Deallocate memory allocated within Cutil
//! @param ptr pointer to memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
void CUTIL_API
cutFree( void* ptr);
////////////////////////////////////////////////////////////////////////////
//! Helper for bank conflict checking (should only be used with the
//! CUT_BANK_CHECKER macro)
//! @param tidx thread id in x dimension of block
//! @param tidy thread id in y dimension of block
//! @param tidz thread id in z dimension of block
//! @param bdimx block size in x dimension
//! @param bdimy block size in y dimension
//! @param bdimz block size in z dimension
//! @param file name of the source file where the access takes place
//! @param line line in the source file where the access takes place
//! @param aname name of the array which is accessed
//! @param index index into the array
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
void CUTIL_API
cutCheckBankAccess( unsigned int tidx, unsigned int tidy, unsigned int tidz,
unsigned int bdimx, unsigned int bdimy,
unsigned int bdimz, const char* file, const int line,
const char* aname, const int index);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename within a hardcoded set of paths
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
char* CUTIL_API
cutFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Find a file within a specified directory tree
//! @return CUTTrue if the file was found, otherwise CUTFalse
//! @param outputPath the path to the file, if found
//! @param startDir the root of the directory tree to search
//! @param dirName the name of the file to find
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutFindFile(char * outputPath, const char * startDir, const char * dirName);
////////////////////////////////////////////////////////////////////////////
//! Find a directory within a specified directory tree
//! @return CUTTrue if the directory was found, otherwise CUTFalse
//! @param outputPath the path to the directory, if found
//! @param startDir the root of the directory tree to search
//! @param dirName the name of the directory to find
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutFindDir(char * outputPath, const char * startDir, const char * dirName);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
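////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (hypothetical file name "input.dat"):
//
//   float* data = NULL; unsigned int len = 0;
//   if (CUTTrue == cutReadFilef("input.dat", &data, &len))
//   {
//       /* ... use the len floats in data ... */
//       cutFree(data);   // the buffer was allocated inside cutil
//   }
////////////////////////////////////////////////////////////////////////////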
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFileui( const char* filename,const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFileub( const char* filename,const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPPMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPPM4ub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned int as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMi( const char* file, unsigned int** data,
unsigned int* w, unsigned int* h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned short as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMs( const char* file, unsigned short** data,
unsigned int* w, unsigned int* h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with float as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMf( const char* file, float** data,
unsigned int* w, unsigned int* h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePGMub( const char* file, unsigned char* data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePPMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned int as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePGMi( const char* file, unsigned int* data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned short as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePGMs( const char* file, unsigned short* data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with float as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePGMf( const char* file, float* data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are seperated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
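////////////////////////////////////////////////////////////////////////////
//! Illustrative usage sketch for the command line helpers declared below
//! (not a definitive recipe; the argument names are hypothetical
//! placeholders):
//! \code
//! // invoked e.g. as:  ./app --samples=50 --scale=2.5 --quiet
//! int main(int argc, char** argv)
//! {
//!     int   samples = 10;     // defaults used when an argument is absent
//!     float scale   = 1.0f;
//!     cutGetCmdLineArgumenti(argc, (const char**) argv, "samples", &samples);
//!     cutGetCmdLineArgumentf(argc, (const char**) argv, "scale",   &scale);
//!     if (cutCheckCmdLineFlag(argc, (const char**) argv, "quiet") == CUTFalse)
//!         printf("samples=%d scale=%f\n", samples, scale);
//!     return 0;
//! }
//! \endcode
////////////////////////////////////////////////////////////////////////////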
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag_name is given
//! @return CUTTrue if command line argument \a flag_name has been given,
//! otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list whose elements are strings
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Extended assert
//! @return CUTTrue if the condition \a val holds, otherwise CUTFalse
//! @param val condition to test
//! @param file __FILE__ macro
//! @param line __LINE__ macro
//! @note This function should be used via the CUT_CONDITION(val) macro
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCheckCondition( int val, const char* file, const int line);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays with an epsilon tolerance for equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const int epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
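////////////////////////////////////////////////////////////////////////////
//! Illustrative usage sketch for the comparison routines above (not a
//! definitive recipe; the array names, their length \a len, and the 1e-6f
//! tolerance are hypothetical placeholders):
//! \code
//! // h_reference: gold result computed on the CPU
//! // h_result:    result copied back from the GPU, len elements each
//! CUTBoolean ok = cutCompareL2fe(h_reference, h_result, len, 1e-6f);
//! printf("Test %s\n", (ok == CUTTrue) ? "PASSED" : "FAILED");
//! \endcode
////////////////////////////////////////////////////////////////////////////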
////////////////////////////////////////////////////////////////////////////
//! Timer functionality
////////////////////////////////////////////////////////////////////////////
//! Create a new timer
//! @return CUTTrue if a timer has been created, otherwise CUTFalse
//! @param name name of the new timer, set to 0 if the creation failed
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCreateTimer( unsigned int* name);
////////////////////////////////////////////////////////////////////////////
//! Delete a timer
//! @return CUTTrue if the timer has been deleted, otherwise CUTFalse
//! @param name name of the timer to delete
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutDeleteTimer( unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Start the timer with name \a name
//! @param name name of the timer to start
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutStartTimer( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Stop the timer with name \a name. Does not reset.
//! @param name name of the timer to stop
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutStopTimer( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Resets the timer's counter.
//! @param name name of the timer to reset.
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutResetTimer( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Returns total execution time in milliseconds for the timer over all
//! runs since the last reset or timer creation.
//! @param name name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API
cutGetTimerValue( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Return the average time in milliseconds for timer execution as the
//! total time for the timer divided by the number of completed (stopped)
//! runs the timer has made.
//! Excludes the current running time if the timer is currently running.
//! @param name name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API
cutGetAverageTimerValue( const unsigned int name);
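////////////////////////////////////////////////////////////////////////////
//! Illustrative usage sketch for the timer API above (not a definitive
//! recipe; "myKernel", its launch configuration, and "d_data" are
//! hypothetical placeholders):
//! \code
//! unsigned int timer = 0;
//! cutCreateTimer(&timer);
//! cutStartTimer(timer);
//! myKernel<<<grid, block>>>(d_data);
//! cudaThreadSynchronize();             // include the GPU work in the timing
//! cutStopTimer(timer);
//! printf("elapsed: %f ms\n", cutGetTimerValue(timer));
//! cutDeleteTimer(timer);
//! \endcode
////////////////////////////////////////////////////////////////////////////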
////////////////////////////////////////////////////////////////////////////
//! Macros
#ifdef _DEBUG
#if __DEVICE_EMULATION__
// Interface for bank conflict checker
#define CUT_BANK_CHECKER( array, index) \
(cutCheckBankAccess( threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x, \
blockDim.y, blockDim.z, \
__FILE__, __LINE__, #array, index ), \
array[index])
#else
#define CUT_BANK_CHECKER( array, index) array[index]
#endif
# define CU_SAFE_CALL_NO_SYNC( call ) do { \
CUresult err = call; \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CU_SAFE_CALL( call ) do { \
CU_SAFE_CALL_NO_SYNC(call); \
CUresult err = cuCtxSynchronize(); \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUDA_SAFE_CALL_NO_SYNC( call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUDA_SAFE_CALL( call) do { \
CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUFFT_SAFE_CALL( call) do { \
cufftResult err = call; \
if( CUFFT_SUCCESS != err) { \
fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUT_SAFE_CALL( call) \
if( CUTTrue != call) { \
fprintf(stderr, "Cut error in file '%s' in line %i.\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
}
//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} } while (0)
//! Check for malloc error
# define CUT_SAFE_MALLOC( mallocCall ) do{ \
if( !(mallocCall)) { \
fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
    } } while(0)
//! Check if condition is true (flexible assert)
# define CUT_CONDITION( val) \
if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) { \
exit(EXIT_FAILURE); \
}
#else // not DEBUG
#define CUT_BANK_CHECKER( array, index) array[index]
// void macros for performance reasons
# define CUT_CHECK_ERROR(errorMessage)
# define CUT_CHECK_ERROR_GL()
# define CUT_CONDITION( val)
# define CU_SAFE_CALL_NO_SYNC( call) call
# define CU_SAFE_CALL( call) call
# define CUDA_SAFE_CALL_NO_SYNC( call) call
# define CUDA_SAFE_CALL( call) call
# define CUT_SAFE_CALL( call) call
# define CUFFT_SAFE_CALL( call) call
# define CUT_SAFE_MALLOC( mallocCall ) mallocCall
#endif
#if __DEVICE_EMULATION__
# define CUT_DEVICE_INIT(ARGC, ARGV)
#else
# define CUT_DEVICE_INIT(ARGC, ARGV) { \
int deviceCount; \
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount)); \
if (deviceCount == 0) { \
fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); \
exit(EXIT_FAILURE); \
} \
int dev = 0; \
cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev); \
if (dev > deviceCount-1) dev = deviceCount - 1; \
cudaDeviceProp deviceProp; \
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev)); \
if (deviceProp.major < 1) { \
fprintf(stderr, "cutil error: device does not support CUDA.\n"); \
exit(EXIT_FAILURE); \
} \
if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \
fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name); \
CUDA_SAFE_CALL(cudaSetDevice(dev)); \
}
#endif
# define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV) { \
cuDevice = 0; \
int deviceCount = 0; \
CUresult err = cuInit(0); \
if (CUDA_SUCCESS == err) \
CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount)); \
if (deviceCount == 0) { \
fprintf(stderr, "cutil error: no devices supporting CUDA\n"); \
exit(EXIT_FAILURE); \
} \
int dev = 0; \
cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev); \
if (dev > deviceCount-1) dev = deviceCount - 1; \
CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev)); \
char name[100]; \
cuDeviceGetName(name, 100, cuDevice); \
if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \
fprintf(stderr, "Using device %d: %s\n", dev, name); \
}
#define CUT_EXIT(argc, argv) \
if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) { \
printf("\nPress ENTER to exit...\n"); \
fflush( stdout); \
fflush( stderr); \
getchar(); \
} \
exit(EXIT_SUCCESS);
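////////////////////////////////////////////////////////////////////////////
//! Illustrative usage sketch for the device init / exit macros above (not a
//! definitive recipe; "runTest" is a hypothetical placeholder for the
//! application's own work):
//! \code
//! int main(int argc, char** argv)
//! {
//!     CUT_DEVICE_INIT(argc, argv);   // honours --device=N and --quiet
//!     runTest(argc, argv);           // application work goes here
//!     CUT_EXIT(argc, argv);          // waits for ENTER unless --noprompt is given
//! }
//! \endcode
////////////////////////////////////////////////////////////////////////////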
#ifdef __cplusplus
}
#endif // #ifdef __cplusplus
#endif // #ifndef _CUTIL_H_

View File

@@ -0,0 +1,868 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_radixsort.h"
#include <cudpp_globals.h>
#include "sharedmem.h"
#include "cta/radixsort_cta.cu"
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif
/**
* @file
* radixsort_app.cu
*
* @brief CUDPP kernel-level radix sorting routines
*/
/** \addtogroup cudpp_kernel
* @{
*/
/** @name RadixSort Functions
* @{
*/
typedef unsigned int uint;
/** @brief An empty kernel used to reset CTA issue hardware
**/
__global__ void emptyKernel() {}
/** @brief Does special binary arithmetic before sorting floats
*
* Uses floatFlip function to flip bits.
* @param[in,out] values Values to be manipulated
* @param[in] numValues Number of values to be flipped
**/
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
flipFloats(uint *values, uint numValues)
{
uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x;
if (index < numValues) values[index] = floatFlip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatFlip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatFlip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatFlip<true>(values[index]);
}
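/* For reference, a host-side sketch of the order-preserving bit transform that
 * a floatFlip-style helper applies (the device-side floatFlip itself comes from
 * the includes above; the name floatFlipRef below is a hypothetical
 * placeholder): negative floats have every bit flipped, non-negative floats
 * have only the sign bit flipped, so the resulting unsigned integers sort in
 * the same order as the original IEEE-754 floats.
 *
 *   unsigned int floatFlipRef(unsigned int f)
 *   {
 *       unsigned int mask = (f & 0x80000000u) ? 0xffffffffu : 0x80000000u;
 *       return f ^ mask;
 *   }
 */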
/** @brief Undoes the flips from flipFloats
*
* Uses floatUnflip function to unflip bits.
* @param[in,out] values Values to be manipulated
* @param[in] numValues Number of values to be unflipped
**/
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
unflipFloats(uint *values, uint numValues)
{
uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x;
if (index < numValues) values[index] = floatUnflip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatUnflip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatUnflip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatUnflip<true>(values[index]);
}
/** @brief Optimization for sorts of WARP_SIZE or fewer elements
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] numElements Number of elements in the sort.
*/
template <bool flip>
__global__
LAUNCH_BOUNDS(WARP_SIZE)
void radixSortSingleWarp(uint *keys,
uint *values,
uint numElements)
{
volatile __shared__ uint sKeys[WARP_SIZE]; //remove class distinctions
volatile __shared__ uint sValues[WARP_SIZE];
volatile __shared__ uint sFlags[WARP_SIZE];
sKeys[threadIdx.x] = floatFlip<flip>(keys[threadIdx.x]);
sValues[threadIdx.x] = values[threadIdx.x];
__EMUSYNC; // emulation only
for(uint i = 1; i < numElements; i++)
{
uint key_i = sKeys[i];
uint val_i = sValues[i];
sFlags[threadIdx.x] = 0;
uint temp, tempval;
if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
{
temp = sKeys[threadIdx.x];
tempval = sValues[threadIdx.x];
sFlags[threadIdx.x] = 1;
#ifdef __DEVICE_EMULATION__
}
__EMUSYNC;
if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
{
#endif
sKeys[threadIdx.x + 1] = temp;
sValues[threadIdx.x + 1] = tempval;
sFlags[threadIdx.x + 1] = 0;
}
if(sFlags[threadIdx.x] == 1 )
{
sKeys[threadIdx.x] = key_i;
sValues[threadIdx.x] = val_i;
}
__EMUSYNC; // emulation only
}
keys[threadIdx.x] = floatUnflip<flip>(sKeys[threadIdx.x]);
values[threadIdx.x] = sValues[threadIdx.x];
}
/** @brief Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version.
*
* @param[in,out] keys Keys to be sorted
* @param[in] numElements Total number of elements to be sorted
**/
template <bool flip>
__global__
LAUNCH_BOUNDS(WARP_SIZE)
void radixSortSingleWarpKeysOnly(uint *keys,
uint numElements)
{
volatile __shared__ uint sKeys[WARP_SIZE];
volatile __shared__ uint sFlags[WARP_SIZE];
sKeys[threadIdx.x] = floatFlip<flip>(keys[threadIdx.x]);
__EMUSYNC; // emulation only
for(uint i = 1; i < numElements; i++)
{
uint key_i = sKeys[i];
sFlags[threadIdx.x] = 0;
uint temp;
if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
{
temp = sKeys[threadIdx.x];
sFlags[threadIdx.x] = 1;
#ifdef __DEVICE_EMULATION__
}
__EMUSYNC;
if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
{
#endif
sKeys[threadIdx.x + 1] = temp;
sFlags[threadIdx.x + 1] = 0;
}
if(sFlags[threadIdx.x] == 1 )
{
sKeys[threadIdx.x] = key_i;
}
__EMUSYNC; // emulation only
}
keys[threadIdx.x] = floatUnflip<flip>(sKeys[threadIdx.x]);
}
/** @brief Sorts all blocks of data independently in shared memory.
* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements
*
* The radix sort is done in two stages. This stage calls radixSortBlock on each
* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size (fullBlocks)
* differently than arrays that are not. "flip" is used to only compile in the
* float flip code when float keys are used. "loop" is used when persistent CTAs
* are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[out] keysOut Output of sorted keys
* @param[out] valuesOut Output of associated values
* @param[in] keysIn Input of unsorted keys in GPU
* @param[in] valuesIn Input of associated input values
* @param[in] numElements Total number of elements to sort
* @param[in] totalBlocks The number of blocks of data to sort
*/
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
radixSortBlocks(uint4* keysOut, uint4* valuesOut,
uint4* keysIn, uint4* valuesIn,
uint numElements, uint totalBlocks)
{
extern __shared__ uint4 sMem[];
uint4 key, value;
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint i = blockId * blockDim.x + threadIdx.x;
uint idx = i << 2;
// handle non-full last block if array is not multiple of 1024 numElements
if (!fullBlocks && idx+3 >= numElements)
{
if (idx >= numElements)
{
key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
value = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
}
else
{
// for non-full block, we handle uint1 values instead of uint4
uint *keys1 = (uint*)keysIn;
uint *values1 = (uint*)valuesIn;
key.x = (idx < numElements) ? floatFlip<flip>(keys1[idx]) : UINT_MAX;
key.y = (idx+1 < numElements) ? floatFlip<flip>(keys1[idx+1]) : UINT_MAX;
key.z = (idx+2 < numElements) ? floatFlip<flip>(keys1[idx+2]) : UINT_MAX;
key.w = UINT_MAX;
value.x = (idx < numElements) ? values1[idx] : UINT_MAX;
value.y = (idx+1 < numElements) ? values1[idx+1] : UINT_MAX;
value.z = (idx+2 < numElements) ? values1[idx+2] : UINT_MAX;
value.w = UINT_MAX;
}
}
else
{
key = keysIn[i];
value = valuesIn[i];
if (flip)
{
key.x = floatFlip<flip>(key.x);
key.y = floatFlip<flip>(key.y);
key.z = floatFlip<flip>(key.z);
key.w = floatFlip<flip>(key.w);
}
}
__syncthreads();
radixSortBlock<nbits, startbit>(key, value);
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && idx+3 >= numElements)
{
if (idx < numElements)
{
// for non-full block, we handle uint1 values instead of uint4
uint *keys1 = (uint*)keysOut;
uint *values1 = (uint*)valuesOut;
keys1[idx] = key.x;
values1[idx] = value.x;
if (idx + 1 < numElements)
{
keys1[idx + 1] = key.y;
values1[idx + 1] = value.y;
if (idx + 2 < numElements)
{
keys1[idx + 2] = key.z;
values1[idx + 2] = value.z;
}
}
}
}
else
{
keysOut[i] = key;
valuesOut[i] = value;
}
if (loop)
blockId += gridDim.x;
else
break;
}
}
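/* A sketch of how one 4-bit sorting pass could tie this kernel together with
 * findRadixOffsets(), a scan of the counters, and reorderData(). The real
 * host-side driver lives elsewhere in CUDPP; the buffer names, launch
 * configurations, and shared-memory sizes below are hypothetical placeholders:
 *
 *   radixSortBlocks<4, 0, true, false, false>
 *       <<<numBlocks, SORT_CTA_SIZE, radixSortSmemBytes>>>
 *       ((uint4*)tempKeys, (uint4*)tempValues,
 *        (uint4*)d_keys, (uint4*)d_values, numElements, numBlocks);
 *   findRadixOffsets<0, true, false>
 *       <<<numBlocks, SORT_CTA_SIZE, offsetsSmemBytes>>>
 *       ((uint2*)tempKeys, counters, blockOffsets, numElements, numBlocks);
 *   // exclusive scan of 'counters' -> 'countersSum' (done with the CUDPP scan)
 *   reorderData<0, true, true, false, false>
 *       <<<numBlocks, SORT_CTA_SIZE>>>
 *       (d_keys, d_values, (uint2*)tempKeys, (uint2*)tempValues,
 *        blockOffsets, countersSum, counters, numElements, numBlocks);
 *   // ...then repeat for startbit = 4, 8, ..., 28
 */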
/** @brief Computes the number of keys of each radix in each block and stores the offsets.
*
* Given an array with blocks sorted according to a 4-bit radix group, each
* block counts the number of keys that fall into each radix in the group, and
* finds the starting offset of each radix in the block. It then writes the radix
* counts to the counters array, and the starting offsets to the blockOffsets array.
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size
* (fullBlocks) differently than arrays that are not. "loop" is used when persistent
* CTAs are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[in] keys Input keys
* @param[out] counters Radix count for each block
* @param[out] blockOffsets The offset address for each block
* @param[in] numElements Total number of elements
* @param[in] totalBlocks Total number of blocks
**/
template<uint startbit, bool fullBlocks, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
findRadixOffsets(uint2 *keys,
uint *counters,
uint *blockOffsets,
uint numElements,
uint totalBlocks)
{
extern __shared__ uint sRadix1[];
__shared__ uint sStartPointers[16];
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint2 radix2;
uint i = blockId * blockDim.x + threadIdx.x;
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && ((i + 1) << 1 ) > numElements )
{
// handle uint1 rather than uint2 for non-full blocks
uint *keys1 = (uint*)keys;
uint j = i << 1;
radix2.x = (j < numElements) ? keys1[j] : UINT_MAX;
j++;
radix2.y = (j < numElements) ? keys1[j] : UINT_MAX;
}
else
{
radix2 = keys[i];
}
sRadix1[2 * threadIdx.x] = (radix2.x >> startbit) & 0xF;
sRadix1[2 * threadIdx.x + 1] = (radix2.y >> startbit) & 0xF;
// Finds the position where the sRadix1 entries differ and stores start
// index for each radix.
if(threadIdx.x < 16)
{
sStartPointers[threadIdx.x] = 0;
}
__syncthreads();
if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) )
{
sStartPointers[sRadix1[threadIdx.x]] = threadIdx.x;
}
if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1])
{
sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE]] = threadIdx.x + SORT_CTA_SIZE;
}
__syncthreads();
if(threadIdx.x < 16)
{
blockOffsets[blockId*16 + threadIdx.x] = sStartPointers[threadIdx.x];
}
__syncthreads();
// Compute the sizes of each block.
if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) )
{
sStartPointers[sRadix1[threadIdx.x - 1]] =
threadIdx.x - sStartPointers[sRadix1[threadIdx.x - 1]];
}
if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1] )
{
sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]] =
threadIdx.x + SORT_CTA_SIZE - sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]];
}
if(threadIdx.x == SORT_CTA_SIZE - 1)
{
sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]] =
2 * SORT_CTA_SIZE - sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]];
}
__syncthreads();
if(threadIdx.x < 16)
{
counters[threadIdx.x * totalBlocks + blockId] =
sStartPointers[threadIdx.x];
}
if (loop)
blockId += gridDim.x;
else
break;
}
}
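/* A small worked example of what this kernel produces. Suppose one block of
 * 2*SORT_CTA_SIZE keys has already been block-sorted and its 4-bit digits
 * ((key >> startbit) & 0xF) read
 *
 *     0 0 1 1 1 3 3 ... 3
 *
 * i.e. two 0s starting at position 0, three 1s starting at position 2, and
 * the remaining keys are 3s starting at position 5. Then for this blockId:
 *
 *     blockOffsets[blockId*16 + 0] = 0,  blockOffsets[blockId*16 + 1] = 2,
 *     blockOffsets[blockId*16 + 3] = 5
 *
 * and counters[digit*totalBlocks + blockId] holds the per-digit counts
 * (2, 3, 2*SORT_CTA_SIZE - 5). Because counters is laid out digit-major, an
 * exclusive scan over it yields the global output offset of each
 * (digit, block) pair, which reorderData() below consumes.
 */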
/** @brief Reorders data in the global array.
*
* reorderData shuffles data in the array globally after the radix
* offsets have been found. On compute version 1.1 and earlier GPUs, this code depends
* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
*
* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
* that all writes are coalesced using extra work in the kernel. On later
* GPUs coalescing rules have been relaxed, so this extra overhead hurts
* performance. On these GPUs we set manualCoalesce=false and directly store
* the results.
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size
* (fullBlocks) differently than arrays that are not. "loop" is used when persistent
* CTAs are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[out] outKeys Output of sorted keys
* @param[out] outValues Output of associated values
* @param[in] keys Input of unsorted keys in GPU
* @param[in] values Input of associated input values
* @param[in] blockOffsets The offset address for each block
* @param[in] offsets Address of each radix within each block
* @param[in] sizes Number of elements in a block
* @param[in] numElements Total number of elements
* @param[in] totalBlocks Total number of data blocks to process
*
* @todo Args that are const below should be prototyped as const
**/
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
reorderData(uint *outKeys,
uint *outValues,
uint2 *keys,
uint2 *values,
uint *blockOffsets,
uint *offsets,
uint *sizes,
uint numElements,
uint totalBlocks)
{
__shared__ uint2 sKeys2[SORT_CTA_SIZE];
__shared__ uint2 sValues2[SORT_CTA_SIZE];
__shared__ uint sOffsets[16];
__shared__ uint sBlockOffsets[16];
uint *sKeys1 = (uint*)sKeys2;
uint *sValues1 = (uint*)sValues2;
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint i = blockId * blockDim.x + threadIdx.x;
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && (((i + 1) << 1) > numElements))
{
uint *keys1 = (uint*)keys;
uint *values1 = (uint*)values;
uint j = i << 1;
sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX;
sValues1[threadIdx.x << 1] = (j < numElements) ? values1[j] : UINT_MAX;
j++;
sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX;
sValues1[(threadIdx.x << 1) + 1] = (j < numElements) ? values1[j] : UINT_MAX;
}
else
{
sKeys2[threadIdx.x] = keys[i];
sValues2[threadIdx.x] = values[i];
}
if (!manualCoalesce)
{
if(threadIdx.x < 16)
{
sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
}
__syncthreads();
uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF;
uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix];
if (fullBlocks || globalOffset < numElements)
{
outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x]);
outValues[globalOffset] = sValues1[threadIdx.x];
}
radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF;
globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix];
if (fullBlocks || globalOffset < numElements)
{
outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x + SORT_CTA_SIZE]);
outValues[globalOffset] = sValues1[threadIdx.x + SORT_CTA_SIZE];
}
}
else
{
__shared__ uint sSizes[16];
if(threadIdx.x < 16)
{
sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId];
}
__syncthreads();
// 1 half-warp is responsible for writing out all values for 1 radix.
// Loops if there are more than 16 values to be written out.
// All start indices are rounded down to the nearest multiple of 16, and
// all end indices are rounded up to the nearest multiple of 16.
// Thus it can do extra work if the start and end indices are not multiples of 16
// This is bounded by a factor of 2 (it can do 2X more work at most).
const uint halfWarpID = threadIdx.x >> 4;
const uint halfWarpOffset = threadIdx.x & 0xF;
const uint leadingInvalid = sOffsets[halfWarpID] & 0xF;
uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0;
uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 -
((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF);
uint numIterations = endPos - startPos;
uint outOffset = startPos + halfWarpOffset;
uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset;
for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16)
{
if( (outOffset >= sOffsets[halfWarpID]) &&
(inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID]))
{
if(blockId < totalBlocks - 1 || outOffset < numElements)
{
outKeys[outOffset] = floatUnflip<unflip>(sKeys1[inOffset]);
outValues[outOffset] = sValues1[inOffset];
}
}
}
}
if (loop)
{
blockId += gridDim.x;
__syncthreads();
}
else
break;
}
}
/** @brief Sorts all blocks of data independently in shared memory.
* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements
*
* The radix sort is done in two stages. This stage calls radixSortBlock on each
* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size (fullBlocks)
* differently than arrays that are not. "flip" is used to only compile in the
* float flip code when float keys are used. "loop" is used when persistent CTAs
* are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[out] keysOut Output of sorted keys GPU main memory
* @param[in] keysIn Input of unsorted keys in GPU main memory
* @param[in] numElements Total number of elements to sort
* @param[in] totalBlocks Total number of blocks to sort
*
*/
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
radixSortBlocksKeysOnly(uint4* keysOut, uint4* keysIn, uint numElements, uint totalBlocks)
{
extern __shared__ uint4 sMem[];
uint4 key;
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint i = blockId * blockDim.x + threadIdx.x;
uint idx = i << 2;
// handle non-full last block if array is not multiple of 1024 numElements
if (!fullBlocks && idx+3 >= numElements)
{
if (idx >= numElements)
{
key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
}
else
{
// for non-full block, we handle uint1 values instead of uint4
uint *keys1 = (uint*)keysIn;
key.x = (idx < numElements) ? floatFlip<flip>(keys1[idx]) : UINT_MAX;
key.y = (idx+1 < numElements) ? floatFlip<flip>(keys1[idx+1]) : UINT_MAX;
key.z = (idx+2 < numElements) ? floatFlip<flip>(keys1[idx+2]) : UINT_MAX;
key.w = UINT_MAX;
}
}
else
{
key = keysIn[i];
if (flip)
{
key.x = floatFlip<flip>(key.x);
key.y = floatFlip<flip>(key.y);
key.z = floatFlip<flip>(key.z);
key.w = floatFlip<flip>(key.w);
}
}
__syncthreads();
radixSortBlockKeysOnly<nbits, startbit>(key);
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && idx+3 >= numElements)
{
if (idx < numElements)
{
// for non-full block, we handle uint1 values instead of uint4
uint *keys1 = (uint*)keysOut;
keys1[idx] = key.x;
if (idx + 1 < numElements)
{
keys1[idx + 1] = key.y;
if (idx + 2 < numElements)
{
keys1[idx + 2] = key.z;
}
}
}
}
else
{
keysOut[i] = key;
}
if (loop)
blockId += gridDim.x;
else
break;
}
}
/** @brief Reorders data in the global array.
*
* reorderDataKeysOnly shuffles data in the array globally after the radix offsets
* have been found. On compute version 1.1 and earlier GPUs, this code depends
* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
*
* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
* that all writes are coalesced using extra work in the kernel. On later
* GPUs coalescing rules have been relaxed, so this extra overhead hurts
* performance. On these GPUs we set manualCoalesce=false and directly store
* the results.
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size
* (fullBlocks) differently than arrays that are not. "loop" is used when persistent
* CTAs are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[out] outKeys Output result of reorderDataKeysOnly()
* @param[in] keys Keys to be reordered
* @param[in] blockOffsets Start offset for each block
* @param[in] offsets Offset of each radix within each block
* @param[in] sizes Number of elements in a block
* @param[in] numElements Total number of elements
* @param[in] totalBlocks Total number of blocks
*/
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
reorderDataKeysOnly(uint *outKeys,
uint2 *keys,
uint *blockOffsets,
uint *offsets,
uint *sizes,
uint numElements,
uint totalBlocks)
{
__shared__ uint2 sKeys2[SORT_CTA_SIZE];
__shared__ uint sOffsets[16];
__shared__ uint sBlockOffsets[16];
uint *sKeys1 = (uint*)sKeys2;
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint i = blockId * blockDim.x + threadIdx.x;
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && (((i + 1) << 1) > numElements))
{
uint *keys1 = (uint*)keys;
uint j = i << 1;
sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX;
j++;
sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX;
}
else
{
sKeys2[threadIdx.x] = keys[i];
}
if (!manualCoalesce)
{
if(threadIdx.x < 16)
{
sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
}
__syncthreads();
uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF;
uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix];
if (fullBlocks || globalOffset < numElements)
{
outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x]);
}
radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF;
globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix];
if (fullBlocks || globalOffset < numElements)
{
outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x + SORT_CTA_SIZE]);
}
}
else
{
__shared__ uint sSizes[16];
if(threadIdx.x < 16)
{
sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId];
}
__syncthreads();
// 1 half-warp is responsible for writing out all values for 1 radix.
// Loops if there are more than 16 values to be written out.
// All start indices are rounded down to the nearest multiple of 16, and
// all end indices are rounded up to the nearest multiple of 16.
// Thus it can do extra work if the start and end indices are not multiples of 16
// This is bounded by a factor of 2 (it can do 2X more work at most).
const uint halfWarpID = threadIdx.x >> 4;
const uint halfWarpOffset = threadIdx.x & 0xF;
const uint leadingInvalid = sOffsets[halfWarpID] & 0xF;
uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0;
uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 -
((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF);
uint numIterations = endPos - startPos;
uint outOffset = startPos + halfWarpOffset;
uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset;
for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16)
{
if( (outOffset >= sOffsets[halfWarpID]) &&
(inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID]))
{
if(blockId < totalBlocks - 1 || outOffset < numElements)
{
outKeys[outOffset] = floatUnflip<unflip>(sKeys1[inOffset]);
}
}
}
}
if (loop)
{
blockId += gridDim.x;
__syncthreads();
}
else
break;
}
}
/** @} */ // end radixsort functions
/** @} */ // end cudpp_kernel

View File

@@ -0,0 +1,113 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* scan_kernel.cu
*
* @brief CUDPP kernel-level scan routines
*/
/** \defgroup cudpp_kernel CUDPP Kernel-Level API
* The CUDPP Kernel-Level API contains functions that run on the GPU
* device across a grid of Cooperative Thread Array (CTA, aka Thread
* Block). These kernels are declared \c __global__ so that they
* must be invoked from host (CPU) code. They generally invoke GPU
* \c __device__ routines in the CUDPP \link cudpp_cta CTA-Level API\endlink.
* Kernel-Level API functions are used by CUDPP
* \link cudpp_app Application-Level\endlink functions to implement their
* functionality.
* @{
*/
/** @name Scan Functions
* @{
*/
#include <cudpp_globals.h>
#include "cta/scan_cta.cu"
#include "sharedmem.h"
/**
* @brief Main scan kernel
*
* This __global__ device function performs one level of a multiblock scan on
* an arbitrary-dimensioned array in \a d_in, returning the result in \a d_out
* (which may point to the same array). The same function may be used for
* single or multi-row scans. To perform a multirow scan, pass the width of
 * each row of the input array (in elements) in \a dataRowPitch, and the width of
* the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke
* with a thread block grid with height greater than 1.
*
 * This function performs one level of a recursive, multiblock scan.  At the
* app level, this function is called by cudppScan and cudppMultiScan and used
* in combination with vectorAddUniform4() to produce a complete scan.
*
* Template parameter \a T is the datatype of the array to be scanned.
* Template parameter \a traits is the ScanTraits struct containing
* compile-time options for the scan, such as whether it is forward or
* backward, exclusive or inclusive, multi- or single-row, etc.
*
* @param[out] d_out The output (scanned) array
* @param[in] d_in The input array to be scanned
* @param[out] d_blockSums The array of per-block sums
* @param[in] numElements The number of elements to scan
* @param[in] dataRowPitch The width of each row of \a d_in in elements
* (for multi-row scans)
 * @param[in]  blockSumRowPitch The width of each row of \a d_blockSums in elements
* (for multi-row scans)
*/
template<class T, class traits>
__global__ void scan4(T *d_out,
const T *d_in,
T *d_blockSums,
int numElements,
unsigned int dataRowPitch,
unsigned int blockSumRowPitch)
{
SharedMemory<T> smem;
T* temp = smem.getPointer();
int devOffset, ai, bi, aiDev, biDev;
T threadScan0[4], threadScan1[4];
unsigned int blockN = numElements;
unsigned int blockSumIndex = blockIdx.x;
if (traits::isMultiRow())
{
//int width = __mul24(gridDim.x, blockDim.x) << 1;
int yIndex = __umul24(blockDim.y, blockIdx.y) + threadIdx.y;
devOffset = __umul24(dataRowPitch, yIndex);
blockN += (devOffset << 2);
devOffset += __umul24(blockIdx.x, blockDim.x << 1);
blockSumIndex += __umul24(blockSumRowPitch << 2, yIndex) ;
}
else
{
devOffset = __umul24(blockIdx.x, (blockDim.x << 1));
}
// load data into shared memory
loadSharedChunkFromMem4<T, traits>
(temp, threadScan0, threadScan1, d_in,
blockN, devOffset, ai, bi, aiDev, biDev);
scanCTA<T, traits>(temp, d_blockSums, blockSumIndex);
// write results to device memory
storeSharedChunkToMem4<T, traits>
(d_out, threadScan0, threadScan1, temp,
blockN, devOffset, ai, bi, aiDev, biDev);
}
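/* A sketch of the recursion described above (the real driver is in the CUDPP
 * application-level code; the block/grid sizes, shared-memory bytes, and the
 * scanBlockSums() helper are hypothetical placeholders):
 *
 *   scan4<T, traits><<<numBlocks, CTA_SIZE, smemBytes>>>
 *       (d_out, d_in, d_blockSums, numElements, 0, 0);
 *   if (numBlocks > 1)
 *   {
 *       // recursively scan the per-block sums...
 *       scanBlockSums(d_blockSums, numBlocks);
 *       // ...then add each scanned block sum back into its block's results
 *       vectorAddUniform4<T, CUDPP_ADD, 8><<<numBlocks, CTA_SIZE>>>
 *           (d_out, d_blockSums, numElements, 0, 0, 0, 0);
 *   }
 */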
/** @} */ // end scan functions
/** @} */ // end cudpp_kernel

View File

@@ -0,0 +1,469 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* vector_kernel.cu
*
* @brief CUDA kernel methods for basic operations on vectors.
*
* CUDA kernel methods for basic operations on vectors.
*
* Examples:
* - vectorAddConstant(): d_vector + constant
* - vectorAddUniform(): d_vector + uniform (per-block constants)
* - vectorAddVectorVector(): d_vector + d_vector
*/
// MJH: these functions assume there are 2N elements for N threads.
// Is this always going to be a good idea? There may be cases where
// we have as many threads as elements, but for large problems
// we are probably limited by max CTA size for simple kernels like
// this so we should process multiple elements per thread.
// we may want to extend these with looping versions that process
// many elements per thread.
#include "cudpp_util.h"
#include "sharedmem.h"
#include "cudpp.h"
/** \addtogroup cudpp_kernel
* @{
*/
/** @name Vector Functions
* CUDA kernel methods for basic operations on vectors.
* @{
*/
/** @brief Adds a constant value to all values in the input d_vector
*
* Each thread adds two pairs of elements.
* @todo Test this function -- it is currently not yet used.
*
* @param[in,out] d_vector The array of elements to be modified
* @param[in] constant The constant value to be added to elements of
* \a d_vector
* @param[in] n The number of elements in the d_vector to be modified
* @param[in] baseIndex An optional offset to the beginning of the
* elements in the input array to be processed
*/
template <class T>
__global__ void vectorAddConstant(T *d_vector,
T constant,
int n,
int baseIndex)
{
// Compute this thread's output address
unsigned int address = baseIndex + threadIdx.x +
__mul24(blockIdx.x, (blockDim.x << 1));
// note two adds per thread: one in first half of the block, one in last
d_vector[address] += constant;
d_vector[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * constant;
}
/** @brief Add a uniform value to each data element of an array
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to all values "owned" by the CTA in \a
* d_vector. Each thread adds two pairs of values.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*/
template <class T>
__global__ void vectorAddUniform(T *d_vector,
const T *d_uniforms,
int numElements,
int blockOffset,
int baseIndex)
{
__shared__ T uni;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
uni = d_uniforms[blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset];
}
// Compute this thread's output address
int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __mul24(width, blockIdx.y)
+ threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 1));
__syncthreads();
// note two adds per thread: one in first half of the block, one in last
d_vector[address] += uni;
if (threadIdx.x + blockDim.x < numElements) d_vector[address + blockDim.x] += uni;
}
/** @brief Add a uniform value to each data element of an array (vec4 version)
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to all values "owned" by the CTA in \a d_vector.
* Each thread adds the uniform value to eight values in \a d_vector.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] vectorRowPitch For 2D arrays, the pitch (in elements) of the
* rows of \a d_vector.
* @param[in] uniformRowPitch For 2D arrays, the pitch (in elements) of the
* rows of \a d_uniforms.
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*/
template <class T, CUDPPOperator op, int elementsPerThread>
__global__ void vectorAddUniform4(T *d_vector,
const T *d_uniforms,
int numElements,
int vectorRowPitch, // width of input array in elements
int uniformRowPitch, // width of uniform array in elements
int blockOffset,
int baseIndex)
{
__shared__ T uni;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
uni = d_uniforms[blockIdx.x + __umul24(uniformRowPitch, blockIdx.y) + blockOffset];
}
// Compute this thread's output address
//int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __umul24(vectorRowPitch, blockIdx.y)
+ threadIdx.x + __umul24(blockIdx.x, (blockDim.x * elementsPerThread));
numElements += __umul24(vectorRowPitch, blockIdx.y);
__syncthreads();
switch (op)
{
case CUDPP_ADD:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] += uni;
address += blockDim.x;
}
break;
case CUDPP_MULTIPLY:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] *= uni;
address += blockDim.x;
}
break;
case CUDPP_MAX:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] = max(d_vector[address], uni);
address += blockDim.x;
}
break;
case CUDPP_MIN:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] = min(d_vector[address], uni);
address += blockDim.x;
}
break;
default:
break;
}
}
/** @brief Adds together two vectors
*
* Each thread adds two pairs of elements.
* @todo Test this function -- it is currently not yet used.
*
* @param[out] d_vectorA The left operand array and the result
* @param[in] d_vectorB The right operand array
* @param[in] numElements The number of elements in the vectors to be added.
* @param[in] baseIndex An optional offset to the beginning of the
* elements in the input arrays to be processed
*/
template <class T>
__global__ void vectorAddVector(T *d_vectorA, // A += B
const T *d_vectorB,
int numElements,
int baseIndex)
{
// Compute this thread's output address
unsigned int address = baseIndex + threadIdx.x +
__mul24(blockIdx.x, (blockDim.x << 1));
// note two adds per thread: one in first half of the block, one in last
d_vectorA[address] += d_vectorB[address];
    d_vectorA[address + blockDim.x] +=
        (threadIdx.x + blockDim.x < numElements) * d_vectorB[address + blockDim.x];
}
/** @brief Add a uniform value to data elements of an array (vec4 version)
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to values "owned" by the CTA in \a d_vector.
* The uniform value is added to only those values "owned" by the CTA which
* have an index less than d_maxIndex. If d_maxIndex for that CTA is UINT_MAX
* it adds the uniform to all values "owned" by the CTA.
* Each thread adds the uniform value to eight values in \a d_vector.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] d_maxIndices The array of maximum indices (one per CTA). This is
 *                      the index up to which the uniform is added. If this is UINT_MAX
 *                      the uniform is added to all elements of the CTA. This index is
 *                      1-based.
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*/
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniform4(T *d_vector,
const T *d_uniforms,
const unsigned int *d_maxIndices,
unsigned int numElements,
int blockOffset,
int baseIndex)
{
__shared__ T uni[2];
unsigned int blockAddress =
blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
if (blockAddress > 0)
uni[0] = d_uniforms[blockAddress-1];
else
uni[0] = Operator<T, oper>::identity();
// Tacit assumption that T is four-byte wide
uni[1] = (T)(d_maxIndices[blockAddress]);
}
// Compute this thread's output address
int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __mul24(width, blockIdx.y)
+ threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));
__syncthreads();
unsigned int maxIndex = (unsigned int)(uni[1]);
bool isLastBlock = (blockIdx.x == (gridDim.x-1));
if (maxIndex < UINT_MAX)
{
// Since maxIndex is a 1 based index
--maxIndex;
bool leftLess = address < maxIndex;
bool rightLess = (address + 7 * blockDim.x) < maxIndex;
if (leftLess)
{
if (rightLess)
{
for (unsigned int i = 0; i < 8; ++i)
d_vector[address + i * blockDim.x] =
Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
}
else
{
for (unsigned int i=0; i < 8; ++i)
{
if (address < maxIndex)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
else
{
if (!isLastBlockFull && isLastBlock)
{
for (unsigned int i = 0; i < 8; ++i)
{
if (address < numElements)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
else
{
for (unsigned int i=0; i<8; ++i)
{
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
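/* A small worked example of the 1-based d_maxIndices convention (assuming
 * baseIndex == 0 and blockDim.x == 256, so each CTA owns 8*256 = 2048
 * consecutive elements): if d_maxIndices[blockAddress] == 100, the operator is
 * applied only to the CTA's elements whose global address is below 99, i.e.
 * 0-based indices 0..98; if it is UINT_MAX, every element the CTA owns is
 * updated (subject to the numElements bound in a partial last block).
 */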
/** @brief Add a uniform value to data elements of an array (vec4 version)
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to values "owned" by the CTA in \a d_vector.
* The uniform value is added to only those values "owned" by the CTA which
* have an index greater than d_minIndex. If d_minIndex for that CTA is 0
* it adds the uniform to all values "owned" by the CTA.
* Each thread adds the uniform value to eight values in \a d_vector.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] d_minIndices The array of minimum indices (one per CTA). The
 *                      uniform is added to the right of this index (that is, to every
 *                      index greater than this index). If this is 0, the uniform is
 *                      added to all elements of the CTA. The index is 1-based to avoid
 *                      overloading the meaning of 0: a value of 0 means that no flag is
 *                      present, whereas with 0-based indexing a flag on the first
 *                      element of a CTA would also yield 0; with 1-based indexing that
 *                      case yields 1 instead.
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*
*/
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniformToRight4(T *d_vector,
const T *d_uniforms,
const unsigned int *d_minIndices,
unsigned int numElements,
int blockOffset,
int baseIndex)
{
__shared__ T uni[2];
unsigned int blockAddress =
blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
// FIXME - blockAddress test here is incompatible with how it is calculated
// above
if (blockAddress < (gridDim.x-1))
uni[0] = d_uniforms[blockAddress+1];
else
uni[0] = Operator<T, oper>::identity();
// Tacit assumption that T is four bytes wide
uni[1] = (T)(d_minIndices[blockAddress]);
}
// Compute this thread's output address
int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __mul24(width, blockIdx.y)
+ threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));
__syncthreads();
unsigned int minIndex = (unsigned int)(uni[1]);
bool isLastBlock = (blockIdx.x == (gridDim.x-1));
if (minIndex > 0)
{
// Since minIndex is a 1-based index
--minIndex;
bool leftInRange = address > minIndex;
bool rightInRange = (address + 7 * blockDim.x) > minIndex;
if (rightInRange)
{
if (leftInRange)
{
for (unsigned int i = 0; i < 8; ++i)
d_vector[address + i * blockDim.x] =
Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
}
else
{
for (unsigned int i=0; i < 8; ++i)
{
if (address > minIndex)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
else
{
if (!isLastBlockFull && isLastBlock)
{
for (unsigned int i = 0; i < 8; ++i)
{
if (address < numElements)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
else
{
for (unsigned int i=0; i<8; ++i)
{
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
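/* Illustrative sketch (not part of the original CUDPP sources): a plain CPU
   reference for the 1-based index convention used by the segmented
   add-uniform kernels above. It only demonstrates the minIndex handling of
   the "ToRight" variant for the elements one CTA "owns"; the real kernel
   additionally picks the uniform from a neighbouring CTA's slot and unrolls
   the work eight elements per thread. The function and variable names here
   are hypothetical. */
#include <cstddef>
#include <vector>

// Add 'uniform' to every element of v in [ctaBegin, ctaEnd) that lies strictly
// to the right of the flagged position. minIndex1 is 1-based; 0 means the CTA
// has no flag, in which case all of the CTA's elements receive the uniform.
static void segmentedAddUniformToRightRef(std::vector<float> &v,
                                          std::size_t ctaBegin,
                                          std::size_t ctaEnd,
                                          unsigned int minIndex1,
                                          float uniform)
{
    std::size_t first = ctaBegin;
    if (minIndex1 > 0) {
        std::size_t flagged = minIndex1 - 1;   // convert to 0-based
        first = flagged + 1;                   // strictly to the right of the flag
        if (first < ctaBegin) first = ctaBegin;
    }
    for (std::size_t i = first; i < ctaEnd; ++i)
        v[i] += uniform;
}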
/** @} */ // end d_vector functions
/** @} */ // end cudpp_kernel

View File

@ -0,0 +1,25 @@
Copyright (c) 2007-2010 The Regents of the University of California, Davis
campus ("The Regents") and NVIDIA Corporation ("NVIDIA"). All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the The Regents, nor NVIDIA, nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,993 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* radixsort_app.cu
*
* @brief CUDPP application-level radix sorting routines
*/
/** @addtogroup cudpp_app
* @{
*/
/** @name RadixSort Functions
* @{
*/
#include "cudpp.h"
#include "cudpp_util.h"
#include "cudpp_radixsort.h"
#include "cudpp_scan.h"
#include "kernel/radixsort_kernel.cu"
#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <assert.h>
typedef unsigned int uint;
/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step,
* starting at startbit.
*
* Uses cudppScanDispatch() for the prefix sum of radix counters.
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
**/
template<uint nbits, uint startbit, bool flip, bool unflip>
void radixSortStep(uint *keys,
uint *values,
const CUDPPRadixSortPlan *plan,
uint numElements)
{
const uint eltsPerBlock = SORT_CTA_SIZE * 4;
const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
bool fullBlocks = ((numElements % eltsPerBlock) == 0);
uint numBlocks = (fullBlocks) ?
(numElements / eltsPerBlock) :
(numElements / eltsPerBlock + 1);
uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
(numElements / eltsPerBlock2) :
(numElements / eltsPerBlock2 + 1);
bool loop = numBlocks > 65535;
uint blocks = loop ? 65535 : numBlocks;
uint blocksFind = loop ? 65535 : numBlocks2;
uint blocksReorder = loop ? 65535 : numBlocks2;
uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0];
bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
if (persist)
{
loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
blocks = numBlocks;
blocksFind = numBlocks2;
blocksReorder = numBlocks2;
// Run an empty kernel -- this seems to reset some of the CTA scheduling hardware
// on GT200, resulting in better scheduling and lower run times
if (startbit > 0)
{
emptyKernel<<<numCTAs(emptyKernel), SORT_CTA_SIZE>>>();
}
}
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocks = flip? numCTAs(radixSortBlocks<4, 0, true, true, true>) :
numCTAs(radixSortBlocks<4, 0, true, false, true>);
}
radixSortBlocks<nbits, startbit, true, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
else
{
radixSortBlocks<nbits, startbit, true, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
}
else
{
if (loop)
{
if (persist)
{
blocks = flip ? numCTAs(radixSortBlocks<4, 0, false, true, true>) :
numCTAs(radixSortBlocks<4, 0, false, false, true>);
}
radixSortBlocks<nbits, startbit, false, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
else
{
radixSortBlocks<nbits, startbit, false, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
}
CUT_CHECK_ERROR("radixSortBlocks");
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, true, true>);
}
findRadixOffsets<startbit, true, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
{
findRadixOffsets<startbit, true, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
}
else
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, false, true>);
}
findRadixOffsets<startbit, false, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
{
findRadixOffsets<startbit, false, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
}
CUT_CHECK_ERROR("findRadixOffsets");
cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
if (fullBlocks)
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) :
numCTAs(reorderData<0, true, true, false, true>);
}
reorderData<startbit, true, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, true, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ? numCTAs(reorderData<0, true, false, true, true>) :
numCTAs(reorderData<0, true, false, false, true>);
}
reorderData<startbit, true, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, true, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
}
else
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderData<0, false, true, true, true>) :
numCTAs(reorderData<0, false, true, false, true>);
}
reorderData<startbit, false, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, false, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderData<0, false, false, true, true>) :
numCTAs(reorderData<0, false, false, false, true>);
}
reorderData<startbit, false, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, false, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
}
CUT_CHECK_ERROR("radixSortStep");
}
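/* Illustrative sketch (not part of the original CUDPP sources): a serial CPU
   version of one LSD radix pass, to show what the three GPU phases above
   accomplish together -- radixSortBlocks/findRadixOffsets build a per-digit
   histogram, cudppScanDispatch turns it into exclusive offsets, and
   reorderData performs the stable scatter. Names below are hypothetical. */
#include <cstddef>
#include <cstdint>
#include <vector>

static void radixPassRef(std::vector<uint32_t> &keys,
                         std::vector<uint32_t> &values,
                         unsigned int startbit, unsigned int nbits)
{
    const std::size_t n = keys.size();
    const uint32_t radix = 1u << nbits;
    const uint32_t mask  = radix - 1u;

    // Histogram of the current digit (analogue of the radix counters).
    std::vector<std::size_t> count(radix, 0);
    for (std::size_t i = 0; i < n; ++i)
        ++count[(keys[i] >> startbit) & mask];

    // Exclusive prefix sum of the histogram (analogue of cudppScanDispatch).
    std::vector<std::size_t> offset(radix, 0);
    for (uint32_t d = 1; d < radix; ++d)
        offset[d] = offset[d - 1] + count[d - 1];

    // Stable scatter into the output (analogue of reorderData).
    std::vector<uint32_t> outKeys(n), outValues(n);
    for (std::size_t i = 0; i < n; ++i) {
        const uint32_t d = (keys[i] >> startbit) & mask;
        outKeys[offset[d]]   = keys[i];
        outValues[offset[d]] = values[i];
        ++offset[d];
    }
    keys.swap(outKeys);
    values.swap(outValues);
}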
/**
* @brief Single-block optimization for sorts of fewer than 4 * SORT_CTA_SIZE elements
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param numElements Number of elements in the sort.
**/
template <bool flip>
void radixSortSingleBlock(uint *keys,
uint *values,
uint numElements)
{
bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
if (fullBlocks)
{
radixSortBlocks<32, 0, true, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)values,
(uint4*)keys, (uint4*)values,
numElements, 0);
}
else
{
radixSortBlocks<32, 0, false, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)values,
(uint4*)keys, (uint4*)values,
numElements, 0);
}
if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
CUT_CHECK_ERROR("radixSortSingleBlock");
}
/**
* @brief Main radix sort function
*
* Main radix sort function. Sorts in place in the keys and values arrays,
* but uses the other device arrays as temporary storage. All pointer
* parameters are device pointers. Uses cudppScan() for the prefix sum of
* radix counters.
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
* @param[in]  flipBits  Set to true if the key datatype is float (and may
*                 contain negative numbers), enabling the special float
*                 sorting transform.
* @param[in] keyBits Number of interesting bits in the key
**/
void radixSort(uint *keys,
uint* values,
const CUDPPRadixSortPlan *plan,
size_t numElements,
bool flipBits,
int keyBits)
{
if(numElements <= WARP_SIZE)
{
if (flipBits)
radixSortSingleWarp<true><<<1, numElements>>>
(keys, values, numElements);
else
radixSortSingleWarp<false><<<1, numElements>>>
(keys, values, numElements);
CUT_CHECK_ERROR("radixSortSingleWarp");
return;
}
#ifdef __DEVICE_EMULATION__
printf("bits: %d\n", keyBits);
#endif
if(numElements <= SORT_CTA_SIZE * 4)
{
if (flipBits)
radixSortSingleBlock<true>(keys, values, numElements);
else
radixSortSingleBlock<false>(keys, values, numElements);
return;
}
// flip float bits on the first pass, unflip on the last pass
if (flipBits)
{
radixSortStep<4, 0, true, false>
(keys, values, plan, numElements);
}
else
{
radixSortStep<4, 0, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 4)
{
radixSortStep<4, 4, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 8)
{
radixSortStep<4, 8, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 12)
{
radixSortStep<4, 12, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 16)
{
radixSortStep<4, 16, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 20)
{
radixSortStep<4, 20, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 24)
{
radixSortStep<4, 24, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 28)
{
if (flipBits) // last pass
{
radixSortStep<4, 28, false, true>
(keys, values, plan, numElements);
}
else
{
radixSortStep<4, 28, false, false>
(keys, values, plan, numElements);
}
}
}
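/* Illustrative sketch (not part of the original CUDPP sources): the standard
   order-preserving bit transform behind the flipBits/unflip template
   arguments used above. The actual device-side flip/unflip code lives in
   kernel/radixsort_kernel.cu (not shown here); these host helpers only
   document the idea: flipping the sign bit of non-negative floats and all
   bits of negative floats makes IEEE-754 float keys sort correctly as
   unsigned integers. Function names are hypothetical. */
static inline unsigned int floatFlipRef(unsigned int f)
{
    // Negative floats: flip every bit; non-negative floats: flip the sign bit.
    unsigned int mask = (f & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
    return f ^ mask;
}

static inline unsigned int floatUnflipRef(unsigned int f)
{
    // Inverse of floatFlipRef, applied after the final sorting pass.
    unsigned int mask = (f & 0x80000000u) ? 0x80000000u : 0xFFFFFFFFu;
    return f ^ mask;
}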
/**
* @brief Wrapper to call main radix sort function. For float configuration.
*
* Calls the main radix sort function. For float configuration.
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
* @param[in]  negativeKeys Set to true if the key datatype can contain negative numbers.
* @param[in] keyBits Number of interesting bits in the key
**/
extern "C"
void radixSortFloatKeys(float* keys,
uint* values,
const CUDPPRadixSortPlan *plan,
size_t numElements,
bool negativeKeys,
int keyBits)
{
radixSort((uint*)keys, (uint*)values, plan,
numElements, negativeKeys, keyBits);
}
/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step,
* starting at startbit.
*
* @param[in,out] keys Keys to be sorted.
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
**/
template<uint nbits, uint startbit, bool flip, bool unflip>
void radixSortStepKeysOnly(uint *keys,
const CUDPPRadixSortPlan *plan,
uint numElements)
{
const uint eltsPerBlock = SORT_CTA_SIZE * 4;
const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
bool fullBlocks = ((numElements % eltsPerBlock) == 0);
uint numBlocks = (fullBlocks) ?
(numElements / eltsPerBlock) :
(numElements / eltsPerBlock + 1);
uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
(numElements / eltsPerBlock2) :
(numElements / eltsPerBlock2 + 1);
bool loop = numBlocks > 65535;
uint blocks = loop ? 65535 : numBlocks;
uint blocksFind = loop ? 65535 : numBlocks2;
uint blocksReorder = loop ? 65535 : numBlocks2;
uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1];
bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
if (persist)
{
loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
blocks = numBlocks;
blocksFind = numBlocks2;
blocksReorder = numBlocks2;
}
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) :
numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>);
}
radixSortBlocksKeysOnly<nbits, startbit, true, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
else
radixSortBlocksKeysOnly<nbits, startbit, true, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
else
{
if (loop)
{
if (persist)
{
blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) :
numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>);
}
radixSortBlocksKeysOnly<nbits, startbit, false, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
else
radixSortBlocksKeysOnly<nbits, startbit, false, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, true, true>);
}
findRadixOffsets<startbit, true, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
findRadixOffsets<startbit, true, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, false, true>);
}
findRadixOffsets<startbit, false, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
findRadixOffsets<startbit, false, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
if (fullBlocks)
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, true, true, true, true>) :
numCTAs(reorderDataKeysOnly<0, true, true, false, true>);
}
reorderDataKeysOnly<startbit, true, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, true, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, true, false, true, true>) :
numCTAs(reorderDataKeysOnly<0, true, false, false, true>);
}
reorderDataKeysOnly<startbit, true, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, true, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
}
else
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, false, true, true, true>) :
numCTAs(reorderDataKeysOnly<0, false, true, false, true>);
}
reorderDataKeysOnly<startbit, false, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, false, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, false, false, true, true>) :
numCTAs(reorderDataKeysOnly<0, false, false, false, true>);
}
reorderDataKeysOnly<startbit, false, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, false, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
}
CUT_CHECK_ERROR("radixSortStepKeysOnly");
}
/**
* @brief Optimization for sorts of fewer than 4 * SORT_CTA_SIZE elements (keys only).
*
* @param[in,out] keys Keys to be sorted.
* @param numElements Number of elements in the sort.
**/
template <bool flip>
void radixSortSingleBlockKeysOnly(uint *keys,
uint numElements)
{
bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
if (fullBlocks)
{
radixSortBlocksKeysOnly<32, 0, true, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)keys, numElements, 1 );
}
else
{
radixSortBlocksKeysOnly<32, 0, false, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)keys, numElements, 1 );
}
if (flip)
unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
CUT_CHECK_ERROR("radixSortSingleBlock");
}
/**
* @brief Main radix sort function. For keys only configuration.
*
* Main radix sort function. Sorts in place in the keys array,
* but uses the other device arrays as temporary storage. All pointer
* parameters are device pointers. Uses scan for the prefix sum of
* radix counters.
*
* @param[in,out] keys Keys to be sorted.
* @param[in] plan Configuration information for RadixSort.
* @param[in]  flipBits  Set to true if the key datatype is float (and may
*               contain negative numbers), enabling the special float
*               sorting transform.
* @param[in] numElements Number of elements in the sort.
* @param[in] keyBits Number of interesting bits in the key
**/
extern "C"
void radixSortKeysOnly(uint *keys,
const CUDPPRadixSortPlan *plan,
bool flipBits,
size_t numElements,
int keyBits)
{
if(numElements <= WARP_SIZE)
{
if (flipBits)
radixSortSingleWarpKeysOnly<true><<<1, numElements>>>(keys, numElements);
else
radixSortSingleWarpKeysOnly<false><<<1, numElements>>>(keys, numElements);
return;
}
if(numElements <= SORT_CTA_SIZE * 4)
{
if (flipBits)
radixSortSingleBlockKeysOnly<true>(keys, numElements);
else
radixSortSingleBlockKeysOnly<false>(keys, numElements);
return;
}
// flip float bits on the first pass, unflip on the last pass
if (flipBits)
{
radixSortStepKeysOnly<4, 0, true, false>(keys, plan, numElements);
}
else
{
radixSortStepKeysOnly<4, 0, false, false>(keys, plan, numElements);
}
if (keyBits > 4)
{
radixSortStepKeysOnly<4, 4, false, false>(keys, plan, numElements);
}
if (keyBits > 8)
{
radixSortStepKeysOnly<4, 8, false, false>(keys, plan, numElements);
}
if (keyBits > 12)
{
radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements);
}
if (keyBits > 16)
{
radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements);
}
if (keyBits > 20)
{
radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements);
}
if (keyBits > 24)
{
radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements);
}
if (keyBits > 28)
{
if (flipBits) // last pass
{
radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements);
}
else
{
radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements);
}
}
}
/**
* @brief Wrapper to call main radix sort function. For floats and keys only.
*
* Calls the radixSortKeysOnly function setting parameters for floats.
*
* @param[in,out] keys Keys to be sorted.
* @param[in] plan Configuration information for RadixSort.
* @param[in]  negativeKeys Set to true to pass flipBits=true to
*              radixSortKeysOnly(), i.e. the float keys may be negative.
* @param[in] numElements Number of elements in the sort.
* @param[in] keyBits Number of interesting bits in the key
**/
extern "C"
void radixSortFloatKeysOnly(float *keys,
const CUDPPRadixSortPlan *plan,
bool negativeKeys,
size_t numElements,
int keyBits)
{
radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits);
}
extern "C"
void initDeviceParameters(CUDPPRadixSortPlan *plan)
{
int deviceID = -1;
if (cudaSuccess == cudaGetDevice(&deviceID))
{
cudaDeviceProp devprop;
cudaGetDeviceProperties(&devprop, deviceID);
int smVersion = devprop.major * 10 + devprop.minor;
// sm_12 and later devices don't need help with coalescing in the reorderData kernel
plan->m_bManualCoalesce = (smVersion < 12);
// sm_20 and later devices are better off not using persistent CTAs
plan->m_bUsePersistentCTAs = (smVersion < 20);
if (plan->m_bUsePersistentCTAs)
{
// The following is only true on pre-sm_20 devices (pre-Fermi):
// Empirically we have found that for some (usually larger) sort
// sizes it is better to use exactly as many "persistent" CTAs
// as can fill the GPU, which loop over the "blocks" of work. For smaller
// arrays it is better to use the typical CUDA approach of launching one CTA
// per block of work.
// 0-element of these two-element arrays is for key-value sorts
// 1-element is for key-only sorts
plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 16777216 : 524288;
plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152: 524288;
plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608;
plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152: 0;
// create a map of function pointers to register counts for more accurate occupancy calculation
// Must pass in the dynamic shared memory used by each kernel, since the runtime doesn't know it
// Note we only insert the "loop" version of the kernels (the one with the last template param = true),
// because those are the only ones that require persistent CTAs that maximally fill the device.
computeNumCTAs(radixSortBlocks<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocks<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocks<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocks<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(findRadixOffsets<0, false, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(findRadixOffsets<0, true, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(emptyKernel, 0, SORT_CTA_SIZE);
}
}
}
/**
* @brief From the programmer-specified sort configuration,
* creates internal memory for performing the sort.
*
* @param[in] plan Pointer to CUDPPRadixSortPlan object
**/
extern "C"
void allocRadixSortStorage(CUDPPRadixSortPlan *plan)
{
unsigned int numElements = plan->m_numElements;
unsigned int numBlocks =
((numElements % (SORT_CTA_SIZE * 4)) == 0) ?
(numElements / (SORT_CTA_SIZE * 4)) :
(numElements / (SORT_CTA_SIZE * 4) + 1);
switch(plan->m_config.datatype)
{
case CUDPP_UINT:
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
numElements * sizeof(unsigned int)));
if (!plan->m_bKeysOnly)
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
numElements * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
WARP_SIZE * numBlocks * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
WARP_SIZE * numBlocks * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
WARP_SIZE * numBlocks * sizeof(unsigned int)));
break;
case CUDPP_FLOAT:
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
numElements * sizeof(float)));
if (!plan->m_bKeysOnly)
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
numElements * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
WARP_SIZE * numBlocks * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
WARP_SIZE * numBlocks * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
WARP_SIZE * numBlocks * sizeof(float)));
break;
}
initDeviceParameters(plan);
}
/** @brief Deallocates intermediate memory from allocRadixSortStorage.
*
*
* @param[in] plan Pointer to CUDPPRadixSortPlan object
**/
extern "C"
void freeRadixSortStorage(CUDPPRadixSortPlan* plan)
{
CUDA_SAFE_CALL( cudaFree(plan->m_tempKeys));
CUDA_SAFE_CALL( cudaFree(plan->m_tempValues));
CUDA_SAFE_CALL( cudaFree(plan->m_counters));
CUDA_SAFE_CALL( cudaFree(plan->m_countersSum));
CUDA_SAFE_CALL( cudaFree(plan->m_blockOffsets));
}
/** @brief Dispatch function to perform a sort on an array with
* a specified configuration.
*
* This is the dispatch routine which calls radixSort...() with
* appropriate template parameters and arguments as specified by
* the plan.
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] numElements Number of elements in the sort.
* @param[in] keyBits Number of interesting bits in the key
* @param[in] plan Configuration information for RadixSort.
**/
extern "C"
void cudppRadixSortDispatch(void *keys,
void *values,
size_t numElements,
int keyBits,
const CUDPPRadixSortPlan *plan)
{
if(plan->m_bKeysOnly)
{
switch(plan->m_config.datatype)
{
case CUDPP_UINT:
radixSortKeysOnly((uint*)keys, plan, false,
numElements, keyBits);
break;
case CUDPP_FLOAT:
radixSortFloatKeysOnly((float*)keys, plan, true,
numElements, keyBits);
}
}
else
{
switch(plan->m_config.datatype)
{
case CUDPP_UINT:
radixSort((uint*)keys, (uint*) values, plan,
numElements, false, keyBits);
break;
case CUDPP_FLOAT:
radixSortFloatKeys((float*)keys, (uint*) values, plan,
numElements, true, keyBits);
}
}
}
/** @} */ // end radixsort functions
/** @} */ // end cudpp_app

View File

@ -0,0 +1,771 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* scan_app.cu
*
* @brief CUDPP application-level scan routines
*/
/** \defgroup cudpp_app CUDPP Application-Level API
* The CUDPP Application-Level API contains functions
* that run on the host CPU and invoke GPU routines in
* the CUDPP \link cudpp_kernel Kernel-Level API\endlink.
* Application-Level API functions are used by
* CUDPP \link publicInterface Public Interface\endlink
* functions to implement CUDPP's core functionality.
* @{
*/
/** @name Scan Functions
* @{
*/
#include "cudpp.h"
#include "cudpp_util.h"
#include "cudpp_plan.h"
#include "kernel/scan_kernel.cu"
#include "kernel/vector_kernel.cu"
#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <assert.h>
/** @brief Perform recursive scan on arbitrary size arrays
*
* This is the CPU-side workhorse function of the scan engine. This function
* invokes the CUDA kernels which perform the scan on individual blocks.
*
* Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans,
* where each block is scanned by a single CUDA thread block. At each recursive level of the
* scanArrayRecursive first invokes a kernel to scan all blocks of that level, and if the level
* has more than one block, it calls itself recursively. On returning from each recursive level,
* the total sum of each block from the level below is added to all elements of the corresponding
* block in this level. See "Parallel Prefix Sum (Scan) in CUDA" for more information (see
* \ref references ).
*
* Template parameter \a T is the datatype; \a isBackward specifies backward or forward scan;
* \a isExclusive specifies exclusive or inclusive scan, and \a op specifies the binary associative
* operator to be used.
*
* @param[out] d_out The output array for the scan results
* @param[in] d_in The input array to be scanned
* @param[out] d_blockSums Array of arrays of per-block sums (one array per recursive level, allocated
* by allocScanStorage())
* @param[in] numElements The number of elements in the array to scan
* @param[in] numRows The number of rows in the array to scan
* @param[in] rowPitches Array of row pitches (one array per recursive level, allocated by
* allocScanStorage())
* @param[in] level The current recursive level of the scan
*/
template <class T, bool isBackward, bool isExclusive, CUDPPOperator op>
void scanArrayRecursive(T *d_out,
const T *d_in,
T **d_blockSums,
size_t numElements,
size_t numRows,
const size_t *rowPitches,
int level)
{
unsigned int numBlocks =
max(1, (unsigned int)ceil((double)numElements / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
unsigned int sharedEltsPerBlock = CTA_SIZE * 2;
unsigned int sharedMemSize = sizeof(T) * sharedEltsPerBlock;
// divide pitch by four since scan's load/store addresses are for vec4 elements
unsigned int rowPitch = 1;
unsigned int blockSumRowPitch = 1;
if (numRows > 1)
{
rowPitch = rowPitches[level] / 4;
blockSumRowPitch = (numBlocks > 1) ? rowPitches[level+1] / 4 : 0;
}
bool fullBlock = (numElements == numBlocks * SCAN_ELTS_PER_THREAD * CTA_SIZE);
// setup execution parameters
dim3 grid(numBlocks, numRows, 1);
dim3 threads(CTA_SIZE, 1, 1);
// make sure there are no CUDA errors before we start
CUT_CHECK_ERROR("scanArray before kernels");
unsigned int traitsCode = 0;
if (numBlocks > 1) traitsCode |= 1;
if (numRows > 1) traitsCode |= 2;
if (fullBlock) traitsCode |= 4;
switch (traitsCode)
{
case 0: // single block, single row, non-full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, false, false, false> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
break;
case 1: // multiblock, single row, non-full block
scan4< T, ScanTraits<T, op, isBackward, isExclusive, false, true, false> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
break;
case 2: // single block, multirow, non-full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, false, false> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
break;
case 3: // multiblock, multirow, non-full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, true, false> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
break;
case 4: // single block, single row, full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, false, false, true> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
break;
case 5: // multiblock, single row, full block
scan4< T, ScanTraits<T, op, isBackward, isExclusive, false, true, true> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
break;
case 6: // single block, multirow, full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, false, true> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
break;
case 7: // multiblock, multirow, full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, true, true> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
break;
}
CUT_CHECK_ERROR("prescan");
if (numBlocks > 1)
{
// After scanning all the sub-blocks, we are mostly done. But
// now we need to take all of the last values of the
// sub-blocks and scan those. This will give us a new value
// that must be added to each block to get the final results.
scanArrayRecursive<T, isBackward, true, op>
((T*)d_blockSums[level], (const T*)d_blockSums[level],
(T**)d_blockSums, numBlocks, numRows, rowPitches, level + 1); // recursive (CPU) call
vectorAddUniform4<T, op, SCAN_ELTS_PER_THREAD>
<<< grid, threads >>>(d_out,
(T*)d_blockSums[level],
numElements,
rowPitch*4,
blockSumRowPitch*4,
0, 0);
CUT_CHECK_ERROR("vectorAddUniform");
}
}
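/* Illustrative sketch (not part of the original CUDPP sources): a serial CPU
   exclusive add-scan, i.e. the result scanArrayRecursive produces for the
   (forward, exclusive, CUDPP_ADD) configuration on a single row. Handy as a
   reference when checking GPU results; the name is hypothetical. */
#include <cstddef>

static void exclusiveAddScanRef(const float *in, float *out, std::size_t n)
{
    float runningSum = 0.0f;
    for (std::size_t i = 0; i < n; ++i)
    {
        out[i] = runningSum;   // each output gets the sum of all earlier inputs
        runningSum += in[i];
    }
}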
// global
#ifdef __cplusplus
extern "C"
{
#endif
/** @brief Allocate intermediate arrays used by scan.
*
* Scans of large arrays must be split (possibly recursively) into a hierarchy
* of block scans, where each block is scanned by a single CUDA thread block.
* At each recursive level of the scan, we need an array in which to store the
* total sums of all blocks in that level. This function computes the amount
* of storage needed and allocates it.
*
* @param plan Pointer to CUDPPScanPlan object containing options and number
* of elements, which is used to compute storage requirements, and
* within which intermediate storage is allocated.
*/
void allocScanStorage(CUDPPScanPlan *plan)
{
//assert(config->_numEltsAllocated == 0); // shouldn't be called
plan->m_numEltsAllocated = plan->m_numElements;
size_t numElts = plan->m_numElements;
size_t level = 0;
do
{
size_t numBlocks =
max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
if (numBlocks > 1)
{
level++;
}
numElts = numBlocks;
} while (numElts > 1);
size_t elementSize = 0;
switch(plan->m_config.datatype)
{
case CUDPP_INT:
plan->m_blockSums = (void**) malloc(level * sizeof(int*));
elementSize = sizeof(int);
break;
case CUDPP_UINT:
plan->m_blockSums = (void**) malloc(level * sizeof(unsigned int*));
elementSize = sizeof(unsigned int);
break;
case CUDPP_FLOAT:
plan->m_blockSums = (void**) malloc(level * sizeof(float*));
elementSize = sizeof(float);
break;
default:
break;
}
plan->m_numLevelsAllocated = level;
numElts = plan->m_numElements;
size_t numRows = plan->m_numRows;
plan->m_numRowsAllocated = numRows;
plan->m_rowPitches = 0;
if (numRows > 1)
{
plan->m_rowPitches = (size_t*) malloc((level + 1) * sizeof(size_t));
plan->m_rowPitches[0] = plan->m_rowPitch;
}
level = 0;
do
{
size_t numBlocks =
max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
if (numBlocks > 1)
{
// Use cudaMallocPitch for multi-row block sums to ensure alignment
if (numRows > 1)
{
size_t dpitch;
CUDA_SAFE_CALL( cudaMallocPitch((void**) &(plan->m_blockSums[level]),
&dpitch,
numBlocks * elementSize,
numRows));
plan->m_rowPitches[level+1] = dpitch / elementSize;
level++;
}
else
{
CUDA_SAFE_CALL(cudaMalloc((void**) &(plan->m_blockSums[level++]),
numBlocks * elementSize));
}
}
numElts = numBlocks;
} while (numElts > 1);
CUT_CHECK_ERROR("allocScanStorage");
}
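/* Illustrative sketch (not part of the original CUDPP sources): the same
   level computation as the loops above, written as a standalone helper.
   Assuming the usual CUDPP configuration of SCAN_ELTS_PER_THREAD * CTA_SIZE
   = 1024 elements per block, e.g. 1,000,000 input elements need one
   intermediate block-sums array of 977 entries, and scanning those 977 sums
   fits in a single block, so only one level is allocated. */
#include <cstddef>
#include <vector>

static std::vector<std::size_t> blockSumLevelSizes(std::size_t numElements,
                                                   std::size_t eltsPerBlock)
{
    std::vector<std::size_t> sizes;   // one entry per allocated block-sums array
    std::size_t n = numElements;
    do {
        std::size_t numBlocks = (n + eltsPerBlock - 1) / eltsPerBlock;
        if (numBlocks > 1)
            sizes.push_back(numBlocks);
        n = numBlocks;
    } while (n > 1);
    return sizes;
}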
/** @brief Deallocate intermediate block sums arrays in a CUDPPScanPlan object.
*
* These arrays must have been allocated by allocScanStorage(), which is called
* by the constructor of cudppScanPlan().
*
* @param plan Pointer to CUDPPScanPlan object initialized by allocScanStorage().
*/
void freeScanStorage(CUDPPScanPlan *plan)
{
for (unsigned int i = 0; i < plan->m_numLevelsAllocated; i++)
{
cudaFree(plan->m_blockSums[i]);
}
CUT_CHECK_ERROR("freeScanStorage");
free((void**)plan->m_blockSums);
if (plan->m_numRows > 1)
free((void*)plan->m_rowPitches);
plan->m_blockSums = 0;
plan->m_numEltsAllocated = 0;
plan->m_numLevelsAllocated = 0;
}
/** @brief Dispatch function to perform a scan (prefix sum) on an
* array with the specified configuration.
*
* This is the dispatch routine which calls scanArrayRecursive() with
* appropriate template parameters and arguments to achieve the scan as
* specified in \a plan.
*
* @param[out] d_out The output array of scan results
* @param[in] d_in The input array
* @param[in] numElements The number of elements to scan
* @param[in] numRows The number of rows to scan in parallel
* @param[in] plan Pointer to CUDPPScanPlan object containing scan options
* and intermediate storage
*/
void cudppScanDispatch(void *d_out,
const void *d_in,
size_t numElements,
size_t numRows,
const CUDPPScanPlan *plan)
{
if (CUDPP_OPTION_EXCLUSIVE & plan->m_config.options)
{
if (CUDPP_OPTION_BACKWARD & plan->m_config.options)
{
switch (plan->m_config.datatype)
{
case CUDPP_INT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<int, true, true, CUDPP_ADD>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<int, true, true, CUDPP_MULTIPLY>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<int, true, true, CUDPP_MAX>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<int, true, true, CUDPP_MIN>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_UINT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<unsigned int, true, true, CUDPP_ADD>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<unsigned int, true, true, CUDPP_MULTIPLY>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<unsigned int, true, true, CUDPP_MAX>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<unsigned int, true, true, CUDPP_MIN>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_FLOAT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<float, true, true, CUDPP_ADD>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<float, true, true, CUDPP_MULTIPLY>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<float, true, true, CUDPP_MAX>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<float, true, true, CUDPP_MIN>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
default:
break;
}
}
else
{
switch (plan->m_config.datatype)
{
case CUDPP_INT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<int, false, true, CUDPP_ADD>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<int, false, true, CUDPP_MULTIPLY>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<int, false, true, CUDPP_MAX>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<int, false, true, CUDPP_MIN>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_UINT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<unsigned int, false, true, CUDPP_ADD>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<unsigned int, false, true, CUDPP_MULTIPLY>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<unsigned int, false, true, CUDPP_MAX>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<unsigned int, false, true, CUDPP_MIN>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_FLOAT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<float, false, true, CUDPP_ADD>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<float, false, true, CUDPP_MULTIPLY>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<float, false, true, CUDPP_MAX>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<float, false, true, CUDPP_MIN>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
default:
break;
}
}
}
else
{
if (CUDPP_OPTION_BACKWARD & plan->m_config.options)
{
switch (plan->m_config.datatype)
{
case CUDPP_INT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<int, true, false, CUDPP_ADD>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<int, true, false, CUDPP_MULTIPLY>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<int, true, false, CUDPP_MAX>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<int, true, false, CUDPP_MIN>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_UINT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<unsigned int, true, false, CUDPP_ADD>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<unsigned int, true, false, CUDPP_MULTIPLY>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<unsigned int, true, false, CUDPP_MAX>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<unsigned int, true, false, CUDPP_MIN>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_FLOAT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<float, true, false, CUDPP_ADD>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<float, true, false, CUDPP_MULTIPLY>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<float, true, false, CUDPP_MAX>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<float, true, false, CUDPP_MIN>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
default:
break;
}
}
else
{
switch (plan->m_config.datatype)
{
case CUDPP_INT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<int, false, false, CUDPP_ADD>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<int, false, false, CUDPP_MULTIPLY>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<int, false, false, CUDPP_MAX>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<int, false, false, CUDPP_MIN>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_UINT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<unsigned int, false, false, CUDPP_ADD>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<unsigned int, false, false, CUDPP_MULTIPLY>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<unsigned int, false, false, CUDPP_MAX>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<unsigned int, false, false, CUDPP_MIN>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_FLOAT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<float, false, false, CUDPP_ADD>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<float, false, false, CUDPP_MULTIPLY>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<float, false, false, CUDPP_MAX>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<float, false, false, CUDPP_MIN>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
default:
break;
}
}
}
}
#ifdef __cplusplus
}
#endif
/** @} */ // end scan functions
/** @} */ // end cudpp_app

View File

@ -0,0 +1,166 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* sharedmem.h
*
* @brief Shared memory declaration struct for templatized types.
*
* Because dynamically sized shared memory arrays are declared "extern" in CUDA,
* we can't templatize their types directly. To get around this, we declare a
* simple wrapper struct that will declare the extern array with a different
* name depending on the type. This avoids linker errors about multiple
* definitions.
*
* To use dynamically allocated shared memory in a templatized __global__ or
* __device__ function, just replace code like this:
*
* <pre>
* template<class T>
* __global__ void
* foo( T* d_out, T* d_in)
* {
* // Shared mem size is determined by the host app at run time
* extern __shared__ T sdata[];
* ...
* doStuff(sdata);
* ...
* }
* </pre>
*
* With this
* <pre>
* template<class T>
* __global__ void
* foo( T* d_out, T* d_in)
* {
* // Shared mem size is determined by the host app at run time
* SharedMemory<T> smem;
* T* sdata = smem.getPointer();
* ...
* doStuff(sdata);
* ...
* }
* </pre>
*/
#ifndef _SHAREDMEM_H_
#define _SHAREDMEM_H_
/** @brief Wrapper class for templatized dynamic shared memory arrays.
*
* This struct uses template specialization on the type \a T to declare
* a differently named dynamic shared memory array for each type
* (\code extern __shared__ T s_type[] \endcode).
*
* Currently there are specializations for the following types:
* \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long,
* \c unsigned long, \c bool, \c float, \c double, and \c uchar4. One can also specialize it
* for user defined types.
*/
template <typename T>
struct SharedMemory
{
/** Return a pointer to the runtime-sized shared memory array. **/
__device__ T* getPointer()
{
extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
Error_UnsupportedType();
return (T*)0;
}
// TODO: Use operator overloading to make this class look like a regular array
};
// Below are the specializations for the following types:
// int, uint, char, uchar, short, ushort, long, ulong, bool, float, double, and uchar4.
// One could also specialize it for user-defined types.
template <>
struct SharedMemory <int>
{
__device__ int* getPointer() { extern __shared__ int s_int[]; return s_int; }
};
template <>
struct SharedMemory <unsigned int>
{
__device__ unsigned int* getPointer() { extern __shared__ unsigned int s_uint[]; return s_uint; }
};
template <>
struct SharedMemory <char>
{
__device__ char* getPointer() { extern __shared__ char s_char[]; return s_char; }
};
template <>
struct SharedMemory <unsigned char>
{
__device__ unsigned char* getPointer() { extern __shared__ unsigned char s_uchar[]; return s_uchar; }
};
template <>
struct SharedMemory <short>
{
__device__ short* getPointer() { extern __shared__ short s_short[]; return s_short; }
};
template <>
struct SharedMemory <unsigned short>
{
__device__ unsigned short* getPointer() { extern __shared__ unsigned short s_ushort[]; return s_ushort; }
};
template <>
struct SharedMemory <long>
{
__device__ long* getPointer() { extern __shared__ long s_long[]; return s_long; }
};
template <>
struct SharedMemory <unsigned long>
{
__device__ unsigned long* getPointer() { extern __shared__ unsigned long s_ulong[]; return s_ulong; }
};
template <>
struct SharedMemory <bool>
{
__device__ bool* getPointer() { extern __shared__ bool s_bool[]; return s_bool; }
};
template <>
struct SharedMemory <float>
{
__device__ float* getPointer() { extern __shared__ float s_float[]; return s_float; }
};
template <>
struct SharedMemory <double>
{
__device__ double* getPointer() { extern __shared__ double s_double[]; return s_double; }
};
template <>
struct SharedMemory <uchar4>
{
__device__ uchar4* getPointer() { extern __shared__ uchar4 s_uchar4[]; return s_uchar4; }
};
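// Illustrative sketch (added; not part of the original header): specializing
// for a user-defined type follows the same pattern. The struct below is a
// hypothetical example type, not used elsewhere in this library.
struct sm_example_t { float x, y, z; int w; };
template <>
struct SharedMemory <sm_example_t>
{
__device__ sm_example_t* getPointer() { extern __shared__ sm_example_t s_sm_example[]; return s_sm_example; }
};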
#endif //_SHAREDMEM_H_
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:

449
lib/gpu/gb_gpu.cpp Normal file

@ -0,0 +1,449 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "gb_gpu_memory.h"
using namespace std;
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF;
#define GBMT GB_GPU_Memory<numtyp,acctyp>
template<class numtyp, class acctyp>
void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high) {
int stride=gbm.nbor->nbor_pitch();
int anall=gbm.atom->nall();
if (gbm.shared_types) {
GBMF.k_gb_nbor_fast.set_size(GX,BX);
GBMF.k_gb_nbor_fast.run(&gbm.atom->dev_x.begin(),
&gbm.cut_form.begin(), &gbm.nbor->dev_nbor.begin(), &stride,
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
&form_high, &anall);
} else {
GBMF.k_gb_nbor.set_size(GX,BX);
GBMF.k_gb_nbor.run(&gbm.atom->dev_x.begin(), &gbm.cut_form.begin(),
&gbm._lj_types, &gbm.nbor->dev_nbor.begin(), &stride,
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
&form_high, &anall);
}
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen) {
GBMF.clear();
gpu_mode=GBMF.device->gpu_mode();
double gpu_split=GBMF.device->particle_split();
int first_gpu=GBMF.device->first_device();
int last_gpu=GBMF.device->last_device();
int world_me=GBMF.device->world_me();
int gpu_rank=GBMF.device->gpu_rank();
int procs_per_gpu=GBMF.device->procs_per_gpu();
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
MPI_Barrier(GBMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear() {
GBMF.clear();
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
const int host_inum, const int nall,
double **host_x, double **host_quat,
int *host_type, double *boxlo,
double *boxhi, bool &success) {
gbm.nbor_time_avail=true;
success=true;
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,host_inum,gbm.nbor->max_nbors(),0,success);
if (!success)
return;
gbm.atom->cast_copy_x(host_x,host_type);
int mn;
gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
boxlo, boxhi, NULL, NULL, NULL, success, mn);
gbm.nbor->copy_unpacked(inum,mn);
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host and (if spheres) reorder so ellipses first
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,
const int inum, const int osize,
int *ilist, int *numj,
int *type, int **firstneigh,
bool &success) {
success=true;
gbm.nbor_time_avail=true;
int mn=gbm.nbor->max_nbor_loop(inum,numj);
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,0,mn,osize,success);
if (!success)
return;
if (gbm.multiple_forms) {
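// Two-pass reorder (note added for clarity): the first loop gathers atoms
// whose self form is ELLIPSE_ELLIPSE to the front of host_olist; the second
// appends the remaining non-ellipse (sphere) atoms, so the ellipse kernels
// can operate on the contiguous prefix [0, last_ellipse).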
int p=0;
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
p++;
}
}
gbm.max_last_ellipse=p;
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
p++;
}
}
gbm.nbor->get_host(inum,gbm.host_olist.begin(),numj,firstneigh,
gbm.block_size());
gbm.nbor->copy_unpacked(inum,mn);
return;
}
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
gbm.nbor->get_host(inum,ilist,numj,firstneigh,gbm.block_size());
gbm.nbor->copy_unpacked(inum,mn);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=gbm.block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
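// Note (added): GX above is the ceiling of inum/BX, i.e. one thread per local
// atom, rounded up to a whole block.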
int stride=gbm.nbor->nbor_pitch();
int ainum=gbm.atom->inum();
int anall=gbm.atom->nall();
if (gbm.multiple_forms) {
gbm.time_kernel.start();
if (gbm.last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
static_cast<double>(BX)));
gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
GBMF.k_gayberne.set_size(GX,BX);
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(),
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.atom->inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
gbm.time_pair.start();
gbm.time_pair.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
gbm.last_ellipse)/BX));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
GBMF.k_sphere_gb.set_size(GX,BX);
GBMF.k_sphere_gb.run(&gbm.atom->dev_x.begin(),&gbm.atom->dev_quat.begin(),
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(),
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall);
gbm.time_gayberne2.stop();
} else {
gbm.atom->dev_ans.zero();
gbm.atom->dev_engv.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
}
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.atom->inum()) {
if (gbm.shared_types) {
GBMF.k_lj_fast.set_size(GX,BX);
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
&stride, &gbm.nbor->dev_packed.begin(),
&gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
} else {
GBMF.k_lj.set_size(GX,BX);
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm._lj_types,
&gbm.gamma_upsilon_mu.begin(), &stride,
&gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
}
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
GBMF.k_gayberne.set_size(GX,BX);
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(), &gbm.atom->dev_quat.begin(),
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(), &ainum,
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall);
gbm.time_gayberne.stop();
}
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, torques, energies
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
gbm.zero_timers();
return NULL;
}
gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full);
gbm.atom->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
host_quat, host_type, boxlo, boxhi, success);
if (!success)
return NULL;
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
} else {
gbm.atom->cast_x_data(host_x,host_type);
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
}
gbm.atom->add_other_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
gbm.hd_balancer.stop_timer();
return gbm.device->nbor.host_nbor.begin();
}
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double **host_quat) {
return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x,
host_type, boxlo, boxhi, eflag, vflag, eatom, vatom,
host_start, cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, torques, ...
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
const int inum_full,const int nall,double **host_x,
int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
gbm.zero_timers();
return NULL;
}
int ago=gbm.hd_balancer.ago_first(f_ago);
int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time,
gbm.nbor->gpu_nbor());
gbm.atom->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
if (ago==0) {
_gb_gpu_reset_nbors(gbm, nall, inum, inum_full, ilist, numj, host_type,
firstneigh, success);
if (!success)
return NULL;
}
int *list;
if (gbm.multiple_forms)
list=gbm.host_olist.begin();
else
list=ilist;
gbm.atom->cast_x_data(host_x,host_type);
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
gbm.atom->add_other_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.hd_balancer.stop_timer();
return list;
}
int * gb_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double **host_quat) {
return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x,
host_type, ilist, numj, firstneigh, eflag, vflag,
eatom, vatom, host_start, cpu_time, success,
host_quat);
}
// ---------------------------------------------------------------------------
// Return memory usage
// ---------------------------------------------------------------------------
double gb_gpu_bytes() {
return GBMF.host_memory_usage();
}
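// Rough host-side calling sequence (note added; illustrative only -- the pair
// style code in LAMMPS is the authoritative caller):
//   gb_gpu_init(...);                once, after coefficients are known
//   per step: gb_gpu_compute_n(...)  when neighbor lists are built on the GPU
//             gb_gpu_compute(...)    when they are copied from the host
//   gb_gpu_clear();                  at teardown
//   gb_gpu_bytes();                  may be queried at any time for memory use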


@ -1,595 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "gb_gpu_memory.cu"
#include "gb_gpu_kernel.h"
using namespace std;
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
#define GBMT GB_GPU_Memory<numtyp,acctyp>
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
// ii indexes the two interacting particles in gi
int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;
if (ii<inum) {
int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
nbor+=nbor_pitch;
int *nbor_newj=nbor;
nbor+=nbor_pitch;
vec4 ix=x_[i];
int itype=ix.w;
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
if (j>=nall)
j%=nall;
vec4 jx=x_[j];
int jtype=jx.w;
if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq< _cutsq_<numtyp>(itype,jtype)) {
*nbor=j;
nbor+=nbor_pitch;
newj++;
}
}
}
*nbor_newj=newj;
}
}
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
int ii=threadIdx.x;
__shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=ii/MAX_SHARED_TYPES;
int jtype=ii%MAX_SHARED_TYPES;
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
form[ii]=_form_(itype,jtype);
}
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
__syncthreads();
if (ii<inum) {
int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
nbor+=nbor_pitch;
int *nbor_newj=nbor;
nbor+=nbor_pitch;
vec4 ix=x_[i];
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
if (j>=nall)
j%=nall;
vec4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*nbor=j;
nbor+=nbor_pitch;
newj++;
}
}
}
*nbor_newj=newj;
}
}
template<class numtyp, class acctyp>
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high) {
if (gbm.shared_types) {
kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
gbm.atom.inum(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
} else
kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
gbm.atom.inum(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
}
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
template <class t>
inline string gb_gpu_toa(const t& in) {
ostringstream o;
o.precision(2);
o << in;
return o.str();
}
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
string sname=GBMF[0].gpu.name(id)+", "+
gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
strcpy(name,sname.c_str());
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall,
const int max_nbors, const int thread, const int gpu_id) {
assert(thread<MAX_GPU_THREADS);
GBMF[thread].gpu.init();
if (GBMF[thread].gpu.num_devices()==0)
return false;
ij_size=IJ_SIZE;
return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
well, cutsq, sigma, epsilon, host_lshape, form,
host_lj1, host_lj2, host_lj3, host_lj4, offset,
special_lj, nlocal, nall, max_nbors, false,
gpu_id);
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
EXTERN void gb_gpu_clear(const int thread) {
GBMF[thread].clear();
}
// ---------------------------------------------------------------------------
// copy atom positions, quaternions, and optionally types to device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
double **host_quat, const int *host_type,
const bool rebuild, cudaStream_t &stream) {
atom.time_atom.start();
atom.reset_write_buffer();
// Rows 1-3 of dev_x are position; rows 4-7 are quaternion
atom.add_x_data(host_x,host_type);
atom.add_q_data(host_quat[0]);
atom.copy_x_data(stream);
atom.copy_q_data(stream);
atom.time_atom.stop();
}
EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
const int *host_type, const bool rebuild,
const int thread) {
_gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
GBMF[thread].pair_stream);
}
// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// ---------------------------------------------------------------------------
template <class gbmtyp>
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
const int inum, int *ilist, const int *numj,
const int *type, bool &success) {
success=true;
gbm.nbor.time_nbor.start();
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[i]);
if (nall>gbm.max_atoms)
gbm.resize_atom(nall,success);
if (nlocal>gbm.max_local || mn>gbm._max_nbors)
gbm.resize_local(nlocal,mn,success);
if (!success)
return false;
gbm.atom.nall(nall);
gbm.atom.inum(inum);
if (gbm.multiple_forms) {
int ij_size=gbm.nbor.host_ij.numel();
if (inum*2<ij_size) {
int p=0, acc=0;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
gbm.nbor.ij_total=0;
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum,
2*inum,gbm.pair_stream);
} else {
int p=0, acc=0;
int offset=0;
int half=ij_size/2;
int hi=0;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[hi]=numj[ilist[i]];
gbm.nbor.host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
p++;
hi++;
if (hi==half) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
half,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
half,gbm.pair_stream);
hi=0;
offset+=half;
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
}
}
}
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[hi]=numj[ilist[i]];
gbm.nbor.host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
p++;
hi++;
if (hi==half) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
half,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
half,gbm.pair_stream);
hi=0;
offset+=half;
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
}
}
}
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
if (hi>0) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
hi,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
hi,gbm.pair_stream);
}
gbm.nbor.ij_total=0;
}
} else {
gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
gbm.last_ellipse=inum;
}
gbm.nbor.time_nbor.stop();
if (gbm.multiple_forms)
return gbm.host_olist.begin();
return ilist;
}
EXTERN int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
int *ilist, const int *numj, const int *type,
const int thread, bool &success) {
return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
success);
}
// ---------------------------------------------------------------------------
// Copy a set of ij_size ij interactions to device and compute energies,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
const bool eflag) {
gbm.nbor.time_nbor.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
gbm.nbor.time_nbor.start();
gbm.nbor.add(num_ij,gbm.pair_stream);
gbm.nbor.time_nbor.stop();
}
EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
const int thread) {
_gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
}
template<class numtyp, class acctyp>
void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
gbm.atom.time_answer.start();
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
gbm.atom.time_answer.stop();
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques for all ij interactions
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
const bool rebuild) {
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
int ans_pitch=6;
if (eflag)
ans_pitch++;
if (vflag)
ans_pitch+=6;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
if (gbm.multiple_forms) {
gbm.time_kernel.start();
if (gbm.last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
static_cast<double>(BX)));
pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.atom.inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
gbm.time_pair.start();
gbm.time_pair.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
gbm.last_ellipse)/BX));
pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
ELLIPSE_SPHERE);
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne2.stop();
} else {
gbm.atom.ans.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
}
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.atom.inum()) {
if (gbm.shared_types)
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
else
kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(),
eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne.stop();
}
}
EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
const int thread) {
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
_gb_gpu_enqueue<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag);
}
// ---------------------------------------------------------------------------
// Get energies, forces, and torques to host
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial) {
double evdw;
gbm.atom.time_atom.add_to_total();
gbm.nbor.time_nbor.add_to_total();
gbm.time_kernel.add_to_total();
gbm.time_gayberne.add_to_total();
if (gbm.multiple_forms) {
gbm.time_kernel2.add_to_total();
gbm.time_gayberne2.add_to_total();
gbm.time_pair.add_to_total();
}
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
if (gbm.last_ellipse>gbm.atom.inum()) {
if (eflag || vflag)
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
f,tor,gbm.atom.inum());
else
gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
} else {
if (eflag || vflag)
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
f,tor,gbm.last_ellipse);
else
gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
}
gbm.atom.time_answer.add_to_total();
return evdw;
}
EXTERN double gb_gpu_forces(double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial, const int thread) {
return _gb_gpu_forces<PRECISION,ACC_PRECISION>
(GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
vflag_atom,eatom,vatom,virial);
}
EXTERN void gb_gpu_time(const int i) {
cout.precision(4);
cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds()
<< " s.\n"
<< "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds()
<< " s.\n"
<< "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
GBMF[i].time_kernel2.total_seconds() << " s.\n"
<< "Force calc: " << GBMF[i].time_gayberne.total_seconds()+
GBMF[i].time_gayberne2.total_seconds()<< " s.\n";
if (GBMF[i].multiple_forms)
cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n";
cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds()
<< " s.\n";
}
EXTERN int gb_gpu_num_devices() {
return GBMF[0].gpu.num_devices();
}
EXTERN double gb_gpu_bytes() {
return GBMF[0].host_memory_usage();
}


@ -12,44 +12,60 @@
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H
#include "math.h"
#include "stdio.h"
#include "string.h"
#define MAX_SHARED_TYPES 8
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
/* ----------------------------------------------------------------------
Atomic update of global memory
------------------------------------------------------------------------- */
/*
template <class numtyp> __device__
inline void atomicAdd(numtyp *address, numtyp val);
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
template <>
__device__ inline void atomicAdd<float>(float *address, float val)
{
int i_val = __float_as_int(val);
int tmp0 = 0;
int tmp1;
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) {
tmp0 = tmp1;
i_val = __float_as_int(val + __int_as_float(tmp1));
}
}*/
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#endif
/* ----------------------------------------------------------------------
dot product of 2 vectors
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}
@ -58,9 +74,7 @@ static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
cross product of 2 vectors
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_cross3(const numtyp *v1,
const numtyp *v2, numtyp *ans)
__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
{
ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
@ -71,8 +85,7 @@ static __inline__ __device__ void gpu_cross3(const numtyp *v1,
determinant of a matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
__inline numtyp gpu_det3(const numtyp m[9])
{
numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
@ -84,47 +97,25 @@ static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9],
numtyp ans[9])
__inline void gpu_times3(const numtyp4 shape, const numtyp m[9],
numtyp ans[9])
{
ans[0] = _well_<numtyp>(i,0)*m[0];
ans[1] = _well_<numtyp>(i,0)*m[1];
ans[2] = _well_<numtyp>(i,0)*m[2];
ans[3] = _well_<numtyp>(i,1)*m[3];
ans[4] = _well_<numtyp>(i,1)*m[4];
ans[5] = _well_<numtyp>(i,1)*m[5];
ans[6] = _well_<numtyp>(i,2)*m[6];
ans[7] = _well_<numtyp>(i,2)*m[7];
ans[8] = _well_<numtyp>(i,2)*m[8];
}
/* ----------------------------------------------------------------------
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9],
numtyp ans[9])
{
ans[0] = _shape_<numtyp>(i,0)*m[0];
ans[1] = _shape_<numtyp>(i,0)*m[1];
ans[2] = _shape_<numtyp>(i,0)*m[2];
ans[3] = _shape_<numtyp>(i,1)*m[3];
ans[4] = _shape_<numtyp>(i,1)*m[4];
ans[5] = _shape_<numtyp>(i,1)*m[5];
ans[6] = _shape_<numtyp>(i,2)*m[6];
ans[7] = _shape_<numtyp>(i,2)*m[7];
ans[8] = _shape_<numtyp>(i,2)*m[8];
ans[0] = shape.x*m[0];
ans[1] = shape.x*m[1];
ans[2] = shape.x*m[2];
ans[3] = shape.y*m[3];
ans[4] = shape.y*m[4];
ans[5] = shape.y*m[5];
ans[6] = shape.z*m[6];
ans[7] = shape.z*m[7];
ans[8] = shape.z*m[8];
}
/* ----------------------------------------------------------------------
add two matrices
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_plus3(const numtyp m[9],
const numtyp m2[9], numtyp ans[9])
__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
{
ans[0] = m[0]+m2[0];
ans[1] = m[1]+m2[1];
@ -141,10 +132,8 @@ static __inline__ __device__ void gpu_plus3(const numtyp m[9],
multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9],
const numtyp m2[9],
numtyp ans[9])
__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
@ -161,9 +150,7 @@ static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9],
row vector times matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_row_times3(const numtyp *v,
const numtyp m[9], numtyp *ans)
__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
{
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
@ -176,10 +163,8 @@ static __inline__ __device__ void gpu_row_times3(const numtyp *v,
error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
const numtyp *v, numtyp *ans,
int *error_flag)
__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
__global int *error_flag)
{
// create augmented matrix for pivoting
@ -297,12 +282,10 @@ static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
quat = [w i j k]
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_quat_to_mat_trans(const vec4 *qif,
const int qi,
numtyp mat[9])
__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
vec4 q=qif[qi];
numtyp4 q=qif[qi];
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;

383
lib/gpu/gb_gpu_kernel.cu Normal file

@ -0,0 +1,383 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL
#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif
__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int nall) {
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
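// (note added) sp_lj now holds the four special_lj scaling factors loaded
// from gum[3..6] above; the barrier makes them visible to every thread in
// the block before the neighbor loop begins.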
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp4 tor;
tor.x=(numtyp)0;
tor.y=(numtyp)0;
tor.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=stride) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
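// (note added) Shifted-LJ core of the Gay-Berne interaction:
//   U_r = 4*eps*(varrho^12 - varrho^6),  varrho = sigma/(h12 + gamma*sigma),
// where gum[0] holds gamma and h12 is the distance of closest approach.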
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
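// (note added) i.e. eta = (2*lshape_i*lshape_j / det(G12))^upsilon, with
// gum[1] holding upsilon.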
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
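// (note added) i.e. chi = (2 * r12hat . B12^-1 . r12hat)^mu, with gum[2]
// holding mu.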
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
// Store answers
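// (note added) Layout: engv stores the energy followed by the six virial
// components, each astride apart; ans stores the force at ii and the torque
// at ii+astride.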
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
#endif


@ -1,863 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL
#include "gb_gpu_extra.h"
template <class numtyp>
static __inline__ __device__ void compute_eta_torque(numtyp m[9],
numtyp m2[9],
const int i,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
numtyp shapex=_shape_<numtyp>(i,0);
numtyp shapey=_shape_<numtyp>(i,1);
numtyp shapez=_shape_<numtyp>(i,2);
ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
#include "gb_gpu_kernel.h"
template<class numtyp, class acctyp>
__global__ void kernel_gayberne(const vec4* x_, const vec4 *q,
const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const size_t nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int inum, const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp torx=(numtyp)0;
acctyp tory=(numtyp)0;
acctyp torz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+nbor_pitch*numj;
vec4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_shape_times3(itype,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_well_times3(itype,a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_shape_times3(jtype,a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
numtyp sigma = _sigma_<numtyp>(itype,jtype);
numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,itype,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_well_times3(jtype,a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
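// gum[2] holds mu: chi = (2 * r12hat . B12^-1 . r12hat)^mu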
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
fx+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
fy+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
fz+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
fx+=temp1*dchi[0]-temp2*dUr[0];
fy+=temp1*dchi[1]-temp2*dUr[1];
fz+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
// Store answers
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1=energy;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1++;
}
}
*ap1=fx;
ap1++;
*ap1=fy;
ap1++;
*ap1=fz;
ap1++;
*ap1=torx;
ap1++;
*ap1=tory;
ap1++;
*ap1=torz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q,
const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const size_t nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum,
const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
__syncthreads();
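// the first four threads of the block stage the special_lj factors in shared
// memory; the barrier above makes them visible before the neighbor loop starts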
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+nbor_pitch*numj;
vec4 ix=x_[i];
int itype=ix.w;
numtyp oner=_shape_<numtyp>(itype,0);
numtyp one_well=_well_<numtyp>(itype,0);
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, and eta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_shape_times3(jtype,a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
numtyp sigma = _sigma_<numtyp>(itype,jtype);
numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_well_times3(jtype,a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
fx+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
fy+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
fz+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
fx+=temp1*dchi[0]-temp2*dUr[0];
fy+=temp1*dchi[1]-temp2*dUr[1];
fz+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1=energy;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1++;
}
}
*ap1=fx;
ap1++;
*ap1=fy;
ap1++;
*ap1=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj(const vec4 *x_,
const numtyp *special_lj, const int *dev_nbor,
const size_t nbor_pitch, const int *dev_ij, acctyp *ans,
size_t ans_pitch, int *err_flag, const bool eflag,
const bool vflag, const int start, const int inum,
const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
vec4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
_form_(itype,jtype)==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
_lj1_<numtyp>(itype,jtype).y);
force*=factor_lj;
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
_lj3_<numtyp>(itype,jtype).y);
energy+=factor_lj*(e-_offset_<numtyp>(itype,jtype));
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1+=energy;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1++;
}
}
*ap1+=fx;
ap1++;
*ap1+=fy;
ap1++;
*ap1+=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const vec4 *x_,
const numtyp *special_lj, const int *dev_nbor,
const size_t nbor_pitch, const int *dev_ij,
acctyp *ans, size_t ans_pitch,int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum, const int nall){
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
__shared__ numtyp sp_lj[4];
__shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=special_lj[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=ii/MAX_SHARED_TYPES;
int jtype=ii%MAX_SHARED_TYPES;
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
form[ii]=_form_(itype,jtype);
lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
if (eflag) {
lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
offset[ii]=_offset_<numtyp>(itype,jtype);
}
}
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
vec4 ix=x_[i];
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
vec4 jx=x_[j];
int mtype=itype+jx.w;
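// itype was pre-scaled by MAX_SHARED_TYPES, so mtype is the row-major index of
// the (itype,jtype) pair in the shared coefficient arrays loaded above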
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
energy+=factor_lj*(e-offset[mtype]);
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1+=energy;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1++;
}
}
*ap1+=fx;
ap1++;
*ap1+=fy;
ap1++;
*ap1+=fz;
} // if ii
}
#endif

472
lib/gpu/gb_gpu_kernel_lj.cu Normal file

@@ -0,0 +1,472 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL_LJ
#define GB_GPU_KERNEL_LJ
#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif
__kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int nall) {
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+stride*numj;
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp oner=shape[itype].x;
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=stride) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, and eta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall) {
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=stride) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int ii=itype*lj_types+jtype;
if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
energy+=factor_lj*(e-lj3[ii].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag,const int vflag, const int start,
const int inum, const int nall) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=gum[ii+3];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (eflag>0)
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=stride) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
#endif


@@ -0,0 +1,170 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_KERNEL_H
#define PAIR_GPU_KERNEL_H
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#else
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#endif
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
__kernel void kernel_gb_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
const int ntypes, __global int *dev_nbor,
const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high, const int nall) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
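// packed neighbor indices are written starting at row 2 of dev_nbor (stride
// nbor_pitch); the per-particle count newj goes to row 1 after the loop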
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j>=nall)
j%=nall;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
if (cf.y>=form_low && cf.y<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cf.x) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_gb_nbor_fast(__global numtyp4 *x_,
__global numtyp2 *cut_form,
__global int *dev_nbor,
const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high, const int nall) {
int ii=THREAD_ID_X;
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[ii]=cut_form[ii].x;
form[ii]=cut_form[ii].y;
}
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j>=nall)
j%=nall;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
#endif

334
lib/gpu/gb_gpu_memory.cpp Normal file

@@ -0,0 +1,334 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "gb_gpu_cl.h"
#include "gb_gpu_nbor_cl.h"
#else
#include "gb_gpu_ptx.h"
#endif
#include "gb_gpu_memory.h"
#include <cassert>
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
_max_bytes(0.0) {
device=&pair_gpu_device;
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *_screen) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host,
max_nbors,cell_size,true))
return false;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
compile_kernels(*ucl_device);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*ucl_device,
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sigma_epsilon.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
cut_form.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,cut_form,host_write,
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*ucl_device);
dev_error.zero();
_allocated=true;
host_form=h_form;
// Initialize timers for the selected GPU
time_kernel.init(*ucl_device);
time_gayberne.init(*ucl_device);
time_kernel2.init(*ucl_device);
time_gayberne2.init(*ucl_device);
time_kernel.zero();
time_gayberne.zero();
time_kernel2.zero();
time_gayberne2.zero();
// Allocate, cast and asynchronous memcpy of constant data
// Copy data for bonded interactions
gamma_upsilon_mu.alloc(7,*ucl_device,UCL_READ_ONLY);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
host_write[3]=static_cast<numtyp>(host_special_lj[0]);
host_write[4]=static_cast<numtyp>(host_special_lj[1]);
host_write[5]=static_cast<numtyp>(host_special_lj[2]);
host_write[6]=static_cast<numtyp>(host_special_lj[3]);
ucl_copy(gamma_upsilon_mu,host_write,7,false);
lshape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
UCL_H_Vec<double> d_view;
d_view.view(host_lshape,lshape.numel(),*ucl_device);
ucl_copy(lshape,d_view,false);
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
// - cast if necessary
shape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_shape[i][0];
host_write[i*4+1]=host_shape[i][1];
host_write[i*4+2]=host_shape[i][2];
}
UCL_H_Vec<numtyp4> view4;
view4.view((numtyp4*)host_write.begin(),shape.numel(),*ucl_device);
ucl_copy(shape,view4,false);
well.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_well[i][0];
host_write[i*4+1]=host_well[i][1];
host_write[i*4+2]=host_well[i][2];
}
view4.view((numtyp4*)host_write.begin(),well.numel(),*ucl_device);
ucl_copy(well,view4,false);
// See if we want fast GB-sphere or sphere-sphere calculations
multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (host_form[i][j]!=ELLIPSE_ELLIPSE)
multiple_forms=true;
if (multiple_forms && host_nlocal>0) {
std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n";
exit(1);
}
if (multiple_forms)
atom->dev_ans.zero();
_max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
// Memory for ilist ordered by particle type
return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
if (!_allocated)
return;
UCL_H_Vec<int> err_flag(1,*ucl_device);
ucl_copy(err_flag,dev_error,false);
if (err_flag[0] == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
err_flag.clear();
_allocated=false;
// Output any timing information
acc_timers();
double single[6], times[6];
single[0]=atom->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
nbor->time_kernel.total_seconds();
single[3]=time_gayberne.total_seconds()+time_gayberne2.total_seconds();
if (multiple_forms)
single[4]=time_pair.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time();
MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
gamma_upsilon_mu.row_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
double max_mb=mpi_max_bytes/(1024*1024);
if (device->world_me()==0)
if (screen && times[3]>0.0) {
int world_size=device->world_size();
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/world_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/world_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/world_size);
if (nbor->gpu_nbor())
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/world_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/world_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/world_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/world_size);
}
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
_max_bytes=0.0;
dev_error.clear();
lj1.clear();
lj3.clear();
sigma_epsilon.clear();
cut_form.clear();
shape.clear();
well.clear();
lshape.clear();
gamma_upsilon_mu.clear();
host_olist.clear();
time_kernel.clear();
time_gayberne.clear();
time_kernel2.clear();
time_gayberne2.clear();
time_pair.clear();
hd_balancer.clear();
if (_compiled) {
k_gb_nbor_fast.clear();
k_gb_nbor.clear();
k_gayberne.clear();
k_sphere_gb.clear();
k_lj_fast.clear();
k_lj.clear();
delete pair_program;
delete gb_program;
delete gb_lj_program;
_compiled=false;
}
device->clear();
}
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(GB_GPU_Memory<numtyp,acctyp>)+
device->nbor.max_atoms()*sizeof(int);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::compile_kernels(UCL_Device &dev) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(gb_gpu_kernel_nbor,flags.c_str());
k_gb_nbor_fast.set_function(*pair_program,"kernel_gb_nbor_fast");
k_gb_nbor.set_function(*pair_program,"kernel_gb_nbor");
gb_program=new UCL_Program(dev);
gb_program->load_string(gb_gpu_kernel,flags.c_str());
k_gayberne.set_function(*gb_program,"kernel_gayberne");
gb_lj_program=new UCL_Program(dev);
gb_lj_program->load_string(gb_gpu_kernel_lj,flags.c_str());
k_sphere_gb.set_function(*gb_lj_program,"kernel_sphere_gb");
k_lj_fast.set_function(*gb_lj_program,"kernel_lj_fast");
k_lj.set_function(*gb_lj_program,"kernel_lj");
_compiled=true;
}
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;


@@ -1,156 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "gb_gpu_memory.h"
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() {
this->atom.atom_fields(8);
this->atom.ans_fields(13);
this->nbor.packing(true);
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
const double gamma, const double upsilon,
const double mu, double **host_shape,
double **host_well, double **host_cutsq,
double **host_sigma, double **host_epsilon,
double *host_lshape, int **h_form, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const bool force_d, const int me) {
_max_nbors=max_nbors;
if (this->allocated)
clear();
bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
host_lj1, host_lj2, host_lj3, host_lj4,
host_offset, host_special_lj, max_nbors, me,
nlocal, nall);
if (!p)
return false;
host_form=h_form;
// Initialize timers for the selected GPU
time_kernel.init();
time_gayberne.init();
time_kernel2.init();
time_gayberne2.init();
// Use the write buffer from atom for data initialization
NVC_HostT &host_write=this->atom.host_write;
assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
// Allocate, cast and asynchronous memcpy of constant data
gamma_upsilon_mu.safe_alloc(3);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
gamma_upsilon_mu.copy_from_host(host_write.begin());
lshape.safe_alloc(ntypes,lshape_get_texture<numtyp>());
lshape.cast_copy(host_lshape,host_write);
lshape.copy_from_host(host_write.begin());
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
shape.safe_alloc(ntypes,3,shape_get_texture<numtyp>());
shape.cast_copy(host_shape[0],host_write);
well.safe_alloc(ntypes,3,well_get_texture<numtyp>());
well.cast_copy(host_well[0],host_write);
// Copy LJ data onto GPU
int lj_types=ntypes;
if (lj_types<=MAX_SHARED_TYPES)
lj_types=MAX_SHARED_TYPES;
form.safe_alloc(lj_types,lj_types,form_get_texture());
form.copy_2Dfrom_host(host_form[0],ntypes,ntypes);
// See if we want fast GB-sphere or sphere-sphere calculations
multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (host_form[i][j]!=ELLIPSE_ELLIPSE)
multiple_forms=true;
// Memory for ilist ordered by particle type
return host_olist.alloc_rw(this->max_local);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_atom(const int nall, bool &success) {
this->max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
this->atom.resize(this->max_atoms, success);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_local(const int nlocal, const int max_nbors,
bool &success) {
if (nlocal>this->max_local) {
this->max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
host_olist.clear();
success=success && host_olist.alloc_rw(this->max_local);
}
if (max_nbors>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbors)*1.10);
this->nbor.resize(this->max_local,_max_nbors,success);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
if (!this->allocated)
return;
int err_flag;
this->dev_error.copy_to_host(&err_flag);
if (err_flag == 1)
std::cerr << "COLLISION BUFFER OVERFLOW OCCURED. INCREASE COLLISION_N "
<< "and RECOMPILE.\n";
else if (err_flag == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
LJ_GPU_MemoryT::clear();
lshape.unbind();
shape.clear();
well.clear();
form.clear();
lshape.clear();
gamma_upsilon_mu.clear();
host_olist.clear();
}
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() {
return this->atom.host_memory_usage(this->max_atoms)+
this->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(GB_GPU_Memory<numtyp,acctyp>)+this->max_atoms*sizeof(int);
}
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;


@@ -12,61 +12,183 @@
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H
#define MAX_GPU_THREADS 4
#include "lj_gpu_memory.h"
#define BLOCK_1D 64
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
template <class numtyp, class acctyp>
class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
class GB_GPU_Memory {
public:
GB_GPU_Memory();
~GB_GPU_Memory();
bool init(const int ij_size, const int ntypes, const double gamma,
/// Clear any previous data and set up for a new LAMMPS run
/** \param gpu_nbor true if neighboring performed on device
* \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \return false if there is not sufficient memory or device init prob **/
bool init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const int nlocal, const int nall,
const bool force_d, const int me);
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *screen);
void resize_atom(const int nall, bool &success);
void resize_local(const int nlocal, const int max_nbors, bool &success);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
atom->resize(inum, nall, success);
if (multiple_forms) atom->dev_ans.zero();
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
* \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring
* \note if GPU is neighboring nlocal+host_inum=total number local particles
* \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/
inline void resize_local(const int nlocal, const int host_inum,
const int max_nbors, const int olist_size,
bool &success) {
if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
double host_memory_usage();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_kernel.add_to_total();
time_gayberne.add_to_total();
if (multiple_forms) {
time_kernel2.add_to_total();
time_gayberne2.add_to_total();
time_pair.add_to_total();
}
atom->acc_timers();
}
// ---------------------------- DATA ----------------------------
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_kernel.zero();
time_gayberne.zero();
if (multiple_forms) {
time_kernel2.zero();
time_gayberne2.zero();
time_pair.zero();
}
atom->zero_timers();
}
// ilist with particles sorted by type
NVC_HostI host_olist;
// --------------- Const Data for Atoms
NVC_ConstMatT shape, well;
NVC_ConstMatI form;
NVC_VecT lshape, gamma_upsilon_mu;
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Error Flag - Set if a bad matrix inversion occurs
UCL_D_Vec<int> dev_error;
/// Device timers
UCL_Timer time_kernel, time_gayberne, time_kernel2, time_gayberne2, time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- TYPE DATA --------------------------
// --------------- Timing Stuff
NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2;
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
UCL_D_Vec<numtyp2> sigma_epsilon;
/// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec<numtyp2> cut_form;
// 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ...
UCL_D_Vec<numtyp> gamma_upsilon_mu;
// True if we want to use fast GB-sphere or sphere-sphere calculations
bool multiple_forms;
int **host_form;
int last_ellipse;
int _max_nbors;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
int _lj_types;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp4> shape, well;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp> lshape;
int last_ellipse, max_last_ellipse;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// ilist with particles sorted by type
UCL_H_Vec<int> host_olist;
/// True if we should accumulate the neighbor timer
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program, *gb_program, *gb_lj_program;
UCL_Kernel k_gb_nbor_fast, k_gb_nbor;
UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
inline int block_size() { return _block_size; }
private:
bool _allocated, _compiled;
int _block_size;
double _max_bytes;
void compile_kernels(UCL_Device &dev);
};
#endif

27
lib/gpu/geryon/README Normal file

@@ -0,0 +1,27 @@
Geryon
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
Geryon is intended to be a simple library for managing the CUDA Runtime,
CUDA Driver, and OpenCL APIs with a consistent interface:
* Change from one API to another by simply changing the namespace
* Use multiple APIs in the same code
* Lightweight (only include files - no build required)
* Manage device query and selection
* Simple vector and matrix containers
* Simple routines for data copy and type casting
* Simple routines for data I/O
* Simple classes for managing device timing
* Simple classes for managing kernel compilation and execution
Geryon does not require building (although a Makefile is provided for testing
purposes). The library is a set of header files that can be included with
your code.
Documentation and examples are provided at
http://users.nccs.gov/~wb8/geryon/index.htm
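
As a quick orientation, the sketch below is not part of the library sources
(the device index is illustrative); it uses the UCL_Device class from the
CUDA runtime flavor (nvc_device.h, included later in this commit) to list the
CUDA devices on a node and make the first one active:

  #include <iostream>
  #include "nvc_device.h"

  int main() {
    ucl_cudart::UCL_Device dev;   // gathers properties for every CUDA device
    dev.print_all(std::cout);     // print name, memory, cores, etc.
    if (dev.num_devices() > 0)
      dev.set(0);                 // select device 0 for subsequent work
    return 0;
  }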


@@ -0,0 +1 @@
Geryon Version 10.280

47
lib/gpu/geryon/file_to_cstr.sh Executable file

@@ -0,0 +1,47 @@
#!/bin/sh
# Convert PTX assembly output into C-style string constants.
# Written in portable POSIX shell.
# requires: sed, rm, mv
#
# Author: Axel Kohlmeyer, Temple University
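#
# Usage: the last argument names the generated header; every preceding
# argument is a kernel source to embed. For example (file names illustrative):
#   sh file_to_cstr.sh gb_gpu_kernel.ptx gb_gpu_ptx.h
# writes "const char * gb_gpu_kernel = ..." into gb_gpu_ptx.h; these strings
# are what compile_kernels() in gb_gpu_memory.cpp hands to load_string().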
num_args=$#
# we write to a scratch file, since
# we know the real file name only at
# the very end.
output=geryon.tmp.$$
: > $output
# remove temporary file in case we're interrupted.
cleanup () {
rm -f geryon.tmp.$$
}
trap cleanup INT QUIT TERM
# loop over arguments and convert to
# string constants.
i=1
while [ $i -lt $num_args ]
do \
src=$1
krn=${src##*/}
krn=${krn%.*}
echo "Converting kernel $krn from $src to a c-style string"
echo "const char * $krn = " >> $output
sed -e 's/\\/\\\\/g' \
-e 's/"/\\"/g' \
-e 's/ *\/\/.*$//' \
-e '/\.file/D' \
-e '/^[ ]*$/D' \
-e 's/^\(.*\)$/"\1\\n"/' $src >> $output
echo ';' >> $output
shift
i=`expr $i + 1`
done
# $1 holds now the real output file name
mv $output $1

311
lib/gpu/geryon/nvc_device.h Normal file

@@ -0,0 +1,311 @@
/***************************************************************************
nvc_device.h
-------------------
W. Michael Brown
Utilities for dealing with cuda devices
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVC_DEVICE
#define NVC_DEVICE
#include <string>
#include <vector>
#include <iostream>
#include <cstdlib>
#include "nvc_macros.h"
#include "ucl_types.h"
namespace ucl_cudart {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef cudaStream_t command_queue;
inline void ucl_sync(cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
}
/// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect
* behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA"; }
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
/// Set the CUDA device to the specified device number
void set(int num);
/// Get the current device number
inline int device_num() { return _device; }
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ if (_device==-1) return 0; else return _cq.size(); }
/// Add a stream for device computations
inline void push_command_queue() {
_cq.push_back(cudaStream_t());
CUDA_SAFE_CALL_NS(cudaStreamCreate(&_cq.back()));
}
/// Remove a stream for device computations
/** \note You cannot delete the default stream **/
inline void pop_command_queue() {
if (_cq.size()<2) return;
CUDA_SAFE_CALL_NS(cudaStreamDestroy(_cq.back()));
_cq.pop_back();
}
/// Get the current CUDA device name
inline std::string name() { return name(_device); }
/// Get the CUDA device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i) { return "GPU"; }
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
else return _properties[i].multiProcessorCount*32; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
/// Return the GPGPU compute capability for current device
inline double arch() { return arch(_device); }
/// Return the GPGPU compute capability
inline double arch(const int i)
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;}
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].maxThreadsPerBlock; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return _properties[i].memPitch; }
/// List all devices along with all properties
void print_all(std::ostream &out);
private:
int _device, _num_devices;
std::vector<cudaDeviceProp> _properties;
std::vector<cudaStream_t> _cq;
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
CUDA_SAFE_CALL_NS(cudaGetDeviceCount(&_num_devices));
for (int dev=0; dev<_num_devices; ++dev) {
cudaDeviceProp deviceProp;
CUDA_SAFE_CALL_NS(cudaGetDeviceProperties(&deviceProp, dev));
if (deviceProp.major == 9999 && deviceProp.minor == 9999)
break;
_properties.push_back(deviceProp);
}
_device=-1;
_cq.push_back(cudaStream_t());
_cq.back()=0;
}
inline UCL_Device::~UCL_Device() {
for (int i=1; i<num_queues(); i++) pop_command_queue();
}
// Set the CUDA device to the specified device number
inline void UCL_Device::set(int num) {
if (_device==num)
return;
for (int i=1; i<num_queues(); i++) pop_command_queue();
cudaThreadExit();
CUDA_SAFE_CALL_NS(cudaSetDevice(num));
_device=num;
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
#if CUDART_VERSION >= 2020
int driver_version, runtime_version;
cudaDriverGetVersion(&driver_version);
out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
cudaRuntimeGetVersion(&runtime_version);
out << "CUDA Runtime Version: "
<< runtime_version/1000 << "." << runtime_version%100
<< std::endl;
#endif
if (num_devices() == 0)
out << "There is no device supporting CUDA\n";
for (int i=0; i<num_devices(); ++i) {
out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
out << " Type of device: "
<< device_type_name(i).c_str() << std::endl;
out << " Compute capability: "
<< arch(i) << std::endl;
out << " Double precision support: ";
if (double_precision(i))
out << "Yes\n";
else
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
#if CUDART_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
#endif
out << " Total amount of constant memory: "
<< _properties[i].totalConstMem << " bytes\n";
out << " Total amount of local/shared memory per block: "
<< _properties[i].sharedMemPerBlock << " bytes\n";
out << " Total number of registers available per block: "
<< _properties[i].regsPerBlock << std::endl;
out << " Warp size: "
<< _properties[i].warpSize << std::endl;
out << " Maximum number of threads per block: "
<< _properties[i].maxThreadsPerBlock << std::endl;
out << " Maximum group size (# of threads per block) "
<< _properties[i].maxThreadsDim[0] << " x "
<< _properties[i].maxThreadsDim[1] << " x "
<< _properties[i].maxThreadsDim[2] << std::endl;
out << " Maximum item sizes (# threads for each dim) "
<< _properties[i].maxGridSize[0] << " x "
<< _properties[i].maxGridSize[1] << " x "
<< _properties[i].maxGridSize[2] << std::endl;
out << " Maximum memory pitch: "
<< max_pitch(i) << " bytes\n";
out << " Texture alignment: "
<< _properties[i].textureAlignment << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
#if CUDART_VERSION >= 2000
out << " Concurrent copy and execution: ";
if (_properties[i].deviceOverlap)
out << "Yes\n";
else
out << "No\n";
#endif
#if CUDART_VERSION >= 2020
out << " Run time limit on kernels: ";
if (_properties[i].kernelExecTimeoutEnabled)
out << "Yes\n";
else
out << "No\n";
out << " Integrated: ";
if (_properties[i].integrated)
out << "Yes\n";
else
out << "No\n";
out << " Support host page-locked memory mapping: ";
if (_properties[i].canMapHostMemory)
out << "Yes\n";
else
out << "No\n";
out << " Compute mode: ";
if (_properties[i].computeMode == cudaComputeModeDefault)
out << "Default\n"; // multiple threads can use device
else if (_properties[i].computeMode == cudaComputeModeExclusive)
out << "Exclusive\n"; // only thread can use device
else if (_properties[i].computeMode == cudaComputeModeProhibited)
out << "Prohibited\n"; // no thread can use device
else
out << "Unknown\n";
#endif
#if CUDART_VERSION >= 3000
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
else
out << "No\n";
out << " Device has ECC support enabled: ";
if (_properties[i].ECCEnabled)
out << "Yes\n";
else
out << "No\n";
#endif
}
}
}
#endif
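A minimal usage sketch for the runtime-API device class above (an illustrative addition, not part of the commit; it assumes the class sits in the ucl_cudart namespace used by the other CUDA-runtime Geryon headers and that its header is included):

#include <iostream>

// List every CUDA device, then activate device 0 with the accessors above.
int list_and_select(ucl_cudart::UCL_Device &gpu) {
  gpu.print_all(std::cout);                  // full property dump
  if (gpu.num_devices()==0)
    return -1;                               // no CUDA-capable device
  gpu.set(0);                                // select device 0
  std::cout << "Using " << gpu.name() << " (arch " << gpu.arch()
            << ", " << gpu.gigabytes() << " GB)\n";
  return 0;
}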

@@ -0,0 +1,57 @@
#ifndef NVC_MACROS_H
#define NVC_MACROS_H
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include <stdio.h>
#include <cassert>
#include <cuda_runtime.h>
#ifdef MPI_GERYON
#include "mpi.h"
#define NVC_GERYON_EXIT MPI_Abort(MPI_COMM_WORLD,-1)
#else
#define NVC_GERYON_EXIT assert(0==1)
#endif
#ifndef UCL_NO_API_CHECK
#define CUDA_SAFE_CALL_NS( call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in call at file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
NVC_GERYON_EXIT; \
} } while (0)
#ifdef UCL_SYNC_DEBUG
#define CUDA_SAFE_CALL( call) do { \
CUDA_SAFE_CALL_NS( call); \
cudaError err=cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
NVC_GERYON_EXIT; \
} } while (0)
#else
#define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NS( call)
#endif
#else // not DEBUG
// void macros for performance reasons
#define CUDA_SAFE_CALL( call) call
#define CUDA_SAFE_CALL_NS( call) call
#endif
#endif
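To show how the macros above are meant to be used, a short hedged example (the helper name is made up; only the macro and a standard CUDA runtime call are assumed):

#include <cuda_runtime.h>

// Checked device allocation: on failure the macro prints file/line and exits
// (or MPI_Aborts when MPI_GERYON is defined); with UCL_NO_API_CHECK it is a plain call.
inline void * ucl_example_alloc(const size_t nbytes) {
  void *ptr=NULL;
  CUDA_SAFE_CALL(cudaMalloc(&ptr,nbytes));
  return ptr;
}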

@@ -0,0 +1,69 @@
/***************************************************************************
nvc_texture.h
-------------------
W. Michael Brown
Utilities for dealing with CUDA Runtime textures
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jul 2 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVC_TEXTURE
#define NVC_TEXTURE
#include "nvc_mat.h"
namespace ucl_cudart {
/// Class storing a texture reference
class UCL_Texture {
public:
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(textureReference *t) { get_texture(t); }
/// Set the texture reference for this object
inline void get_texture(textureReference *t) { _tex_ptr=t; }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
int bits[4]={0,0,0,0};
for (int i=0; i<numel; i++) bits[i]=32;
_channel = cudaCreateChannelDesc(bits[0], bits[1], bits[2], bits[3],
cudaChannelFormatKindFloat);
(*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
(*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
(*_tex_ptr).filterMode = cudaFilterModePoint;
(*_tex_ptr).normalized = false;
CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,vec.cbegin(),&_channel));
}
/// Unbind the texture reference from the memory allocation
inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
private:
textureReference *_tex_ptr;
cudaChannelFormatDesc _channel;
};
} // namespace
#endif
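A hedged sketch of how the runtime texture wrapper above is driven (the names bind_positions and tref are illustrative; the container is any UCL matrix/vector type exposing cbegin()):

// Bind a float4 device array to an existing CUDA texture reference, use it,
// then release the binding.
template <class container>
void bind_positions(ucl_cudart::UCL_Texture &tex, textureReference *tref,
                    container &dev_x) {
  tex.get_texture(tref);     // remember the texture reference
  tex.bind_float(dev_x,4);   // each fetch returns a float4
  // ... launch kernels that read dev_x through the texture ...
  tex.unbind();
}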

lib/gpu/geryon/nvd_device.h Normal file
@@ -0,0 +1,359 @@
/***************************************************************************
nvd_device.h
-------------------
W. Michael Brown
Utilities for dealing with cuda devices
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jan 21 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_DEVICE
#define NVD_DEVICE
#include <string>
#include <vector>
#include <iostream>
#include "nvd_macros.h"
#include "ucl_types.h"
namespace ucl_cudadr {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef CUstream command_queue;
inline void ucl_sync(CUstream &stream) {
CU_SAFE_CALL(cuStreamSynchronize(stream));
}
struct NVDProperties {
std::string name;
int major;
int minor;
CUDA_INT_TYPE totalGlobalMem;
int multiProcessorCount;
CUdevprop_st p;
int kernelExecTimeoutEnabled;
int integrated;
int canMapHostMemory;
int concurrentKernels;
int ECCEnabled;
};
/// Class for looking at device properties
/** \note Calls that change the device outside of this class result in
* incorrect behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
/// Set the CUDA device to the specified device number
/** A context and default command queue will be created for the device **/
void set(int num);
/// Get the current device number
inline int device_num() { return _device; }
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ return _cq.size(); }
/// Add a stream for device computations
inline void push_command_queue() {
_cq.push_back(CUstream());
CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
}
/// Remove a stream for device computations
/** \note You cannot delete the default stream **/
inline void pop_command_queue() {
if (_cq.size()<2) return;
CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back()));
_cq.pop_back();
}
/// Get the current CUDA device name
inline std::string name() { return name(_device); }
/// Get the CUDA device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i) { return "GPU"; }
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;  // 8 cores per SM before Fermi
else return _properties[i].multiProcessorCount*32; }             // 32 cores per SM on Fermi
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
// Get the gigabytes of free memory in the current device
inline double free_gigabytes() { return free_gigabytes(_device); }
// Get the gigabytes of free memory
inline double free_gigabytes(const int i)
{ return static_cast<double>(free_bytes(i))/1073741824; }
// Get the bytes of free memory in the current device
inline size_t free_bytes() { return free_bytes(_device); }
// Get the bytes of free memory
inline size_t free_bytes(const int i) {
CUDA_INT_TYPE dfree, dtotal;
CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal));
return static_cast<size_t>(dfree);
}
/// Return the GPGPU compute capability for current device
inline double arch() { return arch(_device); }
/// Return the GPGPU compute capability
inline double arch(const int i)
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i)
{ return _properties[i].p.clockRate*1e-6;}
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].p.maxThreadsPerBlock; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return _properties[i].p.memPitch; }
/// List all devices along with all properties
void print_all(std::ostream &out);
private:
int _device, _num_devices;
std::vector<NVDProperties> _properties;
std::vector<CUstream> _cq;
CUdevice _cu_device;
CUcontext _context;
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuInit(0));
CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
for (int dev=0; dev<_num_devices; ++dev) {
CUdevice m;
CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
_properties.push_back(NVDProperties());
char namecstr[1024];
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
_properties.back().name=namecstr;
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major,
&_properties.back().minor,m));
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
m));
CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
#if CUDA_VERSION >= 2020
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().kernelExecTimeoutEnabled,
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().integrated,
CU_DEVICE_ATTRIBUTE_INTEGRATED, m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().canMapHostMemory,
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, m));
#endif
#if CUDA_VERSION >= 3000
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().concurrentKernels,
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().ECCEnabled,
CU_DEVICE_ATTRIBUTE_ECC_ENABLED, m));
#endif
}
_device=-1;
_cq.push_back(CUstream());
_cq.back()=0;
}
inline UCL_Device::~UCL_Device() {
if (_device>-1) {
for (int i=1; i<num_queues(); i++) pop_command_queue();
cuCtxDestroy(_context);
}
}
// Set the CUDA device to the specified device number
inline void UCL_Device::set(int num) {
if (_device==num)
return;
if (_device>-1) {
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
for (int i=1; i<num_queues(); i++) pop_command_queue();
}
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,num));
CU_SAFE_CALL_NS(cuCtxCreate(&_context,0,_cu_device));
_device=num;
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
#if CUDA_VERSION >= 2020
int driver_version;
cuDriverGetVersion(&driver_version);
out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
#endif
if (num_devices() == 0)
out << "There is no device supporting CUDA\n";
for (int i=0; i<num_devices(); ++i) {
out << "\nDevice " << i << ": \"" << name(i) << "\"\n";
out << " Type of device: "
<< device_type_name(i).c_str() << std::endl;
out << " Compute capability: "
<< arch(i) << std::endl;
out << " Double precision support: ";
if (double_precision(i))
out << "Yes\n";
else
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
#if CUDA_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
#endif
out << " Total amount of constant memory: "
<< _properties[i].p.totalConstantMemory << " bytes\n";
out << " Total amount of local/shared memory per block: "
<< _properties[i].p.sharedMemPerBlock << " bytes\n";
out << " Total number of registers available per block: "
<< _properties[i].p.regsPerBlock << std::endl;
out << " Warp size: "
<< _properties[i].p.SIMDWidth << std::endl;
out << " Maximum number of threads per block: "
<< _properties[i].p.maxThreadsPerBlock << std::endl;
out << " Maximum group size (# of threads per block) "
<< _properties[i].p.maxThreadsDim[0] << " x "
<< _properties[i].p.maxThreadsDim[1] << " x "
<< _properties[i].p.maxThreadsDim[2] << std::endl;
out << " Maximum item sizes (# threads for each dim) "
<< _properties[i].p.maxGridSize[0] << " x "
<< _properties[i].p.maxGridSize[1] << " x "
<< _properties[i].p.maxGridSize[2] << std::endl;
out << " Maximum memory pitch: "
<< max_pitch(i) << " bytes\n";
out << " Texture alignment: "
<< _properties[i].p.textureAlign << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
#if CUDA_VERSION >= 2020
out << " Run time limit on kernels: ";
if (_properties[i].kernelExecTimeoutEnabled)
out << "Yes\n";
else
out << "No\n";
out << " Integrated: ";
if (_properties[i].integrated)
out << "Yes\n";
else
out << "No\n";
out << " Support host page-locked memory mapping: ";
if (_properties[i].canMapHostMemory)
out << "Yes\n";
else
out << "No\n";
#endif
#if CUDA_VERSION >= 3000
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
else
out << "No\n";
out << " Device has ECC support enabled: ";
if (_properties[i].ECCEnabled)
out << "Yes\n";
else
out << "No\n";
#endif
}
}
}
#endif
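A brief usage sketch for the driver-API device class above (illustrative only; the function name is not from the commit):

#include <iostream>

// Pick device 0, add a second stream for asynchronous work, report basics.
int ucl_example_setup(ucl_cudadr::UCL_Device &gpu) {
  if (gpu.num_devices()==0) return -1;
  gpu.set(0);                    // creates the context and default stream
  gpu.push_command_queue();      // stream 1 for overlapping copies/kernels
  std::cout << gpu.name() << ": " << gpu.cores() << " cores at "
            << gpu.clock_rate() << " GHz\n";
  gpu.sync(1);                   // block on the extra stream
  return 0;
}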

lib/gpu/geryon/nvd_kernel.h Normal file
@@ -0,0 +1,259 @@
/***************************************************************************
nvd_kernel.h
-------------------
W. Michael Brown
Utilities for dealing with CUDA Driver kernels
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Tue Feb 9 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_KERNEL
#define NVD_KERNEL
#include "nvd_device.h"
#include <fstream>
namespace ucl_cudadr {
class UCL_Texture;
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program(UCL_Device &device) {}
inline ~UCL_Program() {}
/// Initialize the program with a device
inline void init(UCL_Device &device) { }
/// Clear any data associated with program
/** \note Must call init() after each clear **/
inline void clear() { }
/// Load a program from a file and compile with flags
inline int load(const char *filename, const char *flags="",
std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
exit(1);
#endif
return UCL_FILE_NOT_FOUND;
}
std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
in.close();
return load_string(program.c_str(),flags,log);
}
/// Load a program from a string and compile with flags
inline int load_string(const char *program, const char *flags="",
std::string *log=NULL) {
if (std::string(flags)=="BINARY")
return load_binary(program);
const unsigned int num_opts=2;
CUjit_option options[num_opts];
void *values[num_opts];
// set up size of compilation log buffer
options[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
values[0] = (void *)(int)10240;
// set up pointer to the compilation log buffer
options[1] = CU_JIT_INFO_LOG_BUFFER;
char clog[10240];
values[1] = clog;
CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
options,(void **)values);
if (log!=NULL)
*log=std::string(clog);
if (err != CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling PTX Program...\n"
<< "----------------------------------------------------------\n";
std::cerr << clog << std::endl;
#endif
return UCL_COMPILE_ERROR;
}
return UCL_SUCCESS;
}
/// Load a precompiled program from a file
inline int load_binary(const char *filename) {
CUresult err = cuModuleLoad(&_module,filename); // load into the member _module
if (err==301) { // 301 == CUDA_ERROR_FILE_NOT_FOUND
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open binary kernel file: "
<< filename << std::endl;
exit(1);
#endif
return UCL_FILE_NOT_FOUND;
} else if (err!=CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Error loading binary kernel file: "
<< filename << std::endl;
exit(1);
#endif
return UCL_FILE_NOT_FOUND;
}
//int ucl_error=UCL_SUCCESS;
//if (err==301)
// return UCL_FILE_NOT_FOUND;
//else if (err!=CUDA_SUCCESS)
// return UCL_ERROR;
return UCL_SUCCESS;
}
friend class UCL_Kernel;
private:
CUmodule _module;
friend class UCL_Texture;
};
/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _num_args(0), _param_size(0)
{ _num_blocks[0]=0; }
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0), _param_size(0)
{ _num_blocks[0]=0; set_function(program,function); }
~UCL_Kernel() {}
/// Clear any function associated with the kernel
inline void clear() { }
/// Get the kernel function from a program
/** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
inline int set_function(UCL_Program &program, const char *function) {
CUresult err=cuModuleGetFunction(&_kernel,program._module,function);
if (err!=CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not find function: " << function
<< " in program.\n";
exit(1);
#endif
return UCL_FUNCTION_NOT_FOUND;
}
return UCL_SUCCESS;
}
/// Set the kernel argument.
/** If not a device pointer, this must be repeated each time the argument
* changes
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
template <class dtype>
inline void set_arg(const unsigned index, dtype *arg) {
if (index==_num_args)
add_arg(arg);
else if (index<_num_args)
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
else
assert(0==1); // Must add kernel parameters in sequential order
}
/// Add a kernel argument.
inline void add_arg(const CUdeviceptr* const arg) {
void* ptr = (void*)(size_t)(*arg);
_param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
_offsets.push_back(_param_size);
_param_size+=sizeof(ptr);
_num_args++;
}
/// Add a kernel argument.
template <class dtype>
inline void add_arg(const dtype* const arg) {
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
_offsets.push_back(_param_size);
_param_size+=sizeof(dtype);
_num_args++;
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called after all arguments have been added **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks;
_num_blocks[1]=1;
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
block_size_z));
}
/// Run the kernel in the default command queue
inline void run() {
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],0));
}
/// Run the kernel in the specified command queue
inline void run(command_queue &cq) {
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],cq));
}
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; _offsets.clear(); _param_size=0; }
#include "ucl_arg_kludge.h"
private:
CUfunction _kernel;
unsigned _dimensions;
unsigned _num_blocks[2];
unsigned _num_args;
std::vector<unsigned> _offsets;
unsigned _param_size;
friend class UCL_Texture;
};
} // namespace
#endif
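The compile/launch flow of the two classes above, as a hedged sketch (the PTX file name scale_kernel.ptx and kernel name k_scale are hypothetical):

// JIT a PTX module, fetch one kernel, push arguments in order, and launch.
int ucl_example_launch(ucl_cudadr::UCL_Device &gpu, CUdeviceptr x, int n) {
  ucl_cudadr::UCL_Program prog(gpu);
  if (prog.load("scale_kernel.ptx")!=UCL_SUCCESS)
    return -1;
  ucl_cudadr::UCL_Kernel k(prog,"k_scale");   // cuModuleGetFunction under the hood
  k.add_arg(&x);                              // device pointer argument
  k.add_arg(&n);                              // plain int argument
  k.set_size((n+127)/128,128);                // grid blocks, threads per block
  k.run(gpu.cq());                            // asynchronous launch
  gpu.sync();
  return 0;
}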

@@ -0,0 +1,57 @@
#ifndef NVD_MACROS_H
#define NVD_MACROS_H
#include <stdio.h>
#include <cassert>
#include <cuda.h>
#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
#else
#define CUDA_INT_TYPE unsigned
#endif
#ifdef MPI_GERYON
#include "mpi.h"
#define NVD_GERYON_EXIT MPI_Abort(MPI_COMM_WORLD,-1)
#else
#define NVD_GERYON_EXIT assert(0==1)
#endif
#ifndef UCL_NO_API_CHECK
#define CU_SAFE_CALL_NS( call ) do { \
CUresult err = call; \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %d in call at file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
NVD_GERYON_EXIT; \
} } while (0)
#ifdef UCL_SYNC_DEBUG
#define CU_SAFE_CALL( call ) do { \
CU_SAFE_CALL_NS( call ); \
CUresult err=cuCtxSynchronize(); \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %d in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
NVD_GERYON_EXIT; \
} } while (0)
#else
#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )
#endif
#else // not DEBUG
// void macros for performance reasons
#define CU_SAFE_CALL_NS( call ) call
#define CU_SAFE_CALL( call) call
#endif
#endif
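For illustration, the driver-API macro above wrapped around an allocation (the helper name is invented; CUDA_INT_TYPE comes from this header):

// Checked cuMemAlloc: a failing call prints the CUresult code with file/line.
inline CUdeviceptr ucl_example_dev_alloc(const CUDA_INT_TYPE nbytes) {
  CUdeviceptr ptr;
  CU_SAFE_CALL(cuMemAlloc(&ptr,nbytes));
  return ptr;
}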

lib/gpu/geryon/nvd_mat.h Normal file
@@ -0,0 +1,54 @@
/***************************************************************************
nvd_mat.h
-------------------
W. Michael Brown
CUDA Driver Specific Vector/Matrix Containers, Memory Management, and I/O
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jan 21 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef NVD_MAT_H
#define NVD_MAT_H
#include "nvd_memory.h"
/// Namespace for CUDA Driver routines
namespace ucl_cudadr {
#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW
#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW
#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_cudadr
#endif

lib/gpu/geryon/nvd_memory.h Normal file
@@ -0,0 +1,610 @@
/***************************************************************************
nvd_memory.h
-------------------
W. Michael Brown
CUDA Driver Specific Memory Management and Vector/Matrix Containers
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jan 21 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_MEMORY_H
#define NVD_MEMORY_H
#include <iostream>
#include <cassert>
#include <cstring>
#include "nvd_macros.h"
#include "ucl_types.h"
namespace ucl_cudadr {
// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
//typedef dim3 ucl_kernel_dim;
// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
typedef CUdeviceptr device_ptr;
// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
CUresult err=CUDA_SUCCESS;
if (kind==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (kind==UCL_WRITE_OPTIMIZED)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
CUresult err=CUDA_SUCCESS;
if (kind==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (kind==UCL_WRITE_OPTIMIZED)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
if (kind!=UCL_NOT_PINNED)
CU_SAFE_CALL(cuMemFreeHost(mat.begin()));
else
free(mat.begin());
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
CUresult err=cuMemAlloc(&mat.cbegin(),n);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
CUresult err=cuMemAlloc(&mat.cbegin(),n);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
CUresult err;
CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
CUresult err;
CUDA_INT_TYPE upitch; // size_t for CUDA >= 3.2, matching cuMemAllocPitch
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline void _device_free(mat_type &mat) {
CU_SAFE_CALL(cuMemFree(mat.cbegin()));
}
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
*ptr=in;
}
template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
*ptr=0;
}
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
const size_t offset, const size_t numsize) {
*ptr=in+offset*numsize;
}
template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in,
const size_t offset, const size_t numsize) {
*ptr=0;
}
// --------------------------------------------------------------------------
// - DEVICE IMAGE ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type>
inline void _device_image_free(mat_type &mat) {
assert(0==1);
}
// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
inline void _host_zero(void *ptr, const size_t n) {
memset(ptr,0,n);
}
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n) {
if (n%32==0)
CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4));
else if (n%16==0)
CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2));
else
CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n));
}
// --------------------------------------------------------------------------
// - HELPER FUNCTIONS FOR MEMCPY ROUTINES
// --------------------------------------------------------------------------
inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
const size_t spitch, const size_t cols,
const size_t rows) {
ins.srcXInBytes=0;
ins.srcY=0;
ins.srcPitch=spitch;
ins.dstXInBytes=0;
ins.dstY=0;
ins.dstPitch=dpitch;
ins.WidthInBytes=cols;
ins.Height=rows;
}
template <int mem> struct _nvd_set_2D_mem;
template <> struct _nvd_set_2D_mem<1>
{ static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
template <> struct _nvd_set_2D_mem<2>
{ static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
template <int mem> struct _nvd_set_2D_mem
{ static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };
// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------
template<int mem1, int mem2> struct _ucl_memcpy;
// Both are images
template<> struct _ucl_memcpy<2,2> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Destination is texture, source on device
template<> struct _ucl_memcpy<2,0> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Destination is texture, source on host
template<> struct _ucl_memcpy<2,1> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Source is texture, dest on device
template<> struct _ucl_memcpy<0,2> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Source is texture, dest on host
template<> struct _ucl_memcpy<1,2> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Neither are textures, destination on host
template <> struct _ucl_memcpy<1,0> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
CU_SAFE_CALL(cuMemcpyDtoH(dst.begin(),src.cbegin(),n));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Neither are textures, source on host
template <> struct _ucl_memcpy<0,1> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
CU_SAFE_CALL(cuMemcpyHtoD(dst.cbegin(),src.begin(),n));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Neither are textures, both on host
template <> struct _ucl_memcpy<1,1> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n)
{ memcpy(dst.begin(),src.begin(),n); }
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq)
{ memcpy(dst.begin(),src.begin(),n); }
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Neither are textures, both on device
template <int mem1, int mem2> struct _ucl_memcpy {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0;
for (size_t i=0; i<rows; i++) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols));
src_offset+=spitch;
dst_offset+=dpitch;
}
} else {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0;
for (size_t i=0; i<rows; i++) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols));
src_offset+=spitch;
dst_offset+=dpitch;
}
} else {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
}
};
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n);
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
CUstream &cq) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,cq);
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows);
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows,CUstream &cq) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows,cq);
}
} // namespace ucl_cudadr
#endif
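The copy routines above dispatch on a MEM_TYPE trait carried by the container types. A hedged sketch with two toy stand-ins (not the real UCL containers) shows the interface they must provide for a 1D host-to-device copy:

// MEM_TYPE 0 = device memory, 1 = host memory, 2 = texture/image;
// PADDED is only consulted by the pitched device-to-device 2D path.
struct ToyDevVec {
  enum { MEM_TYPE=0, PADDED=0 };
  CUdeviceptr ptr;
  CUdeviceptr & cbegin() { return ptr; }
  const CUdeviceptr & cbegin() const { return ptr; }
};
struct ToyHostVec {
  enum { MEM_TYPE=1, PADDED=0 };
  float *ptr;
  float * begin() { return ptr; }
  const float * begin() const { return ptr; }
};

// Resolves to _ucl_memcpy<0,1>::mc, i.e. cuMemcpyHtoD.
inline void toy_upload(ToyDevVec &dst, const ToyHostVec &src, const size_t nbytes) {
  ucl_cudadr::ucl_mv_cpy(dst,src,nbytes);
}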

@@ -0,0 +1,71 @@
/***************************************************************************
nvd_texture.h
-------------------
W. Michael Brown
Utilities for dealing with CUDA Driver textures
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jul 2 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_TEXTURE
#define NVD_TEXTURE
#include "nvd_kernel.h"
#include "nvd_mat.h"
namespace ucl_cudadr {
/// Class storing a texture reference
class UCL_Texture {
public:
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(UCL_Program &prog, const char *texture_name)
{ get_texture(prog,texture_name); }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name)
{ CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
vec.numel()*vec.element_size()));
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
}
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) {
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
}
private:
CUtexref _tex;
friend class UCL_Kernel;
};
} // namespace
#endif
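A hedged pairing of the driver-API texture wrapper with a compiled program (the texture name pos_tex is hypothetical; the container is any UCL device type exposing cbegin(), numel() and element_size()):

// Fetch a texture reference from the module, bind a float4 array, and make it
// available to a kernel before launch.
template <class container>
void ucl_example_bind(ucl_cudadr::UCL_Program &prog, ucl_cudadr::UCL_Kernel &k,
                      ucl_cudadr::UCL_Texture &tex, container &dev_x) {
  tex.get_texture(prog,"pos_tex");  // cuModuleGetTexRef on the module
  tex.bind_float(dev_x,4);          // float4 fetches
  tex.allow(k);                     // cuParamSetTexRef for the kernel
}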

lib/gpu/geryon/nvd_timer.h Normal file
@@ -0,0 +1,106 @@
/***************************************************************************
nvd_timer.h
-------------------
W. Michael Brown
Class for timing CUDA Driver routines
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jan 22 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_TIMER_H
#define NVD_TIMER_H
#include "nvd_macros.h"
namespace ucl_cudadr {
/// Class for timing CUDA Driver events
class UCL_Timer {
public:
inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
{ init(dev); }
inline ~UCL_Timer() { clear(); }
/// Clear any data associated with timer
/** \note init() must be called to reuse timer after a clear() **/
inline void clear() {
if (_initialized) {
CU_SAFE_CALL(cuEventDestroy(start_event));
CU_SAFE_CALL(cuEventDestroy(stop_event));
_initialized=false;
_total_time=0.0;
}
}
/// Initialize default command queue for timing
inline void init(UCL_Device &dev) { init(dev, dev.cq()); }
/// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) {
clear();
_cq=cq;
_initialized=true;
CU_SAFE_CALL( cuEventCreate(&start_event,0) );
CU_SAFE_CALL( cuEventCreate(&stop_event,0) );
}
/// Start timing on command queue
inline void start() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); }
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero() {
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
}
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
float timer;
CU_SAFE_CALL(cuEventSynchronize(stop_event));
CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) );
return timer;
}
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
/// Return the total time in ms
inline double total_time() { return _total_time; }
/// Return the total time in seconds
inline double total_seconds() { return _total_time/1000.0; }
private:
CUevent start_event, stop_event;
CUstream _cq;
double _total_time;
bool _initialized;
};
} // namespace
#endif
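A short timing sketch using only the methods above (the enqueued work is left as a placeholder comment):

// Time whatever is enqueued between start() and stop() on the default queue.
inline double ucl_example_time(ucl_cudadr::UCL_Device &gpu) {
  ucl_cudadr::UCL_Timer t(gpu);
  t.start();
  // ... enqueue kernels/copies on gpu.cq() here ...
  t.stop();
  t.add_to_total();        // synchronizes on the stop event
  return t.total_seconds();
}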

lib/gpu/geryon/ocl_device.h Normal file
@@ -0,0 +1,449 @@
/***************************************************************************
ocl_device.h
-------------------
W. Michael Brown
Utilities for dealing with OpenCL devices
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon Dec 23 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef OCL_DEVICE
#define OCL_DEVICE
#include <string>
#include <vector>
#include <iostream>
#include "CL/cl.h"
#include "CL/cl_platform.h"
#include "ocl_macros.h"
#include "ucl_types.h"
namespace ucl_opencl {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef cl_command_queue command_queue;
typedef cl_context context_type;
inline void ucl_sync(cl_command_queue &cq) {
CL_SAFE_CALL(clFinish(cq));
}
struct OCLProperties {
std::string name;
cl_device_type device_type;
cl_ulong global_mem;
cl_ulong shared_mem;
cl_ulong const_mem;
cl_uint compute_units;
cl_uint clock;
size_t work_group_size;
size_t work_item_size[3];
bool double_precision;
int alignment;
size_t timer_resolution;
};
/// Class for looking at data parallel device properties
/** \note Calls that change the device outside of this class result in
* incorrect behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every device on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
~UCL_Device();
/// Return the number of platforms (0 if error or no platforms)
inline int num_platforms() { return _num_platforms; }
/// Return a string with name and info of the current platform
std::string platform_name();
/// Return the number of devices that support OpenCL
inline int num_devices() { return _num_devices; }
/// Set the OpenCL device to the specified device number
/** A context and default command queue will be created for the device **/
void set(int num);
/// Get the current device number
inline int device_num() { return _device; }
/// Returns the context for the current device
inline cl_context & context() { return _context; }
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ return _cq.size(); }
/// Add a command queue for device computations (with profiling enabled)
inline void push_command_queue() {
cl_int errorv;
_cq.push_back(cl_command_queue());
_cq.back()=clCreateCommandQueue(_context,_cl_device,
CL_QUEUE_PROFILING_ENABLE,&errorv);
if (errorv!=CL_SUCCESS) {
std::cerr << "Could not create command queue on device: " << name()
<< std::endl;
exit(1);
}
}
/// Remove a stream for device computations
/** \note You cannot delete the default stream **/
inline void pop_command_queue() {
if (_cq.size()<2) return;
CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
/// Get the current OpenCL device name
inline std::string name() { return name(_device); }
/// Get the OpenCL device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i);
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i);
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return _properties[i].double_precision;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (device_type(i)==UCL_CPU) return _properties[i].compute_units;
else return _properties[i].compute_units*8; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].global_mem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].global_mem; }
/// Return the GPGPU revision number for current device
//inline double revision() { return revision(_device); }
/// Return the GPGPU revision number
//inline double revision(const int i)
// { return //static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i) { return _properties[i].clock*1e-3;}
/// Return the address alignment in bytes
inline int alignment() { return alignment(_device); }
/// Return the address alignment in bytes
inline int alignment(const int i) { return _properties[i].alignment; }
/// Return the timer resolution
inline size_t timer_resolution() { return timer_resolution(_device); }
/// Return the timer resolution
inline size_t timer_resolution(const int i)
{ return _properties[i].timer_resolution; }
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].work_group_size; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return 0; }
/// List all devices along with all properties
void print_all(std::ostream &out);
/// Return the OpenCL type for the device
inline cl_device_id & cl_device() { return _cl_device; }
private:
int _num_platforms; // Number of platforms
int _platform; // UCL_Device ID for current platform
cl_platform_id _cl_platform; // OpenCL ID for current platform
cl_context _context; // Context used for accessing the device
std::vector<cl_command_queue> _cq;// The default command queue for this device
int _device; // UCL_Device ID for current device
cl_device_id _cl_device; // OpenCL ID for current device
std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices
int _num_devices; // Number of devices
std::vector<OCLProperties> _properties; // Properties for each device
void add_properties(cl_device_id);
void create_context();
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
cl_int errorv;
cl_uint nplatforms;
_cl_device=0;
_device=-1;
_num_devices=0;
_platform=0;
// --- Get Number of Platforms
errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms);
if (errorv!=CL_SUCCESS) {
_num_platforms=0;
return;
} else
_num_platforms=static_cast<int>(nplatforms);
// --- Get Number of Devices
cl_uint n;
errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n);
_num_devices=n;
if (errorv!=CL_SUCCESS || _num_devices==0) {
_num_devices=0;
return;
}
cl_device_id device_list[_num_devices];
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
&n));
// --- Store properties for each device
for (int i=0; i<_num_devices; i++) {
_cl_devices.push_back(device_list[i]);
add_properties(device_list[i]);
}
}
inline UCL_Device::~UCL_Device() {
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_SAFE_CALL(clReleaseContext(_context));
}
}
inline void UCL_Device::create_context() {
cl_int errorv;
cl_context_properties props[3];
props[0]=CL_CONTEXT_PLATFORM;
props[1]=_platform;
props[2]=0;
_context=clCreateContext(0,1,&_cl_device,NULL,NULL,&errorv);
if (errorv!=CL_SUCCESS) {
std::cerr << "Could not create context on device: " << name() << std::endl;
exit(1);
}
push_command_queue();
}
inline void UCL_Device::add_properties(cl_device_id device_list) {
OCLProperties op;
char buffer[1024];
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
op.name=buffer;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE,
sizeof(op.global_mem),&op.global_mem,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_LOCAL_MEM_SIZE,
sizeof(op.shared_mem),&op.shared_mem,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
sizeof(op.const_mem),&op.const_mem,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_TYPE,
sizeof(op.device_type),&op.device_type,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(op.compute_units),&op.compute_units,
NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CLOCK_FREQUENCY,
sizeof(op.clock),&op.clock,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(op.work_group_size),&op.work_group_size,
NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_ITEM_SIZES,
3*sizeof(op.work_item_size[0]),op.work_item_size,
NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(cl_uint),&op.alignment,NULL));
op.alignment/=8;
// Determine if double precision is supported
cl_uint double_width;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(double_width),&double_width,NULL));
if (double_width==0)
op.double_precision=false;
else
op.double_precision=true;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PROFILING_TIMER_RESOLUTION,
sizeof(size_t),&op.timer_resolution,NULL));
_properties.push_back(op);
}
inline std::string UCL_Device::platform_name() {
char info[1024];
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
NULL));
std::string ans=std::string(info)+' ';
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info,
NULL));
ans+=std::string(info)+' ';
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info,
NULL));
ans+=std::string(info);
return ans;
}
// Get a string telling the type of the device
inline std::string UCL_Device::device_type_name(const int i) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
return "CPU";
else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
return "GPU";
else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR)
return "ACCELERATOR";
else
return "DEFAULT";
}
// Get the enumerated type of the device
inline int UCL_Device::device_type(const int i) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
return UCL_CPU;
else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
return UCL_GPU;
else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR)
return UCL_ACCELERATOR;
else
return UCL_DEFAULT;
}
// Set the OpenCL device to the specified device number
inline void UCL_Device::set(int num) {
if (_device==num)
return;
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_SAFE_CALL(clReleaseContext(_context));
}
cl_device_id device_list[_num_devices];
cl_uint n;
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
device_list,&n));
_device=num;
_cl_device=device_list[_device];
create_context();
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
if (num_devices() == 0)
out << "There is no device supporting OpenCL\n";
for (int i=0; i<num_devices(); ++i) {
out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
out << " Type of device: "
<< device_type_name(i).c_str() << std::endl;
out << " Double precision support: ";
if (double_precision(i))
out << "Yes\n";
else
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
out << " Number of compute units/multiprocessors: "
<< _properties[i].compute_units << std::endl;
//out << " Number of cores: "
// << cores(i) << std::endl;
out << " Total amount of constant memory: "
<< _properties[i].const_mem << " bytes\n";
out << " Total amount of local/shared memory per block: "
<< _properties[i].shared_mem << " bytes\n";
//out << " Total number of registers available per block: "
// << _properties[i].regsPerBlock << std::endl;
//out << " Warp size: "
// << _properties[i].warpSize << std::endl;
out << " Maximum group size (# of threads per block) "
<< _properties[i].work_group_size << std::endl;
out << " Maximum item sizes (# threads for each dim) "
<< _properties[i].work_item_size[0] << " x "
<< _properties[i].work_item_size[1] << " x "
<< _properties[i].work_item_size[2] << std::endl;
//out << " Maximum sizes of each dimension of a grid: "
// << _properties[i].maxGridSize[0] << " x "
// << _properties[i].maxGridSize[1] << " x "
// << _properties[i].maxGridSize[2] << std::endl;
//out << " Maximum memory pitch: "
// << _properties[i].memPitch) << " bytes\n";
//out << " Texture alignment: "
// << _properties[i].textureAlignment << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
//out << " Concurrent copy and execution: ";
}
}
}
#endif
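A minimal usage sketch (not part of the original commit) showing how the UCL_Device class above is intended to be driven; it only calls members declared in ocl_device.h and assumes at least one OpenCL platform and device are installed.

#include "ocl_device.h"
#include <iostream>

int main() {
  ucl_opencl::UCL_Device dev;           // constructor enumerates platforms/devices
  if (dev.num_devices()==0) {
    std::cerr << "No OpenCL devices found.\n";
    return 1;
  }
  dev.print_all(std::cout);             // dump the stored properties for every device
  dev.set(0);                           // create context + default queue on device 0
  std::cout << "Using " << dev.name() << " with "
            << dev.gigabytes() << " GB of global memory\n";
  dev.push_command_queue();             // add a second, profiling-enabled queue
  dev.sync(1);                          // block until queue 1 is idle
  return 0;
}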

254
lib/gpu/geryon/ocl_kernel.h Normal file
View File

@@ -0,0 +1,254 @@
/***************************************************************************
ocl_kernel.h
-------------------
W. Michael Brown
Utilities for dealing with OpenCL kernels
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Sun Feb 7 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef OCL_KERNEL
#define OCL_KERNEL
#include "ocl_device.h"
#include <fstream>
namespace ucl_opencl {
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program() : _init_done(false) {}
inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
inline ~UCL_Program() { clear(); }
/// Initialize the program with a device
inline void init(UCL_Device &device) {
clear();
_device=device.cl_device();
_context=device.context();
_cq=device.cq();
CL_SAFE_CALL(clRetainContext(_context));
CL_SAFE_CALL(clRetainCommandQueue(_cq));
_init_done=true;
}
/// Clear any data associated with program
/** \note Must call init() after each clear **/
inline void clear() {
if (_init_done) {
CL_SAFE_CALL(clReleaseProgram(_program));
CL_SAFE_CALL(clReleaseContext(_context));
CL_SAFE_CALL(clReleaseCommandQueue(_cq));
_init_done=false;
}
}
/// Load a program from a file and compile with flags
inline int load(const char *filename, const char *flags="",
std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
exit(1);
#endif
return UCL_FILE_NOT_FOUND;
}
std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
in.close();
return load_string(program.c_str(),flags,log);
}
/// Load a program from a string and compile with flags
inline int load_string(const char *program, const char *flags="",
std::string *log=NULL) {
cl_int error_flag;
const char *prog=program;
_program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag);
CL_CHECK_ERR(error_flag);
error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL);
cl_build_status build_status;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,
CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status),&build_status,
NULL));
if (build_status != CL_SUCCESS || log!=NULL) {
size_t ms;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
NULL, &ms));
char build_log[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms,
build_log, NULL));
if (log!=NULL)
*log=std::string(build_log);
if (build_status != CL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling OpenCL Program...\n"
<< "----------------------------------------------------------\n";
std::cerr << build_log << std::endl;
#endif
return UCL_COMPILE_ERROR;
}
}
return UCL_SUCCESS;
}
friend class UCL_Kernel;
private:
bool _init_done;
cl_program _program;
cl_device_id _device;
cl_context _context;
cl_command_queue _cq;
};
/// Class for dealing with OpenCL kernels
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0)
{ _block_size[0]=0; _num_blocks[0]=0; }
inline UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _function_set(false), _num_args(0)
{ _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); }
inline ~UCL_Kernel() { clear(); }
/// Clear any function associated with the kernel
inline void clear() {
if (_function_set) {
clReleaseKernel(_kernel);
clReleaseProgram(_program);
clReleaseCommandQueue(_cq);
_function_set=false;
}
}
/// Get the kernel function from a program
/** \return UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
inline int set_function(UCL_Program &program, const char *function);
/// Set the kernel argument.
/** If not a device pointer, this must be repeated each time the argument
* changes **/
template <class dtype>
inline void set_arg(const cl_uint index, dtype *arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
if (index>_num_args) _num_args=index;
}
/// Add a kernel argument.
template <class dtype>
inline void add_arg(dtype *arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
_num_args++;
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks*block_size;
_block_size[0]=block_size;
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y;
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=3;
const size_t num_blocks_z=1;
_num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y;
_num_blocks[2]=num_blocks_z*block_size_z;
_block_size[2]=block_size_z;
}
/// Run the kernel in the default command queue
inline void run() {
run(_cq);
}
/// Run the kernel in the specified command queue
inline void run(command_queue &cq) {
CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,_kernel,_dimensions,NULL,
_num_blocks,_block_size,0,NULL,NULL));
}
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; }
#include "ucl_arg_kludge.h"
private:
cl_kernel _kernel;
cl_program _program;
cl_uint _dimensions;
size_t _block_size[3];
size_t _num_blocks[3];
bool _function_set;
cl_command_queue _cq; // The default command queue for this kernel
unsigned _num_args;
};
inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
clear();
_function_set=true;
_cq=program._cq;
CL_SAFE_CALL(clRetainCommandQueue(_cq));
_program=program._program;
CL_SAFE_CALL(clRetainProgram(_program));
cl_int error_flag;
_kernel=clCreateKernel(program._program,function,&error_flag);
if (error_flag!=CL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not find function: " << function
<< " in program.\n";
exit(1);
#endif
return UCL_FUNCTION_NOT_FOUND;
}
return UCL_SUCCESS;
}
} // namespace
#endif
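A hedged sketch (not in the original commit) of the compile-and-launch path offered by UCL_Program and UCL_Kernel above. The kernel source string, the function name "scale", the 128-thread block size, and the buffer x (created elsewhere with clCreateBuffer) are illustrative assumptions; the rest uses only members declared in this header, in ocl_device.h, and the UCL_* status codes referenced by load_string.

#include "ocl_device.h"
#include "ocl_kernel.h"

using namespace ucl_opencl;

// Hypothetical OpenCL C source; any valid kernel string works here.
static const char *scale_src =
  "__kernel void scale(__global float *x, const float f, const int n) {"
  "  int i=get_global_id(0); if (i<n) x[i]*=f; }";

int scale_on_device(UCL_Device &dev, cl_mem x, float f, int n) {
  UCL_Program prog(dev);                // retains the device context and queue
  std::string build_log;
  if (prog.load_string(scale_src,"",&build_log)!=UCL_SUCCESS)
    return UCL_COMPILE_ERROR;           // build_log holds the compiler output
  UCL_Kernel k(prog,"scale");
  k.set_size((n+127)/128,128);          // global size = num_blocks * block_size
  k.run(&x,&f,&n);                      // overload supplied by ucl_arg_kludge.h
  dev.sync();                           // wait for the launch in the default queue
  return UCL_SUCCESS;
}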

56
lib/gpu/geryon/ocl_mat.h Normal file
View File

@@ -0,0 +1,56 @@
/***************************************************************************
ocl_mat.h
-------------------
W. Michael Brown
OpenCL Specific Vector/Matrix Containers, Memory Management, and I/O
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Wed Jan 13 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef OCL_MAT_H
#define OCL_MAT_H
#include "ocl_memory.h"
/// Namespace for OpenCL routines
namespace ucl_opencl {
#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#define _OCL_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _OCL_MAT
#undef _UCL_MAT_ALLOW
#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW
#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_opencl
#endif
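A sketch (not in this commit) of how the containers pulled in above pair with the ucl_copy routines from ucl_copy.h. To avoid assuming the container allocation API, it takes already-allocated host and device containers (for example a UCL_H_Vec<float> and a UCL_D_Vec<float> of at least n elements) and relies only on the copy signatures documented in ucl_copy.h plus the sync() member from ucl_basemat.h.

#include "ocl_mat.h"

// Assumes `host` and `device` are already-allocated Geryon containers.
template <class host_vec, class device_vec>
void round_trip(host_vec &host, device_vec &device, const size_t n,
                ucl_opencl::command_queue &cq) {
  ucl_copy(device,host,false);     // blocking host -> device copy of all elements
  ucl_copy(host,device,n,cq);      // asynchronous copy of n elements in queue cq
  host.sync();                     // block until the queued copy has completed
}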

59
lib/gpu/geryon/ocl_texture.h Normal file
View File

@@ -0,0 +1,59 @@
/***************************************************************************
ocl_texture.h
-------------------
W. Michael Brown
Utilities for dealing with OpenCL textures
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jul 2 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef OCL_TEXTURE
#define OCL_TEXTURE
#include "ocl_kernel.h"
#include "ocl_mat.h"
namespace ucl_opencl {
/// Class storing a texture reference
class UCL_Texture {
public:
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(UCL_Program &prog, const char *texture_name) { }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name) { }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) { }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) { }
private:
friend class UCL_Kernel;
};
} // namespace
#endif

111
lib/gpu/geryon/ocl_timer.h Normal file
View File

@@ -0,0 +1,111 @@
/***************************************************************************
ocl_timer.h
-------------------
W. Michael Brown
Class for timing OpenCL routines
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jan 22 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef OCL_TIMER_H
#define OCL_TIMER_H
#include "ocl_macros.h"
namespace ucl_opencl {
/// Class for timing OpenCL events
class UCL_Timer {
public:
inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
{ init(dev); }
inline ~UCL_Timer() { clear(); }
/// Clear any data associated with timer
/** \note init() must be called to reuse timer after a clear() **/
inline void clear() {
if (_initialized) {
CL_SAFE_CALL(clReleaseCommandQueue(_cq));
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
_initialized=false;
_total_time=0.0;
}
}
/// Initialize default command queue for timing
inline void init(UCL_Device &dev) { init(dev,dev.cq()); }
/// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) {
clear();
t_factor=dev.timer_resolution()/1000000000.0;
_cq=cq;
clRetainCommandQueue(_cq);
_initialized=true;
}
/// Start timing on default command queue
inline void start() { clEnqueueMarker(_cq,&start_event); }
/// Stop timing on default command queue
inline void stop() { clEnqueueMarker(_cq,&stop_event); }
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
cl_ulong tstart,tend;
CL_SAFE_CALL(clWaitForEvents(1,&stop_event));
CL_SAFE_CALL(clGetEventProfilingInfo(stop_event,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &tend, NULL));
CL_SAFE_CALL(clGetEventProfilingInfo(start_event,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &tstart, NULL));
return (tend-tstart)*t_factor;
}
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
/// Return the total time in ms
inline double total_time() { return _total_time; }
/// Return the total time in seconds
inline double total_seconds() { return _total_time/1000.0; }
private:
cl_event start_event, stop_event;
cl_command_queue _cq;
double _total_time;
bool _initialized;
double t_factor;
};
} // namespace
#endif
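A minimal sketch (not part of the commit) of timing queued work with the UCL_Timer above; it assumes the queue being timed was created with profiling enabled, which push_command_queue() in ocl_device.h does for every queue it creates.

#include "ocl_device.h"
#include "ocl_timer.h"
#include <iostream>

void time_queued_work(ucl_opencl::UCL_Device &dev) {
  ucl_opencl::UCL_Timer t(dev);      // places marker events in dev.cq()
  t.start();
  // ... enqueue kernels and/or copies in dev.cq() here ...
  t.stop();
  double secs = t.add_to_total();    // synchronizes; returns seconds, accumulates ms
  std::cout << "Last interval: " << secs << " s, running total: "
            << t.total_seconds() << " s\n";
}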

673
lib/gpu/geryon/ucl_arg_kludge.h Normal file
View File

@@ -0,0 +1,673 @@
/***************************************************************************
ucl_arg_kludge.h
-------------------
W. Michael Brown
Allow multiple arguments to be added for a kernel call at a single time
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Sun Feb 7 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
template <class t1, class t2>
inline void add_args(t1 *a1, t2 *a2) {
add_arg(a1); add_arg(a2);
}
template <class t1, class t2, class t3>
inline void add_args(t1 *a1, t2 *a2, t3 *a3) {
add_arg(a1); add_arg(a2); add_arg(a3);
}
template <class t1, class t2, class t3, class t4>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
}
template <class t1, class t2, class t3, class t4, class t5>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
}
// ---------------------------------------------------------------------------
template <class t1>
inline void run(t1 *a1) {
clear_args();
add_arg(a1);
run();
}
template <class t1, class t2>
inline void run(t1 *a1, t2 *a2) {
clear_args();
add_arg(a1); add_arg(a2);
run();
}
template <class t1, class t2, class t3>
inline void run(t1 *a1, t2 *a2, t3 *a3) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3);
run();
}
template <class t1, class t2, class t3, class t4>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
run();
}
template <class t1, class t2, class t3, class t4, class t5>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
run();
}
// ---------------------------------------------------------------------------
template <class t1>
inline void run_cq(command_queue &cq, t1 *a1) {
clear_args();
add_arg(a1);
run(cq);
}
template <class t1, class t2>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2) {
clear_args();
add_arg(a1); add_arg(a2);
run(cq);
}
template <class t1, class t2, class t3>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3);
run(cq);
}
template <class t1, class t2, class t3, class t4>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
run(cq);
}
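A brief sketch (not in the commit): because ucl_arg_kludge.h is included inside the UCL_Kernel class body, the overloads above let a caller set every argument and launch in a single call. The kernel k is assumed to already have a function and launch size set (see ocl_kernel.h); buf, f and n are illustrative arguments.

#include "ocl_kernel.h"

void launch_twice(ucl_opencl::UCL_Kernel &k, ucl_opencl::command_queue &cq,
                  cl_mem buf, float f, int n) {
  k.run(&buf,&f,&n);             // default queue: clear_args + 3x add_arg + run()
  k.run_cq(cq,&buf,&f,&n);       // same arguments, enqueued in the caller's queue
}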

77
lib/gpu/geryon/ucl_basemat.h Normal file
View File

@@ -0,0 +1,77 @@
/***************************************************************************
ucl_basemat.h
-------------------
W. Michael Brown
Vector/Matrix Base Container
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
#include "ucl_types.h"
#define UCL_H_VecT UCL_H_Vec<numtyp>
#define UCL_H_VecD UCL_H_Vec<double>
#define UCL_H_VecS UCL_H_Vec<float>
#define UCL_H_VecI UCL_H_Vec<int>
#define UCL_D_VecT UCL_D_Vec<numtyp>
#define UCL_D_VecD UCL_D_Vec<double>
#define UCL_D_VecS UCL_D_Vec<float>
#define UCL_D_VecI UCL_D_Vec<int>
#define UCL_D_VecI2 UCL_D_Vec<int2>
#define UCL_D_VecU2 UCL_D_Vec<uint2>
#define UCL_D_MatT UCL_D_Mat<numtyp>
#define UCL_D_MatD UCL_D_Mat<double>
#define UCL_D_MatS UCL_D_Mat<float>
#define UCL_D_MatI UCL_D_Mat<int>
#define UCL_ConstMatT UCL_ConstMat<numtyp>
#define UCL_ConstMatD UCL_ConstMat<double>
#define UCL_ConstMatS UCL_ConstMat<float>
#define UCL_ConstMatI UCL_ConstMat<int>
#define UCL_ConstMatD2 UCL_ConstMat<double2>
/// Base class for vector/matrix containers
/** All containers are associated with a default command queue.
* For CUDA, this is the default stream.
*
* The default queue is used for asynchronous operations on the container
* that do not specify a queue. For OpenCL, this queue is also used in
* calls for reserving and copying memory **/
class UCL_BaseMat {
public:
UCL_BaseMat() : _cq(0) { }
virtual ~UCL_BaseMat() { }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Block until command_queue associated with matrix is complete
inline void sync() { ucl_sync(_cq); }
#ifdef UCL_DEBUG
// Returns the type of host allocation
virtual inline enum UCL_MEMOPT kind() const { return UCL_NOT_PINNED; }
#endif
protected:
command_queue _cq;
};
#endif

826
lib/gpu/geryon/ucl_copy.h Normal file
View File

@@ -0,0 +1,826 @@
/***************************************************************************
ucl_copy.h
-------------------
W. Michael Brown
Routines for copying matrix/vector data onto and off coprocessor device
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon Jan 4 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/***************************************************************************
The ucl_copy and ucl_cast_copy routines provide a general prototype for
copying data between host and device memory (including texture memory)
for the matrix and vector types in nvc_memory.
For host/host and host/device transfers, typecasting is performed
automatically as necessary.
The routines are written so that all branches can be removed by the
compiler during template instantiation.
The routines currently assume row-major ordering for all types.
For an asynchronous copy in the default command queue, set async to boolean
true; for an asynchronous copy in a specified command queue, pass that
command queue as async. Otherwise, set async to boolean false.
When performing frequent data copies that require casting, it is more
efficient to allocate a casting buffer once and then pass that buffer
to the copy routine. This can be accomplished with the ucl_cast_copy
routines.
Examples
(x's represent alignment padding - to maintain alignment)
(o's represent a larger matrix in memory)
(vectors represented as single row)
----------------------------------------------------------------
   dst              src                   command
----------------------------------------------------------------
   0 1 2 3 4    <-- 0 1 2 3 4             ucl_copy(dst,src,async)
   0 1 2 3      <-- 0 1 2 3 4             ucl_copy(dst,src,4,async)
   0 1 2        <-- 0 1 2 3 4 5           ucl_copy(dst,src,async)
   3 4 5
   0 1 2 3 4 5  <-- 0 1 2                 ucl_copy(dst,src,async)
                    3 4 5
   0 1 2        <-- 0 1 2                 ucl_copy(dst,src,async)
   3 4 5            3 4 5
   0 1 2        <-- 0 1 2                 ucl_copy(dst,src,6,async)
   3 4 5            3 4 5
                    5 6 7
   0 1 2        <-- 0 1 2 3               ucl_copy(dst,src,2,3,async)
   4 5 6            4 5 6 7
                    8 9 10 11
   0 1 2 x x    <-- 0 1 2                 ucl_copy(dst,src,async)
   3 4 5 x x        3 4 5
   0 1 2        <-- 0 1 2 x x             ucl_copy(dst,src,async)
   3 4 5            3 4 5 x x
   0 1 2 o o    <-- 0 1 2                 ucl_copy(dst,src,2,3,async)
   3 4 5 o o        3 4 5
   o o o o o
   0 1 2 o o    <-- 0 1 2 3 4 5           ucl_copy(dst,src,2,3,async)
   3 4 5 o o
   o o o o o
   0 1 o o o    <-- 0 1 2 3 4 5           ucl_copy(dst,src,2,2,async)
   2 3 o o o
   o o o o o
   0 1 2 o o    <-- 0 1 2 3 4             ucl_copy(dst,src,2,3,async)
   5 6 7 o o        5 6 7 8 9
   o o o o o        10 11 12 13 14
   0 1 2 5 6 7  <-- 0 1 2 3 4             ucl_copy(dst,src,2,3,async)
                    5 6 7 8 9
                    10 11 12 13 14
***************************************************************************/
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_COPY_ALLOW
// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
// --------------------------------------------------------------------------
// Have to use specialization because some types don't have operator[]
template <int host_t1, int host_t2> struct _host_host_copy;
// Both on host
template <> struct _host_host_copy<1,1> {
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
#ifdef UCL_DEBUG
assert(mat1::PADDED==0 && mat2::PADDED==0);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
else
for (size_t i=0; i<numel; i++)
dst[i]=static_cast<typename mat1::data_type>(src[i]);
}
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) {
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
size_t dst_row_size, src_row_size;
if (mat1::VECTOR)
dst_row_size=cols;
else
dst_row_size=dst.row_size();
if (mat2::VECTOR)
src_row_size=cols;
else
src_row_size=src.row_size();
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
for (size_t i=0; i<rows; i++)
memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
cols*sizeof(typename mat1::data_type));
else
for (size_t j=0; j<rows; j++) {
int dst_i=j*dst_row_size;
int d_end=dst_i+cols;
int src_i=j*src_row_size;
for (; dst_i<d_end; dst_i++) {
dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
src_i++;
}
}
}
};
// Should never be here
template <int host_t1, int host_t2> struct _host_host_copy {
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
assert(0==1);
}
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) {
assert(0==1);
}
};
// --------------------------------------------------------------------------
// - TEMPLATE HELPER FUNCTIONS FOR SPECIALIZED CASTING
// --------------------------------------------------------------------------
// Helper functions for ucl_cast_copy
template <int host_type1, int host_type2> struct _ucl_cast_copy;
// Destination is on host
template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type));
for (size_t i=0; i<numel; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer,command_queue &cq) {
ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type),cq);
cast_buffer.sync();
for (size_t i=0; i<numel; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
// Asynchronous currently pointless here
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
for (size_t i=0; i<rows*cols; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
} else {
if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows);
else
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),
rows);
int dst_i=0;
int buff_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
buff_i++;
dst_i++;
}
dst_i+=dst.cols()-cols;
}
}
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
// Asynchronous currently pointless here
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq);
cast_buffer.sync();
for (size_t i=0; i<rows*cols; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
} else {
if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows,cq);
else
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),
rows,cq);
cast_buffer.sync();
int dst_i=0;
int buff_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
buff_i++;
dst_i++;
}
dst_i+=dst.cols()-cols;
}
}
}
};
// Source is on host
template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
for (size_t i=0; i<numel; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type));
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
for (size_t i=0; i<numel; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type),cq);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat2::VECTOR) {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
} else if (mat1::VECTOR) {
int src_i=0;
int buf_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
}
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
} else {
int src_i=0;
int buf_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
}
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat2::VECTOR) {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),
cast_buffer,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
} else if (mat1::VECTOR) {
int src_i=0;
int buf_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
}
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
} else {
int src_i=0;
int buf_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
}
}
};
// Both matrices on host: cast copy should never be used here
template <> struct _ucl_cast_copy<1,1> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
assert(0==1);
}
};
// Neither matrix on host: cast copy should never be used here
template <> struct _ucl_cast_copy<0,0> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
assert(0==1);
}
};
// --------------------------------------------------------------------------
// - 1D COPY - SPECIFIED NUMBER OF BYTES
// --------------------------------------------------------------------------
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
#ifdef UCL_DEBUG
assert(dst.numel()>=numel && src.numel()>=numel);
assert(cast_buffer.numel()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,numel,cq);
else
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
}
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
* \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, const bool async) {
#ifdef UCL_DEBUG
assert(dst.numel()>=numel && src.numel()>=numel);
assert(cast_buffer.numel()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,numel,async);
else if (async)
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,dst.cq());
else
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
}
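// Illustrative sketch (names are hypothetical): reusing one permanent cast
// buffer for repeated double->float host-to-device transfers instead of
// letting ucl_copy allocate a temporary buffer on every call.
//
//   UCL_Device gpu;
//   UCL_H_Vec<double> h_x;                     // source on host
//   UCL_D_Vec<float>  d_x;                     // destination on device
//   UCL_H_Vec<float>  buffer;                  // permanent casting buffer
//   h_x.alloc(n,gpu,UCL_RW_OPTIMIZED);
//   d_x.alloc(n,gpu,UCL_READ_WRITE);
//   buffer.alloc(n,gpu,UCL_WRITE_OPTIMIZED);
//   ucl_cast_copy(d_x,h_x,n,buffer,true);      // async on default stream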
/// Asynchronous copy of matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
command_queue &cq) {
#ifdef UCL_DEBUG
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
}
} else
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
}
/// Copy matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
* \param async Perform non-blocking copy (ignored for host to host copy)
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - Padding for 2D matrices is not considered in this routine.
* - The default stream is used for asynchronous copy
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
const bool async) {
#ifdef UCL_DEBUG
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
else if (async)
ucl_copy(dst,src,numel,dst.cq());
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
}
} else
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
}
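// Illustrative sketch (names are hypothetical): copy only the first n
// elements of a pinned host vector to the device; both containers use the
// same data type, so no cast is performed.
//
//   UCL_H_Vec<float> h_q;
//   UCL_D_Vec<float> d_q;
//   h_q.alloc(nmax,gpu,UCL_WRITE_OPTIMIZED);
//   d_q.alloc(nmax,gpu,UCL_READ_ONLY);
//   // ... fill h_q[0..n-1] on the host ...
//   ucl_copy(d_q,h_q,n,true);                  // non-blocking copy of n elements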
// --------------------------------------------------------------------------
// - 2D COPY - SPECIFIED NUMBER OF ROWS/COLS
// --------------------------------------------------------------------------
/// Asynchronous copy subset matrix rows/cols with cast (Device/Host transfer)
/** \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
const bool async) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,rows,cols,async);
else if (async)
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,dst.cq());
else
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
}
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,rows,cols,cq);
else
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
}
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
/** - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - The copy should handle padding for 2D alignment correctly
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, command_queue &cq) {
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
}
// If we are here, at least one of the matrices must have VECTOR=0
} else if (mat1::VECTOR) {
#ifdef UCL_DEBUG
assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
cols*sizeof(typename mat1::data_type),rows,
cq);
} else if (mat2::VECTOR) {
#ifdef UCL_DEBUG
assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
} else {
#ifdef UCL_DEBUG
assert(src.rows()>=rows && src.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
cols*sizeof(typename mat1::data_type),rows,cq);
}
}
/// Copy subset of matrix rows,cols (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
* - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - The copy should handle padding for 2D alignment correctly
* - Copy from vector to matrix and vice versa allowed
* - The default stream is used for asynchronous copy
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, const bool async) {
if (async)
ucl_copy(dst,src,rows,cols,dst.cq());
else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
}
// If we are here, at least one of the matrices must have VECTOR=0
} else if (mat1::VECTOR) {
#ifdef UCL_DEBUG
assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(mat2::VECTOR==0);
#endif
ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
cols*sizeof(typename mat1::data_type),rows);
} else if (mat2::VECTOR) {
#ifdef UCL_DEBUG
assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(mat1::VECTOR==0);
#endif
ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
} else {
#ifdef UCL_DEBUG
assert(src.rows()>=rows && src.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
cols*sizeof(typename mat1::data_type),rows);
}
}
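// Illustrative sketch (names are hypothetical): copy the upper left
// rows x cols tile of a padded device matrix into a contiguous host vector;
// the differing row strides are handled by the routine.
//
//   UCL_D_Mat<float> d_tile;                   // padded rows on the device
//   UCL_H_Vec<float> h_tile;                   // row-major, no padding
//   d_tile.alloc(rows,cols,gpu);
//   h_tile.alloc(rows*cols,gpu);
//   ucl_copy(h_tile,d_tile,rows,cols,false);   // blocking tile copy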
// --------------------------------------------------------------------------
// - 1D/2D COPY
// --------------------------------------------------------------------------
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
mat3 &cast_buffer, const bool async) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,async);
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,async);
else if (mat1::PADDED==1)
ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,async);
else
ucl_cast_copy(dst,src,src.numel(),cast_buffer,async);
}
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
mat3 &cast_buffer, command_queue &cq) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,cq);
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,cq);
else if (mat1::PADDED==1)
ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,cq);
else
ucl_cast_copy(dst,src,src.numel(),cast_buffer,cq);
}
/// Asynchronous copy of matrix/vector (memory already allocated)
/** - The number of bytes copied is determined by entire src data
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - The copy should handle padding for 2D alignment correctly
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, command_queue &cq) {
if (dst.row_bytes()==src.row_bytes() &&
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,src.row_size()*src.rows(),cq);
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
ucl_copy(dst,src,src.rows(),src.cols(),cq);
else if (mat1::PADDED==1)
ucl_copy(dst,src,dst.rows(),dst.cols(),cq);
else
ucl_copy(dst,src,src.numel(),cq);
}
/// Copy matrix/vector (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
* - The number of bytes copied is determined by entire src data
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - The copy should handle padding for 2D alignment correctly
* - Copy from vector to matrix and vice versa allowed
* - The default stream is used for asynchronous copy
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
if (async)
ucl_copy(dst,src,dst.cq());
else if (dst.row_bytes()==src.row_bytes() &&
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,src.row_size()*src.rows(),async);
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
ucl_copy(dst,src,src.rows(),src.cols(),async);
else if (mat1::PADDED==1)
ucl_copy(dst,src,dst.rows(),dst.cols(),async);
else
ucl_copy(dst,src,src.numel(),async);
}
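// Illustrative sketch (names are hypothetical): whole-container copy. Row
// padding on the device matrix is accounted for automatically because the
// rows/cols form of ucl_copy is selected whenever either side is padded.
//
//   UCL_D_Mat<float> d_f;
//   UCL_H_Mat<float> h_f;
//   d_f.alloc(rows,cols,gpu);
//   h_f.alloc(rows,cols,gpu,UCL_RW_OPTIMIZED);
//   ucl_copy(h_f,d_f,true);                    // async on h_f's default queue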
#endif

430
lib/gpu/geryon/ucl_d_mat.h Normal file
View File

@ -0,0 +1,430 @@
/***************************************************************************
ucl_d_mat.h
-------------------
W. Michael Brown
Matrix Container on Device
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// 2D Matrix on device (can have extra column storage to get correct alignment)
template <class numtyp>
class UCL_D_Mat : public UCL_BaseMat {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
MEM_TYPE = 0,
PADDED = 1,
ROW_MAJOR = 1,
VECTOR = 0
};
typedef numtyp data_type;
UCL_D_Mat() : _rows(0), _kind(UCL_VIEW) {}
~UCL_D_Mat() { if (_kind!=UCL_VIEW) _device_free(*this); }
/// Construct with specified rows and cols
/** \sa alloc() **/
UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_rows(0), _kind(UCL_VIEW) { alloc(rows,cols,device,kind); }
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param cq Default command queue for operations copied from another mat
* \note - Coalesced access using adjacent cols on same row
* UCL_D_Mat(row,col) given by array[row*row_size()+col]
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_rows=rows;
_cols=cols;
int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
exit(1);
}
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param device Used to get the default command queue for operations
* \note - Coalesced access using adjacent cols on same row
* UCL_D_Mat(row,col) given by array[row*row_size()+col]
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_rows=rows;
_cols=cols;
int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
exit(1);
}
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
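// Illustrative sketch (names are hypothetical): allocate a rows x cols
// device matrix, zero it, and query the padded row stride.
//
//   UCL_Device gpu;
//   UCL_D_Mat<float> d_m;
//   if (d_m.alloc(rows,cols,gpu,UCL_READ_WRITE)!=UCL_SUCCESS)
//     { /* only reached when UCL_NO_EXIT is defined */ }
//   d_m.zero();
//   size_t stride=d_m.row_size();   // elements per row including padding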
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) {
clear();
_kind=UCL_VIEW;
_rows=rows;
_cols=cols;
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=input.cq();
#ifdef _OCL_MAT
_offset=0;
_array=input.cbegin();
#else
_device_view(&_array,input.begin());
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows(),input.cols()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) {
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=dev.cq();
_array=input;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) {
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=input.cq();
#ifdef _OCL_MAT
_array=input.begin();
_offset=offset;
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols)
{ view_offset(offset,input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) {
if (input.rows()==1)
view_offset(offset,input,1,input.cols()-offset);
else
view_offset(offset,input,input.rows()-offset/input.row_size(),
input.cols());
}
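// Illustrative sketch (names are hypothetical): view the second row of an
// existing device matrix as a 1 x cols sub-matrix without copying.
//
//   UCL_D_Mat<float> whole;                    // previously allocated
//   UCL_D_Mat<float> row1;
//   row1.view_offset(whole.row_size(),whole,1,whole.cols());
//   // row1 shares storage with 'whole'; no memory is freed when row1 dies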
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols,const size_t stride,
UCL_Device &dev) {
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=dev.cq();
#ifdef _OCL_MAT
_array=input;
_offset=offset;
#else
#ifdef _UCL_DEVICE_PTR_MAT
_array=input+offset*sizeof(numtyp);
#else
_array=input+offset;
#endif
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()*_rows); }
/// Set first n elements to zero
inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
#ifdef _UCL_DEVICE_PTR_MAT
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline device_ptr & begin() { return _array; }
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline const device_ptr & begin() const { return _array; }
#else
/// For CUDA-RT, get device pointer to first element
inline numtyp * begin() { return _array; }
/// For CUDA-RT, get device pointer to first element
inline const numtyp * begin() const { return _array; }
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() { return _end; }
/// For CUDA-RT, get device pointer to one past last element
inline const numtyp * end() const { return _end; }
#endif
#ifdef _UCL_DEVICE_PTR_MAT
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline device_ptr & cbegin() { return _array; }
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline const device_ptr & cbegin() const { return _array; }
#else
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline numtyp ** cbegin() { return &_array; }
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline const numtyp ** cbegin() const { return &_array; }
#endif
/// Get the number of elements
inline size_t numel() const { return _cols*_rows; }
/// Get the number of rows
inline size_t rows() const { return _rows; }
/// Get the number of columns
inline size_t cols() const { return _cols; }
/// Get the size of a row (including any padding) in elements
inline size_t row_size() const { return _row_size; }
/// Get the size of a row (including any padding) in bytes
inline size_t row_bytes() const { return _pitch; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
#ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return _offset; }
#else
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
#endif
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return offset()*sizeof(numtyp); }
private:
size_t _pitch, _row_size, _rows, _cols;
enum UCL_MEMOPT _kind;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;
#else
numtyp *_array,*_end;
#endif
#ifdef _OCL_MAT
size_t _offset;
#endif
};
#endif

442
lib/gpu/geryon/ucl_d_vec.h Normal file
View File

@ -0,0 +1,442 @@
/***************************************************************************
ucl_d_vec.h
-------------------
W. Michael Brown
Vector Container on Device
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Row vector on device
template <class numtyp>
class UCL_D_Vec : public UCL_BaseMat {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
MEM_TYPE = 0,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 1
};
typedef numtyp data_type;
UCL_D_Vec() : _cols(0), _kind(UCL_VIEW) {}
~UCL_D_Vec() { if (_kind!=UCL_VIEW) _device_free(*this); }
/// Construct with n columns
/** \sa alloc() **/
UCL_D_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_cols(0), _kind(UCL_VIEW) { alloc(n,device,kind); }
/// Set up device vector with 'cols' columns and reserve memory
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
int err=_device_alloc(*this,cq,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
exit(1);
}
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Set up device vector with 'cols' columns and reserve memory
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
int err=_device_alloc(*this,device,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
exit(1);
}
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
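// Illustrative sketch (names are hypothetical): allocate a device vector
// and fill it from a pinned host vector using the routines in ucl_copy.h.
//
//   UCL_Device gpu;
//   UCL_H_Vec<float> h_v;
//   UCL_D_Vec<float> d_v;
//   h_v.alloc(n,gpu,UCL_WRITE_OPTIMIZED);
//   d_v.alloc(n,gpu,UCL_READ_ONLY);
//   ucl_copy(d_v,h_v,true);                    // async on d_v's command queue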
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
#ifdef _OCL_MAT
_offset=0;
_array=input.cbegin();
#else
_device_view(&_array,input.begin());
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { view(input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows()*input.row_size()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=dev.cq();
_array=input;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
#ifdef _OCL_MAT
_array=input.begin();
_offset=offset;
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride)
{ view_offset(offset,input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input)
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols, UCL_Device &dev) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=dev.cq();
#ifdef _OCL_MAT
_array=input;
_offset=offset;
#else
#ifdef _UCL_DEVICE_PTR_MAT
_array=input+offset*sizeof(numtyp);
#else
_array=input+offset;
#endif
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev)
{ view_offset(offset,input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
#ifdef _UCL_DEVICE_PTR_MAT
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline device_ptr & begin() { return _array; }
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline const device_ptr & begin() const { return _array; }
#else
/// For CUDA-RT, get device pointer to first element
inline numtyp * begin() { return _array; }
/// For CUDA-RT, get device pointer to first element
inline const numtyp * begin() const { return _array; }
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() { return _end; }
/// For CUDA-RT, get device pointer to one past last element
inline const numtyp * end() const { return _end; }
#endif
#ifdef _UCL_DEVICE_PTR_MAT
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline device_ptr & cbegin() { return _array; }
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline const device_ptr & cbegin() const { return _array; }
#else
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline numtyp ** cbegin() { return &_array; }
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline const numtyp ** cbegin() const { return &_array; }
/// For CUDA-RT, allocate row vector and bind texture
inline void safe_alloc(const size_t cols, UCL_Device &dev,
textureReference *t)
{ alloc(cols,dev); assign_texture(t); bind(); }
/// For CUDA-RT, assign a texture to matrix
inline void assign_texture(textureReference *t) { _tex_ptr=t; }
/// For CUDA-RT, bind to texture
inline void bind() {
cuda_gb_get_channel<numtyp>(_channel);
(*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
(*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
(*_tex_ptr).filterMode = cudaFilterModePoint;
(*_tex_ptr).normalized = false;
CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,_array,&_channel));
}
/// For CUDA-RT, unbind texture
inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
#endif
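// Illustrative sketch for CUDA-RT builds only (names are hypothetical):
// allocate a device vector and bind it to a texture reference declared at
// file scope in a .cu file.
//
//   texture<float> q_tex;                  // file-scope texture reference
//   UCL_D_Vec<float> d_q;
//   d_q.safe_alloc(n,gpu,&q_tex);          // alloc + assign_texture + bind
//   // ... launch kernels that read d_q through q_tex ...
//   d_q.unbind();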
/// Get the number of elements
inline size_t numel() const { return _cols; }
/// Get the number of rows
inline size_t rows() const { return 1; }
/// Get the number of columns
inline size_t cols() const { return _cols; }
/// Get the size of a row (including any padding) in elements
inline size_t row_size() const { return _cols; }
/// Get the size of a row (including any padding) in bytes
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
#ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return _offset; }
#else
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
#endif
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return offset()*sizeof(numtyp); }
private:
size_t _row_bytes, _row_size, _rows, _cols;
enum UCL_MEMOPT _kind;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;
#else
numtyp *_array,*_end;
cudaChannelFormatDesc _channel;
textureReference *_tex_ptr;
#endif
#ifdef _OCL_MAT
size_t _offset;
#endif
};
#endif

View File

@ -0,0 +1,48 @@
/***************************************************************************
nvc_get_devices.h
-------------------
W. Michael Brown
List properties of cuda devices
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifdef UCL_OPENCL
#include "ocl_device.h"
using namespace ucl_opencl;
#endif
#ifdef UCL_CUDADR
#include "nvd_device.h"
using namespace ucl_cudadr;
#endif
#ifdef UCL_CUDART
#include "nvc_device.h"
using namespace ucl_cudart;
#endif
int main(int argc, char** argv) {
UCL_Device cop;
std::cout << "Found " << cop.num_platforms() << " platform(s).\n";
if (cop.num_platforms()>0) {
std::cout << "Using platform: " << cop.platform_name() << std::endl;
cop.print_all(std::cout);
}
return 0;
}

378
lib/gpu/geryon/ucl_h_mat.h Normal file
View File

@ -0,0 +1,378 @@
/***************************************************************************
ucl_h_mat.h
-------------------
W. Michael Brown
Matrix Container on Host
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Matrix on Host with options for pinning (page locked)
template <class numtyp>
class UCL_H_Mat : public UCL_BaseMat {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
MEM_TYPE = 1,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 0
};
typedef numtyp data_type;
UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { }
~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
{ _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_rows=rows;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,cq,_row_bytes*_rows,kind);
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
exit(1);
}
#endif
_end=_array+rows*cols;
return err;
}
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_rows=rows;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,device,_row_bytes*_rows,kind);
_end=_array+rows*cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
exit(1);
}
#endif
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_row_bytes=stride*sizeof(numtyp);
this->_cq=input.cq();
_array=input.begin();
_end=_array+_cols;
#ifdef _OCL_MAT
_carray=input.cbegin();
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows(),input.cols()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_row_bytes=stride*sizeof(numtyp);
this->_cq=dev.cq();
_array=input;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,dev,_row_bytes,UCL_VIEW);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_row_bytes=stride*sizeof(numtyp);
this->_cq=input.cq();
_array=input.begin()+offset;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,input,_row_bytes,UCL_VIEW);
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols)
{ view_offset(offset,input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) {
if (input.rows()==1)
view_offset(offset,input,1,input.cols()-offset);
else
view_offset(offset,input,input.rows()-offset/input.row_size(),
input.cols());
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }}
/// Set each element to zero
inline void zero() { _host_zero(_array,_rows*row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }
/// Get host pointer to first element
inline numtyp * begin() { return _array; }
/// Get host pointer to first element
inline const numtyp * begin() const { return _array; }
/// Get host pointer to one past last element
inline numtyp * end() { return _end; }
/// Get host pointer to one past last element
inline const numtyp * end() const { return _end; }
/// Get the number of elements
inline size_t numel() const { return _rows*_cols; }
/// Get the number of rows
inline size_t rows() const { return _rows; }
/// Get the number of columns
inline size_t cols() const { return _cols; }
/// Get the size of a row (including any padding) in elements
inline size_t row_size() const { return _cols; }
/// Get the size of a row (including any padding) in bytes
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
/// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; }
/// 2D access (row should always be 0)
inline numtyp & operator()(const int row, const int col)
{ return _array[row*_cols+col]; }
/// 2D access (row should always be 0)
inline const numtyp & operator()(const int row, const int col) const
{ return _array[row*_cols+col]; }
/// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; }
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return 0; }
#ifdef _OCL_MAT
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline device_ptr & cbegin() { return _carray; }
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline const device_ptr & cbegin() const { return _carray; }
#else
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline void ** cbegin() { return (void **)&_array; }
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline const void ** cbegin() const { return (const void **)&_array; }
#endif
private:
enum UCL_MEMOPT _kind;
numtyp *_array, *_end;
size_t _row_bytes, _rows, _cols;
#ifdef _OCL_MAT
device_ptr _carray;
#endif
};
#endif
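A minimal host-side usage sketch for this matrix container. It assumes the CUDA-RT flavor of Geryon (nvc_device.h and nvc_memory.h on the include path, namespace ucl_cudart); the OpenCL and CUDA-driver flavors expose the same calls.

#include <iostream>
#include "nvc_device.h"   // assumed CUDA-RT back-end headers
#include "nvc_memory.h"
using namespace ucl_cudart;

int main() {
  UCL_Device dev;                                    // default device/platform

  UCL_H_Mat<double> m(4, 8, dev, UCL_RW_OPTIMIZED);  // 4x8 pinned host matrix
  m.zero();                                          // clear all 32 elements
  m(2, 5) = 3.14;                                    // row-major 2D access

  UCL_H_Mat<double> sub;                             // zero-copy view of rows 2-3
  sub.view_offset(2 * m.row_size(), m, 2, m.cols());
  std::cout << sub(0, 5) << std::endl;               // prints 3.14

  return 0;
}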

370
lib/gpu/geryon/ucl_h_vec.h Normal file
View File

@ -0,0 +1,370 @@
/***************************************************************************
ucl_h_vec.h
-------------------
W. Michael Brown
Vector Container on Host
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Row Vector on Host with options for pinning (page locked)
template <class numtyp>
class UCL_H_Vec : public UCL_BaseMat {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
MEM_TYPE = 1,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 1
};
typedef numtyp data_type;
UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { }
~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
/// Construct with n columns
/** \sa alloc() **/
UCL_H_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
{ _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,cq,_row_bytes,kind);
_end=_array+cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
exit(1);
}
#endif
return err;
}
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,device,_row_bytes,kind);
_end=_array+cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
exit(1);
}
#endif
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
_array=input.begin();
_end=_array+_cols;
#ifdef _OCL_MAT
_carray=input.cbegin();
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { view(input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows()*input.row_size()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=dev.cq();
_array=input;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,dev,_row_bytes,UCL_VIEW);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
_array=input.begin()+offset;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,input,_row_bytes,UCL_VIEW);
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride)
{ view_offset(offset,input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input)
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
/// Set each element to zero
inline void zero() { _host_zero(_array,row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }
/// Get host pointer to first element
inline numtyp * begin() { return _array; }
/// Get host pointer to first element
inline const numtyp * begin() const { return _array; }
/// Get host pointer to one past last element
inline numtyp * end() { return _end; }
/// Get host pointer to one past last element
inline const numtyp * end() const { return _end; }
/// Get the number of elements
inline size_t numel() const { return _cols; }
/// Get the number of rows
inline size_t rows() const { return 1; }
/// Get the number of columns
inline size_t cols() const { return _cols; }
/// Get the size of a row (including any padding) in elements
inline size_t row_size() const { return _cols; }
/// Get the size of a row (including any padding) in bytes
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
/// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; }
/// 2D access (row should always be 0)
inline numtyp & operator()(const int row, const int col)
{ return _array[col]; }
/// 2D access (row should always be 0)
inline const numtyp & operator()(const int row, const int col) const
{ return _array[col]; }
/// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; }
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return 0; }
#ifdef _OCL_MAT
/// For OpenCL, returns a reference to the cl_mem object
inline device_ptr & cbegin() { return _carray; }
/// For OpenCL, returns a reference to the cl_mem object
inline const device_ptr & cbegin() const { return _carray; }
#endif
private:
enum UCL_MEMOPT _kind;
numtyp *_array, *_end;
size_t _row_bytes, _cols;
#ifdef _OCL_MAT
device_ptr _carray;
#endif
};
#endif
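A companion sketch for the vector container, under the same assumptions as the matrix example above (CUDA-RT flavor, Geryon headers on the include path).

#include <iostream>
#include "nvc_device.h"
#include "nvc_memory.h"
using namespace ucl_cudart;

int main() {
  UCL_Device dev;

  // Pinned (page-locked) buffer of 256 floats for fast host<->device copies
  UCL_H_Vec<float> buf(256, dev, UCL_RW_OPTIMIZED);
  buf.zero();
  for (int i = 0; i < 256; i++) buf[i] = static_cast<float>(i);

  // Wrap an existing C array without allocating or taking ownership
  double raw[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  UCL_H_Vec<double> wrap;
  wrap.view(raw, 8, dev);
  wrap.zero(4);                          // zeroes raw[0..3], leaves raw[4..7] alone

  std::cout << wrap[5] << std::endl;     // prints 5
  return 0;
}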

View File

@ -0,0 +1,42 @@
/***************************************************************************
ucl_nv_kernel.h
-------------------
W. Michael Brown
Preprocessor macros for OpenCL/CUDA compatibility
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon May 3 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifndef UCL_NV_KERNEL_H
#define UCL_NV_KERNEL_H
#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y)
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define mul24 __mul24
#define __global
#define __inline static __inline__ __device__
#endif
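These defines let one kernel source compile either as CUDA (when NV_KERNEL is defined and this header is included) or as OpenCL C (where __kernel, __global and __local are keywords). A small illustrative kernel written against that convention follows; the OpenCL fallback for GLOBAL_ID_X mirrors what the lj96/cut kernel further below does, and the kernel name scale_vec is made up for the example.

#ifdef NV_KERNEL
#include "ucl_nv_kernel.h"
#else
#define GLOBAL_ID_X get_global_id(0)
#endif

// Scales the first n elements of x by a; one thread/work-item per element.
__kernel void scale_vec(__global float *x, const int n, const float a) {
  int i = GLOBAL_ID_X;
  if (i < n)
    x[i] *= a;
}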

273
lib/gpu/geryon/ucl_print.h Normal file
View File

@ -0,0 +1,273 @@
/***************************************************************************
ucl_print.h
-------------------
W. Michael Brown
Routines for printing debugging output for matrix/vector data
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon Jan 11 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_PRINT_ALLOW
template <int mem> struct _ucl_print;
template <> struct _ucl_print<1> {
template <class mat_type>
static inline void p(mat_type &mat, const size_t n, std::ostream &out,
const std::string delim) {
for (size_t i=0; i<n-1; i++)
out << mat[i] << delim;
out << mat[n-1];
}
template <class mat_type>
static inline void p(const mat_type &mat, const size_t n, std::ostream &out,
const std::string delim, UCL_Device &dev) {
p(mat,n,out,delim);
}
template <class mat_type>
static inline void p(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim) {
int offset=0;
int row_size=cols;
if (mat_type::VECTOR==0)
row_size=mat.row_size();
for (size_t j=0; j<rows; j++) {
size_t lend=offset+cols-1;
for (size_t i=offset; i<lend; i++)
out << mat[i] << delim;
out << mat[lend];
if (j!=rows-1)
out << row_delim;
offset+=row_size;
}
}
template <class mat_type>
static inline void p(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out,const std::string delim,
const std::string row_delim, UCL_Device &dev) {
p(mat,rows,cols,out,delim,row_delim);
}
};
template <int mem> struct _ucl_print {
template <class mat_type>
static inline void p(mat_type &mat, const size_t n, std::ostream &out,
const std::string delim) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(n,mat);
ucl_copy(temp,mat,n,false);
_ucl_print<1>::p(temp,n,out,delim);
}
template <class mat_type>
static inline void p(const mat_type &mat, const size_t n, std::ostream &out,
const std::string delim, UCL_Device &dev) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(n,dev);
ucl_copy(temp,mat,n,false);
_ucl_print<1>::p(temp,n,out,delim);
}
template <class mat_type>
static inline void p(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(mat.rows()*mat.cols(),mat);
if (mat_type::VECTOR==1)
ucl_copy(temp,mat,rows*cols,false);
else
ucl_copy(temp,mat,rows,cols,false);
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
}
template <class mat_type>
static inline void p(const mat_type &mat, const size_t rows,
const size_t cols,std::ostream &out,
const std::string delim,
const std::string row_delim, UCL_Device &dev) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(mat.rows()*mat.cols(),dev);
if (mat_type::VECTOR==1)
ucl_copy(temp,mat,rows*cols,false);
else
ucl_copy(temp,mat,rows,cols,false);
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
}
};
// -------------------------------------------------------------------------
// - Non-const routines that do not require a device object
// -------------------------------------------------------------------------
/// Outputs n elements of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out,
const std::string delim) {
if (n>mat.numel()) {
std::cerr << "Attempted to ucl_print " << n << " elements of matrix "
<< "that only has " << mat.numel() << " elements.";
exit(1);
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim);
}
/// Outputs n elements of mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) {
ucl_print(mat,n,out," ");
}
/// Outputs n elements of mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n) {
ucl_print(mat,n,std::cout," ");
}
/// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim) {
if (rows*cols>mat.numel()) {
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
<< "that only has " << mat.numel() << " elements.";
exit(1);
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim);
}
/// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out) {
ucl_print(mat,rows,cols,out," ","\n");
}
/// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows,
const size_t cols) {
ucl_print(mat,rows,cols,std::cout," ","\n");
}
/// Outputs mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(mat_type &mat) {
ucl_print(mat,std::cout);
}
/// Outputs mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, std::ostream &out) {
if (mat_type::VECTOR==1)
ucl_print(mat,mat.cols(),out," ");
else
ucl_print(mat,mat.rows(),mat.cols(),out," ","\n");
}
// -------------------------------------------------------------------------
// - Const routines that do not require a device object
// -------------------------------------------------------------------------
/// Outputs n elements of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
const std::string delim, UCL_Device &dev) {
if (n>mat.numel()) {
std::cerr << "Attempted to ucl_print " << n << " elements of matrix "
<< "that only has " << mat.numel() << " elements.";
exit(1);
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim,dev);
}
/// Outputs n elements of mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
UCL_Device &dev) {
ucl_print(mat,n,out," ",dev);
}
/// Outputs n elements of mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n,
UCL_Device &dev) {
ucl_print(mat,n,std::cout," ",dev);
}
/// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim, UCL_Device &dev) {
if (rows*cols>mat.numel()) {
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
<< "that only has " << mat.numel() << " elements.";
exit(1);
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim,dev);
}
/// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out, UCL_Device &dev) {
ucl_print(mat,rows,cols,out," ","\n",dev);
}
/// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t rows,
const size_t cols, UCL_Device &dev) {
ucl_print(mat,rows,cols,std::cout," ","\n",dev);
}
/// Outputs mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(const mat_type &mat, UCL_Device &dev) {
ucl_print(mat,std::cout,dev);
}
/// Outputs mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat, std::ostream &out, UCL_Device &dev) {
if (mat_type::VECTOR==1)
ucl_print(mat,mat.cols(),out," ",dev);
else
ucl_print(mat,mat.rows(),mat.cols(),out," ","\n",dev);
}
// -------------------------------------------------------------------------
// - Operator << Overloading
// -------------------------------------------------------------------------
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Vec<numtyp> &mat)
{ ucl_print(mat,out); return out; }
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Mat<numtyp> &mat)
{ ucl_print(mat,out); return out; }
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Vec<numtyp> &mat)
{ ucl_print(mat,out); return out; }
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
{ ucl_print(mat,out); return out; }
#endif
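A short sketch of these printing helpers, again assuming the CUDA-RT flavor and the header names used in the device-listing program above.

#include <iostream>
#include "nvc_device.h"
#include "nvc_memory.h"
using namespace ucl_cudart;

int main() {
  UCL_Device dev;

  UCL_H_Mat<int> m(2, 3, dev, UCL_NOT_PINNED);
  for (int i = 0; i < 6; i++) m[i] = i;

  ucl_print(m, 2, 3, std::cout, ",", ";");   // prints 0,1,2;3,4,5
  std::cout << "\n" << m << std::endl;       // space/newline delimited form

  // Device containers work too: the printer stages them through a temporary
  // UCL_H_Vec and a ucl_copy before formatting.
  return 0;
}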

121
lib/gpu/geryon/ucl_types.h Normal file
View File

@ -0,0 +1,121 @@
/***************************************************************************
ucl_types.h
-------------------
W. Michael Brown
Data type definitions for Coprocessor library
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon Jan 4 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef UCL_TYPES_H
#define UCL_TYPES_H
// Assign an integer id based on the data type: (int, float, double, etc)
template <class eltype> struct _UCL_DATA_ID;
template <> struct _UCL_DATA_ID<double> {
enum { id=1 };
static inline const char * name() { return "double"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=double"; }
};
template <> struct _UCL_DATA_ID<float> {
enum { id=2 };
static inline const char * name() { return "float"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=float"; }
};
template <> struct _UCL_DATA_ID<unsigned> {
enum { id=3 };
static inline const char * name() { return "unsigned"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; }
};
template <> struct _UCL_DATA_ID<int> {
enum { id=4 };
static inline const char * name() { return "int"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=int"; }
};
template <> struct _UCL_DATA_ID<char> {
enum { id=5 };
static inline const char * name() { return "char"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=char"; }
};
template <> struct _UCL_DATA_ID<unsigned char> {
enum { id=6 };
static inline const char * name() { return "unsigned char"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; }
};
template <> struct _UCL_DATA_ID<short> {
enum { id=7 };
static inline const char * name() { return "short"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=short"; }
};
template <> struct _UCL_DATA_ID<unsigned short> {
enum { id=8 };
static inline const char * name() { return "unsigned short"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; }
};
template <> struct _UCL_DATA_ID<long> {
enum { id=9 };
static inline const char * name() { return "long"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=long"; }
};
template <> struct _UCL_DATA_ID<unsigned long> {
enum { id=10 };
static inline const char * name() { return "unsigned long"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; }
};
template <> struct _UCL_DATA_ID<long double> {
enum { id=11 };
static inline const char * name() { return "long double"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; }
};
template <class eltype> struct _UCL_DATA_ID {
enum { id=0 };
static inline const char * name() { return "error_type"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; }
};
// Host memory allocation types
enum UCL_MEMOPT {
UCL_WRITE_ONLY, ///< Allow any optimizations for memory that is write only
UCL_READ_ONLY, ///< Allow any optimizations for memory that is read only
UCL_READ_WRITE, ///< Allow read and write
UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined)
UCL_RW_OPTIMIZED, ///< Allow host memory to be pinned
UCL_NOT_PINNED, ///< Host memory is not to be pinned
UCL_VIEW ///< View of another memory allocation
};
enum UCL_DEVICE_TYPE {
UCL_DEFAULT, ///< Unknown device type
UCL_CPU, ///< Device is a CPU
UCL_GPU, ///< Device is a GPU
UCL_ACCELERATOR ///< Device is an Accelerator
};
enum UCL_ERROR_FLAG {
UCL_SUCCESS, ///< No error
UCL_ERROR, ///< Unqualified error
UCL_FILE_NOT_FOUND, ///< File not found
UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found
UCL_COMPILE_ERROR, ///< Error compiling kernel
UCL_MEMORY_ERROR
};
template <class numtyp>
const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }
#endif
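A tiny sketch exercising the type traits above; it only needs this header (path assumed to be on the include path), and my_struct is a hypothetical unspecialized type.

#include <iostream>
#include "ucl_types.h"

struct my_struct {};   // hypothetical type with no _UCL_DATA_ID specialization

int main() {
  // Per-type integer ids and printable names
  std::cout << ucl_template_name<double>() << " id="
            << _UCL_DATA_ID<double>::id << "\n";           // double id=1

  // Compiler flag used when building kernels for a given precision
  std::cout << _UCL_DATA_ID<float>::numtyp_flag() << "\n"; // -D NUMTYP=float

  // Unspecialized types fall through to the error id
  std::cout << _UCL_DATA_ID<my_struct>::id << std::endl;   // 0
  return 0;
}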

123
lib/gpu/lj96_cut_gpu.cpp Normal file
View File

@ -0,0 +1,123 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj96_cut_gpu_memory.h"
using namespace std;
static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJ96MF.clear();
gpu_mode=LJ96MF.device->gpu_mode();
double gpu_split=LJ96MF.device->particle_split();
int first_gpu=LJ96MF.device->first_device();
int last_gpu=LJ96MF.device->last_device();
int world_me=LJ96MF.device->world_me();
int gpu_rank=LJ96MF.device->gpu_rank();
int procs_per_gpu=LJ96MF.device->procs_per_gpu();
LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
MPI_Barrier(LJ96MF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void lj96_gpu_clear() {
LJ96MF.clear();
}
int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
}
void lj96_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lj96_gpu_bytes() {
return LJ96MF.host_memory_usage();
}
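For orientation, a hedged sketch of how a caller would drive this interface. The forward declarations are repeated because the matching pair-style header is not part of this excerpt, MPI is assumed to have been initialized by the host code (lj96_gpu_init synchronizes with MPI_Barrier), and every argument value shown is illustrative.

#include <cstdio>

// Declarations matching the definitions above (normally provided by the
// pair style that wraps this library).
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen);
void lj96_gpu_clear();
double lj96_gpu_bytes();

// Called once after the pair coefficients are known.
bool setup_lj96_gpu(int ntypes, double **cutsq, double **lj1, double **lj2,
                    double **lj3, double **lj4, double **offset,
                    double *special_lj, int nlocal, int nall) {
  int gpu_mode;
  const int max_nbors = 300;          // initial neighbor rows, as used above
  const int maxspecial = 0;           // illustrative
  const double cell_size = 2.5 + 0.3; // cutoff + skin, illustrative

  if (!lj96_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                     nlocal, nall, max_nbors, maxspecial, cell_size,
                     gpu_mode, stdout))
    return false;

  // ... each timestep: lj96_gpu_compute_n() (device-built neighbor lists) or
  //     lj96_gpu_compute() (host-built neighbor lists) ...

  std::printf("lj96/cut GPU host memory: %g bytes\n", lj96_gpu_bytes());
  lj96_gpu_clear();                   // release device and host buffers
  return true;
}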

View File

@ -0,0 +1,281 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ96_GPU_KERNEL
#define LJ96_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = sqrt(r6inv);
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (eflag>0)
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = sqrt(r6inv);
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
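Both kernels recover the special-bond scaling from the packed neighbor index: a bonded neighbor j is stored as j + n*nall with n = 1, 2, 3 (in LAMMPS convention, the 1-2, 1-3 and 1-4 weights), so sp_lj[j/nall] gives the weight and j %= nall restores the atom index. A tiny host-side check of that arithmetic, with illustrative values:

#include <cassert>

int main() {
  const int nall = 1000;                        // local + ghost atom count
  const double sp_lj[4] = {1.0, 0.0, 0.0, 0.5}; // full, 1-2, 1-3, 1-4 weights

  int j = 42 + 3 * nall;                        // atom 42 packed as a 1-4 neighbor
  double factor_lj = (j < nall) ? 1.0 : sp_lj[j / nall];
  j %= nall;

  assert(j == 42);
  assert(factor_lj == 0.5);
  return 0;
}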

View File

@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj96_cut_gpu_cl.h"
#else
#include "lj96_cut_gpu_ptx.h"
#endif
#include "lj96_cut_gpu_memory.h"
#include <cassert>
#define LJ96_GPU_MemoryT LJ96_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::LJ96_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::~LJ96_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
bool LJ96_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96_cut_gpu_kernel);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
}
template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJ96_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJ96_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int ainum=this->atom->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();
}
template class LJ96_GPU_Memory<PRECISION,ACC_PRECISION>;
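The grid sizing in loop() above launches one thread per local atom: GX is just the number of BX-wide blocks needed to cover inum atoms, with the excess threads masked off by the kernel's if (ii<inum) test. A minimal, hedged sketch of that arithmetic with illustrative numbers (not values taken from this commit):
#include <math.h>
#include <stdio.h>
int main() {
  int inum = 10000;  // local atoms on this process (assumed example value)
  int BX   = 64;     // threads per block, i.e. this->block_size() (assumed)
  int GX   = static_cast<int>(ceil(static_cast<double>(inum)/BX));
  printf("launch %d blocks of %d threads for %d atoms\n", GX, BX, inum);
  return 0;          // prints: launch 157 blocks of 64 threads for 10000 atoms
}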

View File

@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ96_GPU_MEMORY_H
#define LJ96_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJ96_GPU_Memory();
~LJ96_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif
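For orientation, a hedged sketch of how a host-side pair style might drive this class; the wrapper name and all argument names are placeholders, and only the members declared above are used.
// Sketch under assumptions: the coefficient tables and sizes are owned by the
// calling pair style; clear() is also invoked by the destructor.
template <class numtyp, class acctyp>
static bool lj96_setup_sketch(LJ96_GPU_Memory<numtyp,acctyp> &mem,
                              const int ntypes, double **cutsq, double **lj1,
                              double **lj2, double **lj3, double **lj4,
                              double **offset, double *special_lj,
                              const int nlocal, const int nall,
                              const int max_nbors, const int maxspecial,
                              const double cell_size, const double gpu_split,
                              FILE *screen) {
  return mem.init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                  nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split,
                  screen);
}
Per-timestep force evaluation then goes through the AtomicGPUMemory base, which presumably invokes the private loop() declared above, in the same way the lj_cut_gpu.cpp driver shown later in this diff drives its memory class.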

124
lib/gpu/lj_cut_gpu.cpp Normal file
View File

@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj_cut_gpu_memory.h"
using namespace std;
static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljl_gpu_init(const int ntypes, double **cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
LJLMF.clear();
gpu_mode=LJLMF.device->gpu_mode();
double gpu_split=LJLMF.device->particle_split();
int first_gpu=LJLMF.device->first_device();
int last_gpu=LJLMF.device->last_device();
int world_me=LJLMF.device->world_me();
int gpu_rank=LJLMF.device->gpu_rank();
int procs_per_gpu=LJLMF.device->procs_per_gpu();
LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
MPI_Barrier(LJLMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void ljl_gpu_clear() {
LJLMF.clear();
}
int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
}
void ljl_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double ljl_gpu_bytes() {
return LJLMF.host_memory_usage();
}
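A hedged sketch of the calling sequence these bindings imply for a host pair style; the wrapper is illustrative only and every argument is a placeholder supplied by the caller.
// Sketch under assumptions: the ljl_gpu_* declarations come from this file;
// all data is owned by the calling pair style.
static bool lj_cut_gpu_setup_sketch(const int ntypes, double **cutsq,
                                    double **lj1, double **lj2, double **lj3,
                                    double **lj4, double **offset,
                                    double *special_lj, const int inum,
                                    const int nall, const int max_nbors,
                                    const int maxspecial,
                                    const double cell_size, FILE *screen) {
  int gpu_mode;
  bool ok = ljl_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                         inum, nall, max_nbors, maxspecial, cell_size,
                         gpu_mode, screen);
  // Per step the caller would then use ljl_gpu_compute_n() when the library
  // builds the neighbor list on the device, or ljl_gpu_compute() with a
  // host-built list, and finally ljl_gpu_clear() at the end of the run.
  return ok;
}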

View File

@ -0,0 +1,279 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the i atom whose neighbor list this thread processes
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
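// Editorial note on the neighbor encoding used above (a reading of the
// kernel, not text from the commit): an index j < nall is an ordinary
// neighbor and gets factor_lj = 1.0, while special 1-2/1-3/1-4 neighbors
// appear to be stored with nall, 2*nall or 3*nall added to the real index,
// so sp_lj[j/nall] selects the matching special_lj scale factor and
// "j %= nall" recovers the actual atom index before fetch_pos().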
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the i atom whose neighbor list this thread processes
int ii=THREAD_ID_X;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (eflag>0)
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
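Both kernels store their per-atom answers with the same layout: ans[ii] holds the force, while engv is strided by inum, with the energy first (when eflag is set) followed by the six virial components. A hedged host-side sketch of unpacking that layout, assuming engv has already been copied back from the device and acctyp is double:
// Sketch under assumptions: engv was copied from the device; the caller
// zeroes virial[6] beforehand.
static double unpack_engv_sketch(const double *engv, const int inum,
                                 const bool eflag, const bool vflag,
                                 double virial[6]) {
  double evdwl = 0.0;
  int offset = 0;
  if (eflag) {
    for (int i = 0; i < inum; i++)   // energies occupy engv[0..inum)
      evdwl += engv[i];
    offset = inum;
  }
  if (vflag)
    for (int k = 0; k < 6; k++)      // six virial terms, each strided by inum
      for (int i = 0; i < inum; i++)
        virial[k] += engv[offset + k*inum + i];
  return evdwl;
}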

View File

@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj_cut_gpu_cl.h"
#else
#include "lj_cut_gpu_ptx.h"
#endif
#include "lj_cut_gpu_memory.h"
#include <cassert>
#define LJL_GPU_MemoryT LJL_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJL_GPU_MemoryT::LJL_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJL_GPU_MemoryT::~LJL_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
bool LJL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_cut_gpu_kernel);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
}
template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJL_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJL_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int ainum=this->atom->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();
}
template class LJL_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJL_GPU_MEMORY_H
#define LJL_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJL_GPU_Memory();
~LJL_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -16,206 +16,270 @@
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "pair_gpu_texture.h"
#include "pair_gpu_cell.h"
#include "lj_gpu_memory.cu"
#include <string.h>
#include "cudatimer.h"
#include "lj_tex.h"
#include "neigh.h"
#include "cell.h"
#include "lj_gpu_kernel.h"
#ifdef WINDLL
#define EXTERN extern "C" __declspec(dllexport)
#else
#define EXTERN
#endif
static float h_boxlo[3], h_boxhi[3];
static float cell_size;
static float *energy = NULL, *d_energy = NULL;
static float3 *d_force = NULL, *f_temp = NULL, *v_temp = NULL, *d_virial = NULL;
static float4 *d_pos = NULL, *temp_pos = NULL;
static int *d_type = NULL;
static int ncellx, ncelly, ncellz;
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
static neigh_list_gpu d_neigh_list;
static cell_list_gpu d_cell_list;
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
template <class t>
inline string lj_gpu_toa(const t& in) {
ostringstream o;
o.precision(2);
o << in;
return o.str();
}
#define TIMING(x)
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name) {
string sname=LJMF.gpu.name(id)+", "+
lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ";
strcpy(name,sname.c_str());
EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name)
{
struct cudaDeviceProp prop;
CUDA_SAFE_CALL( cudaGetDeviceProperties(&prop, id) );
#ifdef _WIN32
strcpy_s(name, strlen(prop.name)+1, prop.name);
#else
strncpy(name, prop.name, strlen(prop.name)+1);
#endif
}
static bool _pc_cell_alloc;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
EXTERN bool lj_gpu_init(int &ij_size, const int ntypes,
double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, double *boxlo, double *boxhi,
double cellsize, double skin,
const int max_nbors, const int gpu_id)
{
int num_devices;
inline void _lj_gpu_clear() {
if (_pc_cell_alloc) {
free(energy);
free(v_temp);
cudaFreeHost(f_temp);
cudaFree(d_force);
cudaFree(d_energy);
cudaFree(d_virial);
clear_cell_list(cell_list_gpu);
_pc_cell_alloc=false;
/* get device count */
CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
if (num_devices == 0) {
printf("NO CUDA-capable GPU detected.\n");
exit(1);
}
if (gpu_id > num_devices) {
printf("gpu_id %d is larger than the number of GPUs %d\n",
gpu_id, num_devices);
exit(1);
}
/* set CUDA device to the specified GPU */
cudaThreadExit();
CUDA_SAFE_CALL( cudaSetDevice(gpu_id) );
ij_size=0;
cell_size = cellsize;
ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
for (int i = 0; i < 3; i++) {
h_boxhi[i] = boxhi[i];
h_boxlo[i] = boxlo[i];
}
init_force_const(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);
init_cell_list_const(cellsize, skin, boxlo, boxhi);
return true;
}
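// Worked example of the cell-count arithmetic above (illustrative numbers,
// not values from this commit): for a box of length 30.0 in x with
// cell_size = 2.5, ncellx = ceil((30.0 + 2.0*2.5)/2.5) = 14, i.e. 12
// interior cells plus one layer of ghost cells on each side.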
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
EXTERN void lj_gpu_clear() {
_lj_gpu_clear();
LJMF.clear();
free(energy);
free(v_temp);
CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
if (d_force) CUDA_SAFE_CALL( cudaFree(d_force) );
if (d_energy) CUDA_SAFE_CALL( cudaFree(d_energy) );
if (d_virial) CUDA_SAFE_CALL( cudaFree(d_virial) );
if (d_pos) CUDA_SAFE_CALL( cudaFree(d_pos) );
if (d_type) CUDA_SAFE_CALL( cudaFree(d_type) );
if (temp_pos) CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
clear_neigh_list_gpu(d_neigh_list);
clear_cell_list_gpu(d_cell_list);
if (useCache) {
unbind_pos();
unbind_type();
}
//LJMF.clear();
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, double *boxlo, double *boxhi,
double cell_size, double skin,
const int max_nbors, const int gpu_id) {
if (LJMF.is_allocated())
lj_gpu_clear();
else
_pc_cell_alloc=false;
LJMF.gpu.init();
if (LJMF.gpu.num_devices()==0)
return false;
ij_size=IJ_SIZE;
bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id,
0,0);
ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
init_cell_list_const(cell_size, skin, boxlo, boxhi);
return ret;
}
template <class numtyp, class acctyp>
double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
double **host_x, int *host_type, const int inum,
const int nall, const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
double _lj_gpu_neigh(double **force, double *virial,
double **host_x, int *host_type, const int inum,
const int nall, const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
cudaError_t err;
ljm.atom.nall(nall);
ljm.atom.inum(inum);
ljm.nbor.time_nbor.start();
ljm.nbor.time_nbor.stop();
double evdwl=0.0;
static int blockSize = BLOCK_1D;
static int ncell = ncellx*ncelly*ncellz;
static int first_call = 1;
TIMING( static CUDATimer cuTimer );
TIMING( static CTimer cTimer );
TIMING( static CTimer cTimer2 );
double *atom_pos = host_x[0];
static int szTailList = inum*32;
TIMING( cTimer.Start() );
TIMING( cTimer2.Start() );
/* MPI communication just happened, reallocate space using new inum & nall
FIXME: this is costly: ~ total kernel time! Use a DIY GPU memory allocator.*/
if (first_call || ago == 0) {
first_call = 0;
_lj_gpu_clear();
if (!first_call) {
if (useCache) {
unbind_pos();
unbind_type();
}
CUDA_SAFE_CALL( cudaFree(d_force) );
CUDA_SAFE_CALL( cudaFree(d_energy) );
CUDA_SAFE_CALL( cudaFree(d_virial) );
CUDA_SAFE_CALL( cudaFree(d_pos) );
CUDA_SAFE_CALL( cudaFree(d_type) );
clear_neigh_list_gpu(d_neigh_list);
CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
free(energy);
free(v_temp);
}
CUDA_SAFE_CALL( cudaMalloc((void**)&d_force, inum*sizeof(float3)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_energy, inum*sizeof(float)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) );
init_neigh_list_gpu(d_neigh_list, inum, NEIGH_BIN_SIZE, szTailList);
CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) );
CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) );
energy = (float*) malloc(inum*sizeof(float));
v_temp = (float3*)malloc(inum*2*sizeof(float3));
cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
cudaMalloc((void**)&d_force, inum*sizeof(float3));
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
if (useCache) {
bind_pos(d_pos, nall);
bind_type(d_type, nall);
}
first_call = 0;
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int),
cudaMemcpyHostToDevice) );
init_cell_list(cell_list_gpu, nall, ncell, blockSize);
_pc_cell_alloc=true;
}
// build cell-list on GPU
ljm.atom.time_atom.start();
build_cell_list(host_x[0], host_type, cell_list_gpu,
ncell, ncellx, ncelly, ncellz, blockSize, inum, nall, ago);
ljm.atom.time_atom.stop();
TIMING( static double mallocTime = 0. );
TIMING( mallocTime += cTimer2.GetET() );
TIMING( printf("malloc time = %f ms\n", mallocTime*1e3) );
ljm.time_pair.start();
TIMING( cTimer2.Start() );
for (int i = 0; i < 3*nall; i+=3) {
temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f);
}
#ifdef TIMING
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
TIMING( static double copyTime = 0. );
TIMING( copyTime += cTimer2.GetET() );
TIMING( printf("position copy time = %f ms\n", copyTime*1e3) );
#define KERNEL_LJ_CELL(e, v, b, s) kernel_lj_cell<e,v,b><<<GX, BX, s>>> \
(d_force, d_energy, d_virial, \
cell_list_gpu.pos, \
cell_list_gpu.idx, \
cell_list_gpu.type, \
cell_list_gpu.natom, \
inum, nall, ncell, ncellx, ncelly, ncellz);
// call the cell-list force kernel
const int BX=blockSize;
dim3 GX(ncellx, ncelly*ncellz);
if (eflag == 0 && vflag == 0) {
if (blockSize == 64 ) KERNEL_LJ_CELL(false, false, 64, 0);
if (blockSize == 128) KERNEL_LJ_CELL(false, false, 128, 0);
if (blockSize == 256) KERNEL_LJ_CELL(false, false, 256, 0);
} else {
if (blockSize == 64) KERNEL_LJ_CELL(true, true, 64, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
if (blockSize == 128) KERNEL_LJ_CELL(true, true, 128, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
if (blockSize == 256) KERNEL_LJ_CELL(true, true, 256, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
TIMING( cTimer2.Start() );
CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4), cudaMemcpyHostToDevice) );
TIMING( static double h2dTime = 0. );
TIMING( h2dTime += cTimer2.GetET() );
TIMING( printf("h2d copy time = %f ms\n", h2dTime*1e3) );
TIMING( cTimer2.Start() );
if (ago == 0) {
build_neigh_list_gpu(d_pos,
d_neigh_list,
h_boxlo, h_boxhi, cell_size,
inum, nall);
}
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("LJ force kernel launch error: %d\n", err);
exit(1);
TIMING( static double neighTime = 0. );
TIMING( neighTime += cTimer2.GetET() );
TIMING( printf("Neigh List time = %f ms\n", neighTime*1e3) );
TIMING( cTimer2.Start() );
calc_lj_neigh_gpu(d_force, d_energy, d_virial,
d_pos, d_type,
d_neigh_list,
inum, nall,
eflag, vflag);
TIMING( static double forceTime = 0. );
TIMING( forceTime += cTimer2.GetET() );
TIMING( printf("Force time = %f ms\n", forceTime*1e3) );
TIMING( printf("GPU kernel time = %f ms\n", (forceTime + neighTime)*1e3) );
TIMING( cTimer2.Start() );
CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost) );
TIMING( static double d2hTime = 0. );
TIMING( d2hTime += cTimer2.GetET() );
TIMING( printf("d2h copy time = %f ms\n", d2hTime*1e3) );
TIMING( printf("GPU-CPU data transfer time = %f ms\n", (h2dTime+d2hTime)*1e3) );
TIMING( cTimer2.Start() );
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
#ifdef TIMING
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float kTime;
cudaEventElapsedTime(&kTime, start, stop);
kernelTime += kTime;
printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
// copy results from GPU to CPU
cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
if (eflag) {
cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy,
inum*sizeof(float), cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
evdwl += energy[i];
}
evdwl *= 0.5f;
}
if (vflag) {
cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3),
cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
virial[0] += v_temp[2*i].x;
virial[1] += v_temp[2*i].y;
@ -228,43 +292,175 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
virial[i] *= 0.5f;
}
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
ljm.time_pair.stop();
ljm.atom.time_atom.add_to_total();
ljm.nbor.time_nbor.add_to_total();
ljm.time_pair.add_to_total();
TIMING( static double postTime = 0. );
TIMING( postTime += cTimer2.GetET() );
TIMING( printf("postprocess Time = %f ms\n", postTime*1e3) );
TIMING( printf("Data process time = %f ms\n", (postTime+copyTime)*1e3) );
TIMING( static double totalTime = 0. );
TIMING( totalTime += cTimer.GetET() );
TIMING( printf("lj_gpu time = %f ms\n", totalTime*1e3) );
return evdwl;
}
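// Editorial note (a reading of the code, not text from the commit): each
// kernel accumulates the full pair energy and virial on both atoms of a
// pair, so the host-side reductions above halve the totals
// (evdwl *= 0.5f, virial[i] *= 0.5f) to avoid double counting.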
EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
EXTERN double lj_gpu_neigh(double **force, double *virial,
double **host_x, int *host_type,
const int inum, const int nall,
const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
return _lj_gpu_cell<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
ago, eflag, vflag, boxlo, boxhi);
return _lj_gpu_neigh<float,float>(force, virial,
host_x, host_type, inum, nall,
ago, eflag, vflag, boxlo, boxhi);
}
template <class numtyp, class acctyp>
double _lj_gpu_cell(double **force, double *virial,
double **host_x, int *host_type, const int inum,
const int nall, const int ago,
const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
double evdwl=0.0;
static int ncell = ncellx*ncelly*ncellz;
static int first_call = 1;
// allocate memory on CPU and GPU
if (first_call || ago == 0) {
if (!first_call) {
if (useCache) {
unbind_pos();
unbind_type();
}
free(energy);
free(v_temp);
CUDA_SAFE_CALL( cudaFree(d_force) );
CUDA_SAFE_CALL( cudaFree(d_energy) );
CUDA_SAFE_CALL( cudaFree(d_virial) );
CUDA_SAFE_CALL( cudaFree(d_pos) );
CUDA_SAFE_CALL( cudaFree(d_type) );
CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
clear_cell_list_gpu(d_cell_list);
}
energy = (float*) malloc(inum*sizeof(float));
v_temp = (float3*)malloc(inum*2*sizeof(float3));
cudaMalloc((void**)&d_force, inum*sizeof(float3));
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) );
CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) );
CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) );
init_cell_list_gpu(d_cell_list, nall, ncell);
CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int),
cudaMemcpyHostToDevice) );
if (useCache) {
bind_pos(d_pos, nall);
bind_type(d_type, nall);
}
first_call = 0;
}
/* build cell-list on GPU */
double *atom_pos = host_x[0];
for (int i = 0; i < 3*nall; i+=3) {
temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f);
}
CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4),
cudaMemcpyHostToDevice) );
if (ago == 0) {
build_cell_list_gpu(d_pos, d_cell_list, h_boxlo, h_boxhi,
cell_size, inum, nall);
}
calc_lj_cell_gpu(d_force, d_energy, d_virial,
d_pos, d_type, d_cell_list,
inum, nall, ncellx,
ncelly, ncellz, cell_size,
eflag, vflag);
CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3),
cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
if (eflag) {
CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy,
inum*sizeof(float), cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
evdwl += energy[i];
}
evdwl *= 0.5f;
}
if (vflag) {
CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3),
cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
virial[0] += v_temp[2*i].x;
virial[1] += v_temp[2*i].y;
virial[2] += v_temp[2*i].z;
virial[3] += v_temp[2*i+1].x;
virial[4] += v_temp[2*i+1].y;
virial[5] += v_temp[2*i+1].z;
}
for (int i = 0; i < 6; i++)
virial[i] *= 0.5f;
}
return evdwl;
}
EXTERN double lj_gpu_cell(double **force, double *virial,
double **host_x, int *host_type,
const int inum, const int nall,
const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
return _lj_gpu_cell<float,float>(force, virial,
host_x, host_type, inum, nall,
ago, eflag, vflag, boxlo, boxhi);
}
EXTERN void lj_gpu_time() {
cout.precision(4);
cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n";
cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n";
cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n";
/* cout.precision(4);
cout << "Atom copy: " << LJMF.time_atom.total_seconds() << " s.\n";
cout << "Neighbor copy: " << LJMF.time_nbor.total_seconds() << " s.\n";
cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n";*/
//cout << "Answer copy: " << LJMF.time_answer.total_seconds() << " s.\n";
}
EXTERN int lj_gpu_num_devices() {
return LJMF.gpu.num_devices();
int num_devices;
CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
return num_devices;
}
EXTERN double lj_gpu_bytes() {
return LJMF.host_memory_usage();
return 0.0;
}

View File

@ -1,220 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
/* Cell list version of LJ kernel */
template<bool eflag, bool vflag, int blockSize>
__global__ void kernel_lj_cell(float3 *force3,
float *energy, float3 *virial,
float3 *cell_list, unsigned int *cell_idx,
int *cell_type, int *cell_atom,
const int inum, const int nall, const int ncell,
const int ncellx, const int ncelly, const int ncellz)
{
// calculate 3D block idx from 2d block
int bx = blockIdx.x;
int by = blockIdx.y % ncelly;
int bz = blockIdx.y / ncelly;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
__shared__ int typeSh[blockSize];
__shared__ float posSh[blockSize*3];
__shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
extern __shared__ float smem[];
__shared__ float *lj3Sh;
__shared__ float *lj4Sh;
__shared__ float *offsetSh;
// load force parameters into shared memory
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
cutsqSh[i] = _cutsq_<float>(itype,jtype);
lj1Sh[i] = _lj1_<float>(itype,jtype).x;
lj2Sh[i] = _lj1_<float>(itype,jtype).y;
}
// Only allocate shared memory when needed,
// this reduces shared memory limitation on occupancy
if (eflag || vflag) {
lj3Sh = smem;
lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
lj4Sh[i] = _lj3_<float>(itype,jtype).y;
offsetSh[i]= _offset_<float>(itype,jtype);
}
}
__syncthreads();
int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1),
nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1),
nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1);
for (int ii = 0; ii < ceil((float)(cell_atom[cid])/blockSize); ii++) {
float3 f = {0.0f, 0.0f, 0.0f};
float ener = 0.0f;
float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
int itype;
float ix, iy, iz;
int i = tid + ii*blockSize;
unsigned int answer_pos = cell_idx[cid*blockSize+i];
// load current cell atom position and type into sMem
for (int j = tid; j < cell_atom[cid]; j += blockSize) {
int pid = cid*blockSize + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ blockSize] = pos.y;
posSh[j+2*blockSize] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
if (answer_pos < inum) {
itype = typeSh[i];
ix = posSh[i ];
iy = posSh[i+ blockSize];
iz = posSh[i+2*blockSize];
// compute force from current cell
for (int j = 0; j < cell_atom[cid]; j++) {
if (j == i) continue;
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ blockSize];
float delz = iz - posSh[j+2*blockSize];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
if (r2inv < cutsqSh[mtype]) {
r2inv = 1.0f/r2inv;
float r6inv = r2inv * r2inv * r2inv;
float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
f.x += delx * force;
f.y += dely * force;
f.z += delz * force;
if (eflag) {
float e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
ener += (e - offsetSh[mtype]);
}
if (vflag) {
v0.x += delx*delx*force;
v0.y += dely*dely*force;
v0.z += delz*delz*force;
v1.x += delx*dely*force;
v1.y += delx*delz*force;
v1.z += dely*delz*force;
}
}
}
}
__syncthreads();
// compute force from neighboring cells
for (int nborz = nborz0; nborz <= nborz1; nborz++) {
for (int nbory = nbory0; nbory <= nbory1; nbory++) {
for (int nborx = nborx0; nborx <= nborx1; nborx++) {
if (nborz == bz && nbory == by && nborx == bx) continue;
// compute cell id
int cid_nbor = nborx + INT_MUL(nbory,ncellx) +
INT_MUL(nborz,INT_MUL(ncellx,ncelly));
// load neighbor cell position and type into smem
for (int j = tid; j < cell_atom[cid_nbor]; j += blockSize) {
int pid = INT_MUL(cid_nbor,blockSize) + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ blockSize] = pos.y;
posSh[j+2*blockSize] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
// compute force
if (answer_pos < inum) {
for (int j = 0; j < cell_atom[cid_nbor]; j++) {
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ blockSize];
float delz = iz - posSh[j+2*blockSize];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
if (r2inv < cutsqSh[mtype]) {
r2inv = 1.0f/r2inv;
float r6inv = r2inv * r2inv * r2inv;
float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
f.x += delx * force;
f.y += dely * force;
f.z += delz * force;
if (eflag) {
float e=r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
ener += (e-offsetSh[mtype]);
}
if (vflag) {
v0.x += delx*delx*force;
v0.y += dely*dely*force;
v0.z += delz*delz*force;
v1.x += delx*dely*force;
v1.y += delx*delz*force;
v1.z += dely*delz*force;
}
}
}
}
__syncthreads();
}
}
}
if (answer_pos < inum) {
force3[answer_pos] = f;
if (eflag)
energy[answer_pos] = ener;
if (vflag) {
virial[2*answer_pos] = v0;
virial[2*answer_pos+1] = v1;
}
}
}
}
#endif

View File

@ -1,147 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>
template <class numtyp, class acctyp>
int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int max_nbors,
const int me, const int nlocal, const int nall) {
if (allocated)
clear();
if (me>=gpu.num_devices())
return false;
gpu.set(me);
if (gpu.revision()<1.0)
return false;
// Initialize timers for the selected GPU
time_pair.init();
// Initialize atom and nbor data
max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
if (max_local==0)
max_local=1000;
if (nall<=nlocal)
max_atoms=max_local*2;
else
max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
if (!atom.init(max_atoms))
return false;
if (!nbor.init(ij_size,max_local,max_nbors))
return false;
// Get a stream for computing pair potentials
CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
// Use the write buffer from atom for data initialization
NVC_HostT &host_write=atom.host_write;
assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
// Copy data for bonded interactions
special_lj.safe_alloc(4);
special_lj.cast_copy(host_special_lj,host_write);
// Copy sigma, epsilon, and cutsq onto GPU
sigma.safe_alloc(ntypes,ntypes,sigma_get_texture<numtyp>());
sigma.cast_copy(host_sigma[0],host_write);
epsilon.safe_alloc(ntypes,ntypes,epsilon_get_texture<numtyp>());
epsilon.cast_copy(host_epsilon[0],host_write);
cutsq.safe_alloc(ntypes,ntypes,cutsq_get_texture<numtyp>());
cutsq.cast_copy(host_cutsq[0],host_write);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
offset.safe_alloc(lj_types,lj_types,offset_get_texture<numtyp>());
offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes);
double *t1=host_lj1[0];
double *t2=host_lj2[0];
for (int i=0; i<ntypes*ntypes; i++) {
host_write[i*2]=t1[i];
host_write[i*2+1]=t2[i];
}
lj1.safe_alloc(lj_types,lj_types,lj1_get_texture<numtyp>());
lj1.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()),
ntypes,ntypes);
t1=host_lj3[0];
t2=host_lj4[0];
for (int i=0; i<ntypes*ntypes; i++) {
host_write[i*2]=t1[i];
host_write[i*2+1]=t2[i];
}
lj3.safe_alloc(lj_types,lj_types,lj3_get_texture<numtyp>());
lj3.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()),
ntypes,ntypes);
dev_error.safe_alloc(1);
dev_error.zero();
allocated=true;
return true;
}
template <class numtyp, class acctyp>
void LJ_GPU_MemoryT::clear() {
if (!allocated)
return;
allocated=false;
// Check for any pair style specific errors here
int err_flag;
dev_error.copy_to_host(&err_flag);
atom.clear();
nbor.clear();
CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream));
dev_error.clear();
sigma.clear();
epsilon.clear();
special_lj.clear();
cutsq.clear();
offset.clear();
lj1.clear();
lj3.clear();
}
template <class numtyp, class acctyp>
double LJ_GPU_MemoryT::host_memory_usage() const {
return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+
sizeof(LJ_GPU_Memory<numtyp,acctyp>);
}
template class LJ_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,87 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_MEMORY_H
#define LJ_GPU_MEMORY_H
#include "nvc_device.h"
#include "nvc_traits.h"
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor.h"
#define BLOCK_1D 64 // max value = 256
#define CELL_SIZE BLOCK_1D
#define MAX_SHARED_TYPES 8
#define BIG_NUMBER 100000000
template <class numtyp, class acctyp>
class LJ_GPU_Memory {
public:
LJ_GPU_Memory() : allocated(false) {}
~LJ_GPU_Memory() { clear(); }
inline bool is_allocated() { return allocated; }
/// Allocate memory on host and device
bool init(const int ij_size, const int ntypes, double **host_cutsq,
double **host_sigma, double **host_epsilon,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const int me, const int nlocal,
const int nall);
/// Free any memory on host and device
void clear();
/// Returns memory usage on GPU per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library
double host_memory_usage() const;
// ------------------------- DATA -----------------------------
// Device Properties
NVCDevice gpu;
// Device Error Flag
NVC_VecI dev_error;
// Stream for asynchronous work
cudaStream_t pair_stream;
// Atom Data
PairGPUAtom<numtyp,acctyp> atom;
// Neighbor Data
PairGPUNbor nbor;
// --------------- Const Data for Atoms
NVC_ConstMatT sigma, epsilon, cutsq, offset;
NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3;
NVC_VecT special_lj;
size_t max_atoms, max_local;
// Timing for pair calculation
NVCTimer time_pair;
// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
protected:
bool allocated;
};
#endif

129
lib/gpu/ljc_cut_gpu.cpp Normal file
View File

@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "ljc_cut_gpu_memory.h"
using namespace std;
static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
LJCMF.clear();
gpu_mode=LJCMF.device->gpu_mode();
double gpu_split=LJCMF.device->particle_split();
int first_gpu=LJCMF.device->first_device();
int last_gpu=LJCMF.device->last_device();
int world_me=LJCMF.device->world_me();
int gpu_rank=LJCMF.device->gpu_rank();
int procs_per_gpu=LJCMF.device->procs_per_gpu();
LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e);
if (!init_ok)
return false;
}
MPI_Barrier(LJCMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void ljc_gpu_clear() {
LJCMF.clear();
}
int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
}
void ljc_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
}
double ljc_gpu_bytes() {
return LJCMF.host_memory_usage();
}
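Relative to the lj/cut bindings earlier in this diff, the lj/cut/coul/cut interface adds the per-type LJ and Coulomb cutoffs, the special_coul factors, the qqrd2e conversion constant, and a per-atom charge array on every compute call. A hedged sketch, with placeholder arguments owned by the caller:
// Sketch under assumptions: the ljc_gpu_* declarations come from this file;
// all tables and sizes are placeholders supplied by the calling pair style.
static bool lj_coul_gpu_setup_sketch(const int ntypes, double **cutsq,
                                     double **lj1, double **lj2, double **lj3,
                                     double **lj4, double **offset,
                                     double *special_lj, const int inum,
                                     const int nall, const int max_nbors,
                                     const int maxspecial,
                                     const double cell_size, FILE *screen,
                                     double **cut_ljsq, double **cut_coulsq,
                                     double *special_coul,
                                     const double qqrd2e) {
  int gpu_mode;
  bool ok = ljc_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                         inum, nall, max_nbors, maxspecial, cell_size, gpu_mode,
                         screen, cut_ljsq, cut_coulsq, special_coul, qqrd2e);
  // Per-step calls then pass host_q (per-atom charges) to ljc_gpu_compute_n()
  // or ljc_gpu_compute(), and ljc_gpu_clear() releases everything at the end.
  return ok;
}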

Some files were not shown because too many files have changed in this diff.