forked from lijiext/lammps

Changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5277 f3b2605a-c512-4ea7-a41b-209d697bcdaa

parent ae536ce7d0
commit 5a82c99485
@@ -1,72 +0,0 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = /cygdrive/c/CUDA/bin/nvcc -I/cygdrive/c/CUDA/include -O3 -DWINDLL -DUNIX -Xptxas -v --use_fast_math
CUDA_ARCH = -arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/cygdrive/c/CUDA/lib -lcudart $(CUDA_LIB)

CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)

CUDA_LIB = $(OBJ_DIR)/gpu.dll

# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
# Dependencies for the Texture Tar
TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
        lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
        gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu

ALL_H = $(NVC_H) $(PAIR_H)

EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.obj $(OBJ_DIR)/pair_gpu_nbor.obj \
       $(OBJ_DIR)/pair_tex_tar.obj $(OBJ_DIR)/pair_gpu_cell.obj

all: $(CUDA_LIB) $(EXECS)

$(OBJ_DIR)/nvc_device.obj : nvc_device.cu $(NVC_H)
	$(CUDA) -o $@ -c nvc_device.cu

$(OBJ_DIR)/pair_gpu_nbor.obj: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
	$(CUDA) -o $@ -c pair_gpu_nbor.cu

$(OBJ_DIR)/pair_tex_tar.obj: $(TAR_H)
	$(CUDA) -o $@ -c pair_tex_tar.cu

$(OBJ_DIR)/pair_gpu_cell.obj: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
	$(CUDA) -o $@ -c pair_gpu_cell.cu

$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.obj
	$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.obj

$(CUDA_LIB): $(OBJS) $(TAR_H)
	$(CUDA) -o $@ -shared $(OBJS)

clean:
	rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.exe *.exp *.lib *.dll *.linkinfo

veryclean: clean
	rm -rf *~ *.linkinfo
@@ -0,0 +1,39 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

CUDA_HOME = $(HOME)/cuda
NVCC = $(CUDA_HOME)/bin/nvcc

CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_DOUBLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON

BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh

include Nvidia.makefile
@@ -0,0 +1,39 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/
NVCC = nvcc

CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

CUDR_CPP = mpic++ -DMPI_GERYON
CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias

BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh

include Nvidia.makefile
@@ -0,0 +1,36 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Makefile for NCSA's lincoln GPU cluster. Tested with "soft +cuda-2.3"
# ------------------------------------------------------------------------- */

CUDA_HOME = /usr/local/cuda-2.3
NVCC = $(CUDA_HOME)/bin/nvcc

CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

CUDR_CPP = mpic++ -DMPI_GERYON
CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops

BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar

include Nvidia.makefile
@@ -0,0 +1,39 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

CUDA_HOME = /usr/local/cuda
NVCC = nvcc

CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias

BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh

include Nvidia.makefile
@@ -0,0 +1,31 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_SINGLE

BIN_DIR = ./
OBJ_DIR = ./ocl_obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh

include Opencl.makefile
@@ -0,0 +1,35 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Makefile for the TACC longhorn cluster. Use "module load cuda".
# ------------------------------------------------------------------------- */

CUDA_HOME = $(TACC_CUDA_DIR)
NVCC = nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math

CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias

BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar

include Nvidia.makefile
@@ -0,0 +1,39 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

CUDA_HOME = /usr/local/cuda
NVCC = nvcc

CUDA_ARCH = -arch=sm_11
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32

CUDR_CPP = mpic++
CUDR_OPTS = -O2 -m32 -g

BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh

include Nvidia.makefile
@@ -0,0 +1,31 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
OCL_LINK = -framework OpenCL
OCL_PREC = -D_SINGLE_SINGLE

BIN_DIR = ./
OBJ_DIR = ./ocl_obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh

include Opencl.makefile
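All of the machine Makefiles above pin CUDA_ARCH to a compute capability (sm_11 for the 32-bit Mac build, sm_13 elsewhere), and the README later in this commit says to verify the value by running nvc_get_devices. As a rough standalone C++ sketch of the kind of query that tool performs, using only the public CUDA runtime API (an assumed approximation, not the actual nvc_get_devices source):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
    std::fprintf(stderr, "No CUDA devices found.\n");
    return 1;
  }
  for (int i = 0; i < count; ++i) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    // compute capability 1.3 -> -arch=sm_13, 2.0 -> -arch=sm_20, ...
    std::printf("Device %d: %s, compute capability %d.%d\n",
                i, prop.name, prop.major, prop.minor);
  }
  return 0;
}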
@@ -1,72 +0,0 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
CUDA_ARCH = -arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB)

CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)

CUDA_LIB = $(OBJ_DIR)/libgpu.a

# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
# Dependencies for the Texture Tar
TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
        lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
        gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu

ALL_H = $(NVC_H) $(PAIR_H)

EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/pair_gpu_nbor.cu_o \
       $(OBJ_DIR)/pair_tex_tar.cu_o $(OBJ_DIR)/pair_gpu_cell.cu_o

all: $(CUDA_LIB) $(EXECS)

$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H)
	$(CUDA) -o $@ -c nvc_device.cu

$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
	$(CUDA) -o $@ -c pair_gpu_nbor.cu

$(OBJ_DIR)/pair_tex_tar.cu_o: $(TAR_H)
	$(CUDA) -o $@ -c pair_tex_tar.cu

$(OBJ_DIR)/pair_gpu_cell.cu_o: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
	$(CUDA) -o $@ -c pair_gpu_cell.cu

$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o
	$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o

$(CUDA_LIB): $(OBJS)
	$(AR) -crusv $(CUDA_LIB) $(OBJS)

clean:
	rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo

veryclean: clean
	rm -rf *~ *.linkinfo
@@ -0,0 +1,218 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

CUDA = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
       $(CUDA_PRECISION)
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
       -Icudpp_mini
CUDA_LINK = $(CUDA_LIB) -lcudart

GPU_LIB = $(LIB_DIR)/libgpu.a

# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H)
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
         pair_gpu_device.h pair_gpu_balance.h

ALL_H = $(NVD_H) $(PAIR_H)

EXECS = $(BIN_DIR)/nvc_get_devices
CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
        $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
        $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
       $(OBJ_DIR)/charge_gpu_memory.o \
       $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
       $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
       $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
       $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
       $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
       $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
       $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
       $(CUDPP)
PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
       $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
       $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
       $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
       $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
       $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
       $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
       $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
       $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
       $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
       $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h

all: $(GPU_LIB) $(EXECS)

$(OBJ_DIR)/cudpp.o: cudpp_mini/cudpp.cpp
	$(CUDR) -o $@ -c cudpp_mini/cudpp.cpp -Icudpp_mini

$(OBJ_DIR)/cudpp_plan.o: cudpp_mini/cudpp_plan.cpp
	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan.cpp -Icudpp_mini

$(OBJ_DIR)/cudpp_maximal_launch.o: cudpp_mini/cudpp_maximal_launch.cpp
	$(CUDR) -o $@ -c cudpp_mini/cudpp_maximal_launch.cpp -Icudpp_mini

$(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini

$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
	$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu

$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
	$(CUDA) -o $@ -c cudpp_mini/scan_app.cu

$(OBJ_DIR)/pair_gpu_atom_kernel.ptx: pair_gpu_atom_kernel.cu
	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_atom_kernel.cu

$(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h

$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu

$(OBJ_DIR)/pair_gpu_nbor_ptx.h: $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h

$(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_build_kernel.cu

$(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h

$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
	$(CUDR) -o $@ -c pair_gpu_device.cpp

$(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
	$(CUDR) -o $@ -c atomic_gpu_memory.cpp

$(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
	$(CUDR) -o $@ -c charge_gpu_memory.cpp

$(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu

$(OBJ_DIR)/gb_gpu_kernel_lj.ptx: gb_gpu_kernel_lj.cu pair_gpu_precision.h gb_gpu_extra.h
	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_lj.cu

$(OBJ_DIR)/gb_gpu_kernel_nbor.ptx: gb_gpu_kernel_nbor.cu pair_gpu_precision.h
	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_nbor.cu

$(OBJ_DIR)/gb_gpu_ptx.h: $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h

$(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_ptx.h
	$(CUDR) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
	$(CUDR) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_cut_gpu_kernel.ptx: lj_cut_gpu_kernel.cu pair_gpu_precision.h
	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_cut_gpu_kernel.cu

$(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_kernel.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h

$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
	$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
	$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
	$(CUDA) --ptx -DNV_KERNEL -o $@ ljc_cut_gpu_kernel.cu

$(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h

$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
	$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
	$(CUDA) --ptx -DNV_KERNEL -o $@ ljcl_cut_gpu_kernel.cu

$(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h

$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
	$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
	$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
	$(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu

$(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h

$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
	$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
	$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
	$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu

$(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h

$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
	$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
	$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_long_gpu_kernel.cu

$(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h

$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
	$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)

$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK)

$(GPU_LIB): $(OBJS)
	$(AR) -crusv $(GPU_LIB) $(OBJS)

clean:
	rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo

veryclean: clean
	rm -rf *~ *.linkinfo
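A note on the *_ptx.h pattern that dominates the Nvidia.makefile above: each kernel .cu file is first compiled to PTX with nvcc --ptx, then geryon/file_to_cstr.sh wraps that PTX text in a C string constant so the host code can compile it at run time through Geryon. A hedged C++ sketch of both sides of that hand-off; the string name lj_cut_gpu_kernel is an assumption, not verified against the generated header:

// Assumed shape of a generated lj_cut_gpu_ptx.h:
//   const char *lj_cut_gpu_kernel = ".version 1.4\n.target sm_13\n...";
#include "lj_cut_gpu_ptx.h"

// Host-side consumption, following compile_kernels() in
// atomic_gpu_memory.cpp later in this commit:
void load_pair_program(UCL_Program &prog) {
  prog.load_string(lj_cut_gpu_kernel, "");  // JIT the embedded PTX on the device
}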
@@ -0,0 +1,155 @@

# /* ----------------------------------------------------------------------
#    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
#    http://lammps.sandia.gov, Sandia National Laboratories
#    Steve Plimpton, sjplimp@sandia.gov
#
#    Copyright (2003) Sandia Corporation.  Under the terms of Contract
#    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
#    certain rights in this software.  This software is distributed under
#    the GNU General Public License.
#
#    See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
#    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
#                          Peng Wang (Nvidia), penwang@nvidia.com
#                          Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */

OCL = $(OCL_CPP) $(OCL_PREC) -DUSE_OPENCL
OCL_LIB = $(LIB_DIR)/libgpu.a
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H)
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
         pair_gpu_device.h pair_gpu_balance.h

ALL_H = $(OCL_H) $(PAIR_H)

EXECS = $(BIN_DIR)/ocl_get_devices
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
       $(OBJ_DIR)/charge_gpu_memory.o \
       $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
       $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
       $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
       $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
       $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
       $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
       $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
       $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
       $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
       $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
       $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h

OCL_EXECS = $(BIN_DIR)/ocl_get_devices

all: $(OCL_LIB) $(EXECS)

$(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom_cl.h

$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
	$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h

$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
	$(OCL) -o $@ -c pair_gpu_device.cpp

$(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
	$(OCL) -o $@ -c atomic_gpu_memory.cpp

$(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
	$(OCL) -o $@ -c charge_gpu_memory.cpp

$(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h

$(OBJ_DIR)/gb_gpu_cl.h: gb_gpu_kernel.cu gb_gpu_kernel_lj.cu gb_gpu_extra.h
	cat gb_gpu_extra.h gb_gpu_kernel.cu > $(OBJ_DIR)/gb_gpu_kernel.tar; \
	cat gb_gpu_extra.h gb_gpu_kernel_lj.cu > $(OBJ_DIR)/gb_gpu_kernel_lj.tar; \
	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar $(OBJ_DIR)/gb_gpu_cl.h; \
	rm -f $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar

$(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h
	$(OCL) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
	$(OCL) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
	$(BSH) ./geryon/file_to_cstr.sh lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_cl.h;

$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
	$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
	$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
	$(BSH) ./geryon/file_to_cstr.sh ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_cl.h;

$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
	$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
	$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
	$(BSH) ./geryon/file_to_cstr.sh ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_cl.h;

$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
	$(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h;

$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
	$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
	$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
	$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;

$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
	$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
	$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
	$(BSH) ./geryon/file_to_cstr.sh cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_cl.h;

$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
	$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
	$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)

$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

$(OCL_LIB): $(OBJS) $(PTXS)
	$(AR) -crusv $(OCL_LIB) $(OBJS)

opencl: $(OCL_EXECS)

clean:
	rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo

veryclean: clean
	rm -rf *~ *.linkinfo
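The OpenCL build above differs from the CUDA one in a telling way: there is no --ptx step. file_to_cstr.sh runs directly on the kernel .cu sources (plus the gb_gpu_extra.h concatenation), so the device code is embedded as OpenCL C text and compiled at run time. A minimal C++ sketch of that run-time compilation, mirroring compile_kernels() in atomic_gpu_memory.cpp later in this commit (the flag string is copied from there; dev and kernel_source are assumed names):

// dev: an initialized Geryon UCL_Device; kernel_source: an embedded
// kernel string from one of the *_cl.h headers built above
std::string flags = "-cl-fast-relaxed-math -cl-mad-enable " +
                    std::string(OCL_PRECISION_COMPILE);
UCL_Program prog(dev);
prog.load_string(kernel_source, flags.c_str());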
@@ -12,7 +12,7 @@

------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
@@ -20,57 +20,91 @@

GENERAL NOTES

This library, libgpu.a, provides routines for GPU acceleration
of LAMMPS pair styles. Currently, only CUDA enabled GPUs are
supported. Compilation of this library requires installing the CUDA
GPU driver and CUDA toolkit for your operating system. In addition to
the LAMMPS library, the binary nvc_get_devices will also be
built. This can be used to query the names and properties of GPU
devices on your system.
of LAMMPS pair styles. Compilation of this library requires
installing the CUDA GPU driver and CUDA toolkit for your operating
system. In addition to the LAMMPS library, the binary nvc_get_devices
will also be built. This can be used to query the names and
properties of GPU devices on your system. A Makefile for OpenCL
compilation is provided, but support for OpenCL use is not currently
provided by the developers.

NOTE: Installation of the CUDA SDK is not required.

Current pair styles supporting GPU acceleration:

1. lj/cut/gpu
2. gayberne/gpu
2. lj/cut/coul/cut/gpu
3. lj/cut/coul/long/gpu
4. lj96/cut/gpu
5. gayberne/gpu
6. cmm/cg/gpu
7. cmm/cg/coul/long/gpu

MULTIPLE LAMMPS PROCESSES

When using GPU acceleration, you are restricted to one physical GPU
per LAMMPS process. This can be multiple GPUs on a single node or
across multiple nodes. Instructions on GPU assignment can be found in
the LAMMPS documentation.

SPEEDUPS

The speedups that can be obtained using this library are highly
dependent on the GPU architecture and the computational expense of the
pair potential. When comparing a single precision Tesla C1060 run to a
serial Intel Xeon 5140 2.33 GHz serial run, the speedup is ~4.42x for
lj/cut with a cutoff of 2.5. For gayberne with a cutoff of 7, the
speedup is >103x for 8000 particles. The speedup will improve with an
increase in the number of particles or an increase in the cutoff.
Multiple LAMMPS MPI processes can share GPUs on the system, but multiple
GPUs cannot be utilized by a single MPI process. In many cases, the
best performance will be obtained by running as many MPI processes as
CPU cores available with the condition that the number of MPI processes
is an integer multiple of the number of GPUs being used. See the
LAMMPS user manual for details on running with GPU acceleration.

BUILDING AND PRECISION MODES

To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK files for
your machine. Type make. Additionally, the GPU package must be installed and
compiled for LAMMPS. The library supports 3 precision modes as determined by
the CUDA_PREC variable:
To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME, NVCC, CUDA_INCLUDE,
CUDA_LIB and CUDA_OPTS variables in one of the Makefiles. CUDA_ARCH should
be set based on the compute capability of your GPU. This can be verified by
running the nvc_get_devices executable after the build is complete.
Additionally, the GPU package must be installed and compiled for LAMMPS.
This may require editing the gpu_SYSPATH variable in the LAMMPS makefile.

Please note that the GPU library accesses the CUDA driver library directly,
so it needs to be linked not only to the CUDA runtime library (libcudart.so)
that ships with the CUDA toolkit, but also with the CUDA driver library
(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS
on the head node of a GPU cluster, this library may not be installed,
so you may need to copy it over from one of the compute nodes (best into
this directory).

The gpu library supports 3 precision modes as determined by
the CUDA_PRECISION variable:

CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations
CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations
CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double

NOTE: For the lj/cut pair style, only single precision will be used, even
if double precision is specified.

NOTE: Double precision is only supported on certain GPUS (with
NOTE: Double precision is only supported on certain GPUs (with
compute capability>=1.3).

NOTE: For Tesla and other graphics cards with compute capability>=1.3,
make sure that -arch=sm_13 is set on the CUDA_ARCH line.

NOTE: For Fermi, make sure that -arch=sm_20 is set on the CUDA_ARCH line.

NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
package has been installed before installing the GPU package in LAMMPS.

NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be
installed if the USER-CG-CMM package has been installed before
installing the GPU package in LAMMPS.

NOTE: The lj/cut/coul/long/gpu and cg/cmm/coul/long/gpu style will only be
installed if the KSPACE package has been installed before installing
the GPU package in LAMMPS.

EXAMPLE BUILD PROCESS

cd ~/lammps/lib/gpu
emacs Makefile.linux
make -f Makefile.linux
./nvc_get_devices
cd ../../src
emacs ./MAKE/Makefile.linux
make yes-asphere
make yes-kspace
make yes-gpu
make linux

------------------------------------------------------------------------
Last merge with gpulammps: r561 on 2010-11-12
------------------------------------------------------------------------
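The _SINGLE_DOUBLE mode described above exists because summing many small single-precision force and energy terms loses accuracy long before the individual terms do: once the running total is large enough, each new term falls below half an ulp and is rounded away. A self-contained C++ demonstration of the effect (illustration only, not LAMMPS code):

#include <cstdio>

int main() {
  float  single_acc = 0.0f;    // accumulate in single precision
  double double_acc = 0.0;     // accumulate the same terms in double
  const float term = 1.0e-4f;  // a typical small per-neighbor contribution
  for (long i = 0; i < 100000000L; ++i) {
    single_acc += term;
    double_acc += static_cast<double>(term);
  }
  // single_acc stalls near 2048 once term drops below half an ulp of the
  // total; double_acc stays close to the exact sum of 1.0e4
  std::printf("single: %g  double: %g\n", single_acc, double_acc);
  return 0;
}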
@@ -0,0 +1,262 @@

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include "atomic_gpu_memory.h"
#define AtomicGPUMemoryT AtomicGPUMemory<numtyp, acctyp>

extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

template <class numtyp, class acctyp>
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
  device=&pair_gpu_device;
}

template <class numtyp, class acctyp>
AtomicGPUMemoryT::~AtomicGPUMemory() {
}

template <class numtyp, class acctyp>
int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
}

template <class numtyp, class acctyp>
bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
                                   const int max_nbors, const int maxspecial,
                                   const double cell_size,
                                   const double gpu_split, FILE *_screen,
                                   const char *pair_program) {
  nbor_time_avail=false;
  screen=_screen;

  bool gpu_nbor=false;
  if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
    gpu_nbor=true;

  int _gpu_host=0;
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
  if (host_nlocal>0)
    _gpu_host=1;

  if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
                    _gpu_host,max_nbors,cell_size,false))
    return false;
  ucl_device=device->gpu;
  atom=&device->atom;
  nbor=&device->nbor;

  _block_size=BLOCK_1D;
  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
    _block_size=ucl_device->group_size();
  compile_kernels(*ucl_device,pair_program);

  // Initialize host-device load balancer
  hd_balancer.init(device,gpu_split);

  // Initialize timers for the selected GPU
  time_pair.init(*ucl_device);
  time_pair.zero();

  pos_tex.bind_float(atom->dev_x,4);

  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();

  return true;
}

template <class numtyp, class acctyp>
void AtomicGPUMemoryT::clear_atomic() {
  // Output any timing information
  acc_timers();
  double avg_split=hd_balancer.all_avg_split();
  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);

  if (_compiled) {
    k_pair_fast.clear();
    k_pair.clear();
    delete pair_program;
    _compiled=false;
  }

  time_pair.clear();
  hd_balancer.clear();

  device->clear();
}

// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
                                    int *numj, int **firstneigh, bool &success) {
  success=true;

  nbor_time_avail=true;

  int mn=nbor->max_nbor_loop(inum,numj);
  resize_atom(inum,nall,success);
  resize_local(inum,mn,success);
  if (!success)
    return false;

  nbor->get_host(inum,ilist,numj,firstneigh,block_size());

  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;

  return ilist;
}

// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
                                              const int host_inum,
                                              const int nall, double **host_x,
                                              int *host_type, double *boxlo,
                                              double *boxhi, int *tag,
                                              int **nspecial, int **special,
                                              bool &success) {
  nbor_time_avail=true;

  success=true;
  resize_atom(inum,nall,success);
  resize_local(inum,host_inum,nbor->max_nbors(),success);
  if (!success)
    return;
  atom->cast_copy_x(host_x,host_type);

  int mn;
  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
                        nspecial, special, success, mn);

  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;
}

// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
                               const int inum_full, const int nall,
                               double **host_x, int *host_type,
                               int *ilist, int *numj, int **firstneigh,
                               const bool eflag, const bool vflag,
                               const bool eatom, const bool vatom,
                               int &host_start, const double cpu_time,
                               bool &success) {
  acc_timers();
  if (inum_full==0) {
    zero_timers();
    return;
  }

  int ago=hd_balancer.ago_first(f_ago);
  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
                               nbor->gpu_nbor());
  atom->inum(inum);
  host_start=inum;

  if (ago==0) {
    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
    if (!success)
      return;
  }

  atom->cast_x_data(host_x,host_type);
  hd_balancer.start_timer();
  atom->add_x_data(host_x,host_type);

  loop(eflag,vflag);
  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
  hd_balancer.stop_timer();
}

// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
                                const int inum_full, const int nall,
                                double **host_x, int *host_type, double *boxlo,
                                double *boxhi, int *tag, int **nspecial,
                                int **special, const bool eflag,
                                const bool vflag, const bool eatom,
                                const bool vatom, int &host_start,
                                const double cpu_time, bool &success) {
  acc_timers();
  if (inum_full==0) {
    zero_timers();
    return NULL;
  }

  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
  atom->inum(inum);
  host_start=inum;

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                    boxlo, boxhi, tag, nspecial, special, success);
    if (!success)
      return NULL;
    hd_balancer.start_timer();
  } else {
    atom->cast_x_data(host_x,host_type);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }

  loop(eflag,vflag);
  atom->copy_answers(eflag,vflag,eatom,vatom);
  hd_balancer.stop_timer();

  return device->nbor.host_nbor.begin();
}

template <class numtyp, class acctyp>
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
  return device->atom.host_memory_usage()+
         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
         sizeof(AtomicGPUMemory<numtyp,acctyp>);
}

template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
  if (_compiled)
    return;

  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
                    std::string(OCL_PRECISION_COMPILE);

  pair_program=new UCL_Program(dev);
  pair_program->load_string(pair_str,flags.c_str());
  k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
  k_pair.set_function(*pair_program,"kernel_pair");
  pos_tex.get_texture(*pair_program,"pos_tex");

  _compiled=true;
}

template class AtomicGPUMemory<PRECISION,ACC_PRECISION>;
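The file above closes with an explicit instantiation, template class AtomicGPUMemory<PRECISION,ACC_PRECISION>;, which is needed because the template's member definitions live in the .cpp rather than the header: the compiler must be forced to emit code for one concrete type pair that the linker can resolve. A minimal standalone C++ illustration of the same pattern; the PRECISION/ACC_PRECISION typedefs here are assumptions modeled on the -D_SINGLE_SINGLE family of flags (pair_gpu_precision.h presumably does the real mapping):

// A template whose members are defined out of line, as in the file above
template <class numtyp, class acctyp>
class Accumulator {
 public:
  void add(numtyp x) { total_ += static_cast<acctyp>(x); }
  acctyp total() const { return total_; }
 private:
  acctyp total_ = 0;
};

// Assumed stand-ins for what pair_gpu_precision.h defines
typedef float PRECISION;
typedef float ACC_PRECISION;

// Emit all members for this one combination so other translation units
// can link against them
template class Accumulator<PRECISION, ACC_PRECISION>;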
@ -0,0 +1,180 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef ATOMIC_GPU_MEMORY_H
#define ATOMIC_GPU_MEMORY_H

#define BLOCK_1D 64

#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"

#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

template <class numtyp, class acctyp>
class AtomicGPUMemory {
 public:
  AtomicGPUMemory();
  virtual ~AtomicGPUMemory();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *screen,
                   const char *pair_program);

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(inum, nall, success))
      pos_tex.bind_float(atom->dev_x,4);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param max_nbors current maximum number of neighbors
    * \note olist_size=total number of local particles **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to be copied to host
    * \param max_nbors current maximum number of neighbors
    * \note host_inum is 0 if the host is performing neighboring
    * \note inum+host_inum=total number of local particles
    * \note olist_size=0 **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear_atomic();

  /// Returns memory usage on device per atom
  int bytes_per_atom_atomic(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage_atomic() const;

  /// Accumulate timers
  inline void acc_timers() {
    if (nbor_time_avail) {
      nbor->time_nbor.add_to_total();
      nbor->time_kernel.add_to_total();
      nbor_time_avail=false;
    }
    time_pair.add_to_total();
    atom->acc_timers();
  }

  /// Zero timers
  inline void zero_timers() {
    nbor_time_avail=false;
    time_pair.zero();
    atom->zero_timers();
  }

  /// Copy neighbor list from host
  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
                    int **firstneigh, bool &success);

  /// Build neighbor list on device
  void build_nbor_list(const int inum, const int host_inum,
                       const int nall, double **host_x, int *host_type,
                       double *boxlo, double *boxhi, int *tag, int **nspecial,
                       int **special, bool &success);

  /// Pair loop with host neighboring
  void compute(const int timestep, const int f_ago, const int inum_full,
               const int nall, double **host_x, int *host_type,
               int *ilist, int *numj, int **firstneigh, const bool eflag,
               const bool vflag, const bool eatom, const bool vatom,
               int &host_start, const double cpu_time, bool &success);

  /// Pair loop with device neighboring
  int * compute(const int timestep, const int ago, const int inum_full,
                const int nall, double **host_x, int *host_type, double *boxlo,
                double *boxhi, int *tag, int **nspecial,
                int **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                const double cpu_time, bool &success);

  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  PairGPUDevice<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_pair;

  /// Host device load balancer
  PairGPUBalance<numtyp,acctyp> hd_balancer;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  PairGPUAtom<numtyp,acctyp> *atom;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  PairGPUNbor *nbor;

  /// True if we need to accumulate time for neighboring
  bool nbor_time_avail;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
  UCL_Kernel k_pair_fast, k_pair;
  inline int block_size() { return _block_size; }

  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex;

 protected:
  bool _compiled;
  int _block_size;
  double _max_bytes, _max_an_bytes;

  void compile_kernels(UCL_Device &dev, const char *pair_string);

  virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
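
/* A concrete pair style derives from this class and supplies the device
   force loop; CMM_GPU_Memory later in this commit is a real example. A
   minimal sketch, assuming only the members declared above:

     template <class numtyp, class acctyp>
     class MyGPUMemory : public AtomicGPUMemory<numtyp,acctyp> {
      private:
       // launch k_pair / k_pair_fast on this->atom and this->nbor data
       void loop(const bool _eflag, const bool _vflag);
     };
*/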

#endif
@ -0,0 +1,270 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include "charge_gpu_memory.h"
#define ChargeGPUMemoryT ChargeGPUMemory<numtyp, acctyp>

extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

template <class numtyp, class acctyp>
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
  device=&pair_gpu_device;
}

template <class numtyp, class acctyp>
ChargeGPUMemoryT::~ChargeGPUMemory() {
}

template <class numtyp, class acctyp>
int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
}

template <class numtyp, class acctyp>
bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
                                   const int max_nbors, const int maxspecial,
                                   const double cell_size,
                                   const double gpu_split, FILE *_screen,
                                   const char *pair_program) {
  nbor_time_avail=false;
  screen=_screen;

  bool gpu_nbor=false;
  if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
    gpu_nbor=true;

  int _gpu_host=0;
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
  if (host_nlocal>0)
    _gpu_host=1;

  if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
                    _gpu_host,max_nbors,cell_size,false))
    return false;
  ucl_device=device->gpu;
  atom=&device->atom;
  nbor=&device->nbor;

  _block_size=BLOCK_1D;
  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
    _block_size=ucl_device->group_size();
  compile_kernels(*ucl_device,pair_program);

  // Initialize host-device load balancer
  hd_balancer.init(device,gpu_split);

  // Initialize timers for the selected GPU
  time_pair.init(*ucl_device);
  time_pair.zero();

  pos_tex.bind_float(atom->dev_x,4);
  q_tex.bind_float(atom->dev_q,1);

  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();

  return true;
}
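
// Note on ordering in init_atomic(): the device must be initialized before
// the block size can be clamped to its work-group limit, the kernels must be
// compiled before pos_tex/q_tex can be attached to the program, and only then
// are the textures bound to the device position and charge buffers. This is
// read off the code above rather than from separate documentation.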

template <class numtyp, class acctyp>
void ChargeGPUMemoryT::clear_atomic() {
  // Output any timing information
  acc_timers();
  double avg_split=hd_balancer.all_avg_split();
  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);

  if (_compiled) {
    k_pair_fast.clear();
    k_pair.clear();
    delete pair_program;
    _compiled=false;
  }

  time_pair.clear();
  hd_balancer.clear();

  device->clear();
}

// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
                                    int *numj, int **firstneigh, bool &success) {
  success=true;

  nbor_time_avail=true;

  int mn=nbor->max_nbor_loop(inum,numj);
  resize_atom(inum,nall,success);
  resize_local(inum,mn,success);
  if (!success)
    return NULL;

  nbor->get_host(inum,ilist,numj,firstneigh,block_size());

  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;

  return ilist;
}

// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
                                              const int host_inum,
                                              const int nall, double **host_x,
                                              int *host_type, double *boxlo,
                                              double *boxhi, int *tag,
                                              int **nspecial, int **special,
                                              bool &success) {
  nbor_time_avail=true;

  success=true;
  resize_atom(inum,nall,success);
  resize_local(inum,host_inum,nbor->max_nbors(),success);
  if (!success)
    return;
  atom->cast_copy_x(host_x,host_type);

  int mn;
  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
                        nspecial, special, success, mn);

  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;
}

// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
                               const int inum_full, const int nall,
                               double **host_x, int *host_type,
                               int *ilist, int *numj, int **firstneigh,
                               const bool eflag, const bool vflag,
                               const bool eatom, const bool vatom,
                               int &host_start, const double cpu_time,
                               bool &success, double *host_q) {
  acc_timers();
  if (inum_full==0) {
    zero_timers();
    return;
  }

  int ago=hd_balancer.ago_first(f_ago);
  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
                               nbor->gpu_nbor());
  atom->inum(inum);
  host_start=inum;

  if (ago==0) {
    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
    if (!success)
      return;
  }

  atom->cast_x_data(host_x,host_type);
  atom->cast_q_data(host_q);
  hd_balancer.start_timer();
  atom->add_x_data(host_x,host_type);
  atom->add_other_data();

  loop(eflag,vflag);
  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
  hd_balancer.stop_timer();
}

// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
                                const int inum_full, const int nall,
                                double **host_x, int *host_type, double *boxlo,
                                double *boxhi, int *tag, int **nspecial,
                                int **special, const bool eflag,
                                const bool vflag, const bool eatom,
                                const bool vatom, int &host_start,
                                const double cpu_time, bool &success,
                                double *host_q) {
  acc_timers();
  if (inum_full==0) {
    zero_timers();
    return NULL;
  }

  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
  atom->inum(inum);
  host_start=inum;

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                    boxlo, boxhi, tag, nspecial, special, success);
    if (!success)
      return NULL;
    atom->cast_q_data(host_q);
    hd_balancer.start_timer();
  } else {
    atom->cast_x_data(host_x,host_type);
    atom->cast_q_data(host_q);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }
  atom->add_other_data();

  loop(eflag,vflag);
  atom->copy_answers(eflag,vflag,eatom,vatom);
  hd_balancer.stop_timer();

  return device->nbor.host_nbor.begin();
}
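
// Both compute() paths above re-cast and upload the per-atom charges each
// call (cast_q_data/add_other_data) in addition to positions, since the
// kernels read q alongside x. On reneighbor steps with device neighboring,
// positions are skipped here because build_nbor_list() has already copied
// them via cast_copy_x().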

template <class numtyp, class acctyp>
double ChargeGPUMemoryT::host_memory_usage_atomic() const {
  return device->atom.host_memory_usage()+
         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
         sizeof(ChargeGPUMemory<numtyp,acctyp>);
}

template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
  if (_compiled)
    return;

  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
                    std::string(OCL_PRECISION_COMPILE);

  pair_program=new UCL_Program(dev);
  pair_program->load_string(pair_str,flags.c_str());
  k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
  k_pair.set_function(*pair_program,"kernel_pair");
  pos_tex.get_texture(*pair_program,"pos_tex");
  q_tex.get_texture(*pair_program,"q_tex");

  _compiled=true;
}

template class ChargeGPUMemory<PRECISION,ACC_PRECISION>;
@ -0,0 +1,183 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef CHARGE_GPU_MEMORY_H
#define CHARGE_GPU_MEMORY_H

#define BLOCK_1D 64

#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"

#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif

template <class numtyp, class acctyp>
class ChargeGPUMemory {
 public:
  ChargeGPUMemory();
  virtual ~ChargeGPUMemory();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *screen,
                   const char *pair_program);

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(inum, nall, success)) {
      pos_tex.bind_float(atom->dev_x,4);
      q_tex.bind_float(atom->dev_q,1);
    }
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param max_nbors current maximum number of neighbors
    * \note olist_size=total number of local particles **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to be copied to host
    * \param max_nbors current maximum number of neighbors
    * \note host_inum is 0 if the host is performing neighboring
    * \note inum+host_inum=total number of local particles
    * \note olist_size=0 **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear_atomic();

  /// Returns memory usage on device per atom
  int bytes_per_atom_atomic(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage_atomic() const;

  /// Accumulate timers
  inline void acc_timers() {
    if (nbor_time_avail) {
      nbor->time_nbor.add_to_total();
      nbor->time_kernel.add_to_total();
      nbor_time_avail=false;
    }
    time_pair.add_to_total();
    atom->acc_timers();
  }

  /// Zero timers
  inline void zero_timers() {
    nbor_time_avail=false;
    time_pair.zero();
    atom->zero_timers();
  }

  /// Copy neighbor list from host
  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
                    int **firstneigh, bool &success);

  /// Build neighbor list on device
  void build_nbor_list(const int inum, const int host_inum,
                       const int nall, double **host_x, int *host_type,
                       double *boxlo, double *boxhi, int *tag, int **nspecial,
                       int **special, bool &success);

  /// Pair loop with host neighboring
  void compute(const int timestep, const int f_ago, const int inum_full,
               const int nall, double **host_x, int *host_type,
               int *ilist, int *numj, int **firstneigh, const bool eflag,
               const bool vflag, const bool eatom, const bool vatom,
               int &host_start, const double cpu_time, bool &success,
               double *charge);

  /// Pair loop with device neighboring
  int * compute(const int timestep, const int ago, const int inum_full,
                const int nall, double **host_x, int *host_type, double *boxlo,
                double *boxhi, int *tag, int **nspecial,
                int **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                const double cpu_time, bool &success, double *charge);

  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  PairGPUDevice<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_pair;

  /// Host device load balancer
  PairGPUBalance<numtyp,acctyp> hd_balancer;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  PairGPUAtom<numtyp,acctyp> *atom;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  PairGPUNbor *nbor;

  /// True if we need to accumulate time for neighboring
  bool nbor_time_avail;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
  UCL_Kernel k_pair_fast, k_pair;
  inline int block_size() { return _block_size; }

  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex;
  UCL_Texture q_tex;

 protected:
  bool _compiled;
  int _block_size;
  double _max_bytes, _max_an_bytes;

  void compile_kernels(UCL_Device &dev, const char *pair_string);

  virtual void loop(const bool _eflag, const bool _vflag) = 0;
};

#endif
@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include <iostream>
#include <cassert>
#include <math.h>

#include "cmm_cut_gpu_memory.h"

using namespace std;

static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int inum, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode,
                  FILE *screen) {
  CMMMF.clear();
  gpu_mode=CMMMF.device->gpu_mode();
  double gpu_split=CMMMF.device->particle_split();
  int first_gpu=CMMMF.device->first_device();
  int last_gpu=CMMMF.device->last_device();
  int world_me=CMMMF.device->world_me();
  int gpu_rank=CMMMF.device->gpu_rank();
  int procs_per_gpu=CMMMF.device->procs_per_gpu();

  CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu);

  bool message=false;
  if (world_me==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
                            host_lj4, offset, special_lj, inum, nall, 300,
                            maxspecial, cell_size, gpu_split, screen);
    if (!init_ok)
      return false;
  }

  MPI_Barrier(MPI_COMM_WORLD);
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
                              host_lj4, offset, special_lj, inum, nall, 300,
                              maxspecial, cell_size, gpu_split, screen);
      if (!init_ok)
        return false;
    }
    MPI_Barrier(CMMMF.device->gpu_comm);
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;
}

void cmm_gpu_clear() {
  CMMMF.clear();
}

int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full,
                        const int nall, double **host_x, int *host_type,
                        double *boxlo, double *boxhi, int *tag, int **nspecial,
                        int **special, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        const double cpu_time, bool &success) {
  return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, cpu_time, success);
}

void cmm_gpu_compute(const int timestep, const int ago, const int inum_full,
                     const int nall, double **host_x, int *host_type,
                     int *ilist, int *numj, int **firstneigh,
                     const bool eflag, const bool vflag, const bool eatom,
                     const bool vatom, int &host_start, const double cpu_time,
                     bool &success) {
  CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

double cmm_gpu_bytes() {
  return CMMMF.host_memory_usage();
}
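
/* Usage sketch (hypothetical caller, not part of this commit): a host pair
   style would call cmm_gpu_init() once during setup, then each timestep call
   either cmm_gpu_compute_n() (the device builds and returns the neighbor
   list) or cmm_gpu_compute() (a host-built neighbor list is passed in), and
   finally cmm_gpu_clear() at teardown. The real call sites live in the
   corresponding host pair style, outside this file. */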
@ -0,0 +1,296 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef CMM_GPU_KERNEL
#define CMM_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif

#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif

#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;

#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
  return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
  return tex1Dfetch(pos_tex, i);
}
#endif

#else

#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline

#define fetch_pos(i,y) x_[i]

#endif

__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nall, const int nbor_pitch) {
  // ii indexes the particle computed by this thread
  int ii=GLOBAL_ID_X;
  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
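
      // Neighbor entries with j >= nall encode special (1-2, 1-3, 1-4)
      // bonded pairs: j/nall picks the special_lj scaling factor and
      // j%nall recovers the real atom index. This reading is inferred
      // from the decode above and the 4-entry sp_lj array.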
      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (rsq<lj1[mtype].x) {
        numtyp r2inv=(numtyp)1.0/rsq;
        numtyp inv1,inv2;

        if (lj1[mtype].y == (numtyp)2) {
          inv1=r2inv*r2inv;
          inv2=inv1*inv1;
        } else if (lj1[mtype].y == (numtyp)1) {
          inv2=r2inv*sqrt(r2inv);
          inv1=inv2*inv2;
        } else {
          inv1=r2inv*r2inv*r2inv;
          inv2=inv1;
        }
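
        // lj1[mtype].y holds the cg_type and appears to select the CG-CMM
        // exponent pair: 1 -> LJ9-6, 2 -> LJ12-4, anything else -> LJ12-6.
        // With these choices inv1*inv2 gives the repulsive power and inv1
        // the attractive power in the force expression below.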
        numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;
        if (eflag>0)
          energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
                    lj3[mtype].z;
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch) {
  // ii indexes the particle computed by this thread
  int ii=THREAD_ID_X;
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (ii<4)
    sp_lj[ii]=sp_lj_in[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
    if (eflag>0)
      lj3[ii]=lj3_in[ii];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<lj1[mtype].x) {
        numtyp r2inv=(numtyp)1.0/rsq;
        numtyp inv1,inv2;

        if (lj1[mtype].y == (numtyp)2) {
          inv1=r2inv*r2inv;
          inv2=inv1*inv1;
        } else if (lj1[mtype].y == (numtyp)1) {
          inv2=r2inv*sqrt(r2inv);
          inv1=inv2*inv2;
        } else {
          inv1=r2inv*r2inv*r2inv;
          inv2=inv1;
        }
        numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;
        if (eflag>0)
          energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
                    lj3[mtype].z;
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}

#endif
@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifdef USE_OPENCL
#include "cmm_cut_gpu_cl.h"
#else
#include "cmm_cut_gpu_ptx.h"
#endif

#include "cmm_cut_gpu_memory.h"
#include <cassert>
#include <math.h>
#define CMM_GPU_MemoryT CMM_GPU_Memory<numtyp, acctyp>

extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

template <class numtyp, class acctyp>
CMM_GPU_MemoryT::CMM_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(),
                                    _allocated(false) {
}

template <class numtyp, class acctyp>
CMM_GPU_MemoryT::~CMM_GPU_Memory() {
  clear();
}

template <class numtyp, class acctyp>
int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                           int **host_cg_type, double **host_lj1,
                           double **host_lj2, double **host_lj3,
                           double **host_lj4, double **host_offset,
                           double *host_special_lj, const int nlocal,
                           const int nall, const int max_nbors,
                           const int maxspecial, const double cell_size,
                           const double gpu_split, FILE *_screen) {
  if (!this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                         _screen,cmm_cut_gpu_kernel))
    return false;

  // If atom type constants fit in shared memory use fast kernel
  int cmm_types=ntypes;
  shared_types=false;
  if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
    cmm_types=MAX_SHARED_TYPES;
    shared_types=true;
  }
  _cmm_types=cmm_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
                               UCL_WRITE_OPTIMIZED);

  for (int i=0; i<cmm_types*cmm_types; i++)
    host_write[i]=0.0;

  lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
                         host_cg_type,host_lj1,host_lj2);

  lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
                         host_offset);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
  dview.view(host_special_lj,4,*(this->ucl_device));
  ucl_copy(sp_lj,dview,false);

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
  return true;
}

template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  lj1.clear();
  lj3.clear();
  sp_lj.clear();
  this->clear_atomic();
}

template <class numtyp, class acctyp>
double CMM_GPU_MemoryT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(CMM_GPU_Memory<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag, vflag;
  if (_eflag)
    eflag=1;
  else
    eflag=0;

  if (_vflag)
    vflag=1;
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));

  int ainum=this->atom->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                          &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch);
  }
  this->time_pair.stop();
}
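
// loop() launches GX blocks of BX threads, one thread per listed particle.
// When the type constants fit in shared memory (shared_types), the fast
// kernel stages lj1/lj3 into __local arrays; otherwise the generic kernel
// indexes the full type matrix in global memory using the _cmm_types stride.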

template class CMM_GPU_Memory<PRECISION,ACC_PRECISION>;
@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef CMM_GPU_MEMORY_H
#define CMM_GPU_MEMORY_H

#include "atomic_gpu_memory.h"

template <class numtyp, class acctyp>
class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
 public:
  CMM_GPU_Memory();
  ~CMM_GPU_Memory();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// lj1.x = cutsq, lj1.y = cg_type, lj1.z = lj1, lj1.w = lj2
  UCL_D_Vec<numtyp4> lj1;
  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
  UCL_D_Vec<numtyp4> lj3;
  /// Special LJ values
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _cmm_types;

 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
};

#endif
@ -0,0 +1,130 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include <iostream>
#include <cassert>
#include <math.h>

#include "cmmc_long_gpu_memory.h"

using namespace std;

static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                   double **host_lj1, double **host_lj2, double **host_lj3,
                   double **host_lj4, double **offset, double *special_lj,
                   const int inum, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size, int &gpu_mode,
                   FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
                   double *host_special_coul, const double qqrd2e,
                   const double g_ewald) {
  CMMLMF.clear();
  gpu_mode=CMMLMF.device->gpu_mode();
  double gpu_split=CMMLMF.device->particle_split();
  int first_gpu=CMMLMF.device->first_device();
  int last_gpu=CMMLMF.device->last_device();
  int world_me=CMMLMF.device->world_me();
  int gpu_rank=CMMLMF.device->gpu_rank();
  int procs_per_gpu=CMMLMF.device->procs_per_gpu();

  CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu);

  bool message=false;
  if (world_me==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
                             host_lj3, host_lj4, offset, special_lj, inum,
                             nall, 300, maxspecial, cell_size, gpu_split,
                             screen, host_cut_ljsq, host_cut_coulsq,
                             host_special_coul, qqrd2e, g_ewald);
    if (!init_ok)
      return false;
  }

  MPI_Barrier(MPI_COMM_WORLD);
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
                               host_lj3, host_lj4, offset, special_lj, inum,
                               nall, 300, maxspecial, cell_size, gpu_split,
                               screen, host_cut_ljsq, host_cut_coulsq,
                               host_special_coul, qqrd2e, g_ewald);
      if (!init_ok)
        return false;
    }
    MPI_Barrier(CMMLMF.device->gpu_comm);
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;
}

void cmml_gpu_clear() {
  CMMLMF.clear();
}

int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *boxlo, double *boxhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         const double cpu_time, bool &success, double *host_q) {
  return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type,
                        boxlo, boxhi, tag, nspecial, special, eflag, vflag,
                        eatom, vatom, host_start, cpu_time, success, host_q);
}

void cmml_gpu_compute(const int timestep, const int ago, const int inum_full,
                      const int nall, double **host_x, int *host_type,
                      int *ilist, int *numj, int **firstneigh,
                      const bool eflag, const bool vflag, const bool eatom,
                      const bool vatom, int &host_start, const double cpu_time,
                      bool &success, double *host_q) {
  CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,
                 success,host_q);
}

double cmml_gpu_bytes() {
  return CMMLMF.host_memory_usage();
}
@ -0,0 +1,378 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef CMML_GPU_KERNEL
#define CMML_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif

#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif

#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif

#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
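
/* These constants implement the Abramowitz & Stegun 7.1.26 rational
   approximation to the complementary error function used in the kernels
   below:

     erfc(x) ~= (A1*t + A2*t^2 + A3*t^3 + A4*t^4 + A5*t^5) * exp(-x*x),
     t = 1/(1 + EWALD_P*x)

   EWALD_F = 2/sqrt(pi) is the prefactor of the exp(-x*x) term in the
   derivative, which appears in the real-space Ewald force. */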
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "geryon/ucl_nv_kernel.h"
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
__inline double4 fetch_pos(const int& i, const double4 *pos)
|
||||
{
|
||||
return pos[i];
|
||||
}
|
||||
__inline double fetch_q(const int& i, const double *q)
|
||||
{
|
||||
return q[i];
|
||||
}
|
||||
#else
|
||||
__inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{
|
||||
return tex1Dfetch(pos_tex, i);
|
||||
}
|
||||
__inline float fetch_q(const int& i, const float *q)
|
||||
{
|
||||
return tex1Dfetch(q_tex, i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
#define GLOBAL_ID_X get_global_id(0)
|
||||
#define THREAD_ID_X get_local_id(0)
|
||||
#define BLOCK_ID_X get_group_id(0)
|
||||
#define BLOCK_SIZE_X get_local_size(0)
|
||||
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define __inline inline
|
||||
|
||||
#define fetch_pos(i,y) x_[i]
|
||||
#define fetch_q(i,y) q_[i]
|
||||
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nall, const int nbor_pitch,
|
||||
__global numtyp *q_ , const numtyp cut_coulsq,
|
||||
const numtyp qqrd2e, const numtyp g_ewald) {
|
||||
// ii indexes the two interacting particles in gi
|
||||
int ii=GLOBAL_ID_X;
|
||||
__local numtyp sp_lj[8];
|
||||
sp_lj[0]=sp_lj_in[0];
|
||||
sp_lj[1]=sp_lj_in[1];
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
sp_lj[4]=sp_lj_in[4];
|
||||
sp_lj[5]=sp_lj_in[5];
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
if (ii<inum) {
|
||||
acctyp energy=(numtyp)0;
|
||||
acctyp e_coul=(numtyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(numtyp)0;
|
||||
f.y=(numtyp)0;
|
||||
f.z=(numtyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(numtyp)0;
|
||||
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
__global int *list_end=nbor+mul24(numj,nbor_pitch);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=nbor_pitch) {
|
||||
int j=*nbor;
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
      if (j < nall) {
        factor_lj = (numtyp)1.0;
        factor_coul = (numtyp)0.0;
      } else {
        factor_lj = sp_lj[j/nall];
        factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
        j %= nall;
      }
      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (rsq<lj1[mtype].x) {
        numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
        numtyp r2inv=(numtyp)1.0/rsq;

        if (rsq < lj1[mtype].y) {
          if (lj3[mtype].x == (numtyp)2) {
            inv1=r2inv*r2inv;
            inv2=inv1*inv1;
          } else if (lj3[mtype].x == (numtyp)1) {
            inv2=r2inv*sqrt(r2inv);
            inv1=inv2*inv2;
          } else {
            inv1=r2inv*r2inv*r2inv;
            inv2=inv1;
          }
          force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
        } else
          force_lj = (numtyp)0.0;

        if (rsq < cut_coulsq) {
          numtyp r = sqrt(rsq);
          numtyp grij = g_ewald * r;
          numtyp expm2 = exp(-grij*grij);
          numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
        } else {
          forcecoul = (numtyp)0.0;
          prefactor = (numtyp)0.0;
        }

        force = (force_lj + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < lj1[mtype].y) {
            energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
                      lj3[mtype].w;
          }
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
      *ap1=e_coul;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch,
                               __global numtyp *q_, const numtyp cut_coulsq,
                               const numtyp qqrd2e, const numtyp g_ewald) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];
  if (ii<8)
    sp_lj[ii]=sp_lj_in[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
    lj3[ii]=lj3_in[ii];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp e_coul=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    numtyp qtmp=fetch_q(i,q_);
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=nbor_pitch) {
      int j=*nbor;

      numtyp factor_lj, factor_coul;
      if (j < nall) {
        factor_lj = (numtyp)1.0;
        factor_coul = (numtyp)0.0;
      } else {
        factor_lj = sp_lj[j/nall];
        factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
        j %= nall;
      }
      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<lj1[mtype].x) {
        numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
        numtyp r2inv=(numtyp)1.0/rsq;

        if (rsq < lj1[mtype].y) {
          if (lj3[mtype].x == (numtyp)2) {
            inv1=r2inv*r2inv;
            inv2=inv1*inv1;
          } else if (lj3[mtype].x == (numtyp)1) {
            inv2=r2inv*sqrt(r2inv);
            inv1=inv2*inv2;
          } else {
            inv1=r2inv*r2inv*r2inv;
            inv2=inv1;
          }
          force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
        } else
          force_lj = (numtyp)0.0;

        if (rsq < cut_coulsq) {
          numtyp r = sqrt(rsq);
          numtyp grij = g_ewald * r;
          numtyp expm2 = exp(-grij*grij);
          numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
        } else {
          forcecoul = (numtyp)0.0;
          prefactor = (numtyp)0.0;
        }

        force = (force_lj + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < lj1[mtype].y) {
            energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
                      lj3[mtype].w;
          }
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
      *ap1=e_coul;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}

#endif
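/* The _erfc evaluation in both kernels above is the Abramowitz & Stegun
   7.1.26 rational approximation: erfc(x) ~ t*(A1+t*(A2+t*(A3+t*(A4+t*A5))))
   * exp(-x*x) with t = 1/(1+EWALD_P*x).  A minimal host-side reference
   sketch follows, assuming the device macros EWALD_P and A1..A5 carry the
   standard A&S coefficient values: */
#include <math.h>
static double erfc_approx(double x) {
  const double p  = 0.3275911;                       // assumed value of EWALD_P
  const double a1 = 0.254829592,  a2 = -0.284496736; // assumed A1, A2
  const double a3 = 1.421413741,  a4 = -1.453152027; // assumed A3, A4
  const double a5 = 1.061405429;                     // assumed A5
  double t = 1.0 / (1.0 + p * x);
  return t * (a1 + t * (a2 + t * (a3 + t * (a4 + t * a5)))) * exp(-x * x);
}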
@@ -0,0 +1,164 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifdef USE_OPENCL
#include "cmmc_long_gpu_cl.h"
#else
#include "cmmc_long_gpu_ptx.h"
#endif

#include "cmmc_long_gpu_memory.h"
#include <cassert>
#define CMML_GPU_MemoryT CMML_GPU_Memory<numtyp, acctyp>

extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

template <class numtyp, class acctyp>
CMML_GPU_MemoryT::CMML_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
                                      _allocated(false) {
}

template <class numtyp, class acctyp>
CMML_GPU_MemoryT::~CMML_GPU_Memory() {
  clear();
}

template <class numtyp, class acctyp>
int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                            int **host_cg_type, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,
                            double *host_special_lj, const int nlocal,
                            const int nall, const int max_nbors,
                            const int maxspecial, const double cell_size,
                            const double gpu_split, FILE *_screen,
                            double **host_cut_ljsq,
                            const double host_cut_coulsq,
                            double *host_special_coul, const double qqrd2e,
                            const double g_ewald) {
  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                    _screen,cmmc_long_gpu_kernel);

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                               UCL_WRITE_OPTIMIZED);

  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq,
                         host_cut_ljsq,host_lj1,host_lj2);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3,
                         host_lj4,host_offset);

  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<4; i++) {
    host_write[i]=host_special_lj[i];
    host_write[i+4]=host_special_coul[i];
  }
  ucl_copy(sp_lj,host_write,8,false);

  _cut_coulsq=host_cut_coulsq;
  _qqrd2e=qqrd2e;
  _g_ewald=g_ewald;

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
  return true;
}

template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  lj1.clear();
  lj3.clear();
  sp_lj.clear();
  this->clear_atomic();
}

template <class numtyp, class acctyp>
double CMML_GPU_MemoryT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(CMML_GPU_Memory<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag, vflag;
  if (_eflag)
    eflag=1;
  else
    eflag=0;

  if (_vflag)
    vflag=1;
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));

  int ainum=this->atom->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                          &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch,
                          &this->atom->dev_q.begin(), &_cut_coulsq,
                          &_qqrd2e, &_g_ewald);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                     &_cut_coulsq, &_qqrd2e, &_g_ewald);
  }
  this->time_pair.stop();
}

template class CMML_GPU_Memory<PRECISION,ACC_PRECISION>;
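// Launch-size arithmetic used in loop() above: one thread per local atom,
// GX = ceil(inum/BX) blocks of BX threads each.  A quick worked example
// (illustrative numbers only, not from this commit): inum = 10000 atoms
// with block size BX = 64 gives
//   int GX = static_cast<int>(ceil(10000.0/64));   // GX == 157
// i.e. 10048 threads, of which the final 48 fail the "ii < inum" guard
// inside the kernels and do no work.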
@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef CMML_GPU_MEMORY_H
#define CMML_GPU_MEMORY_H

#include "charge_gpu_memory.h"

template <class numtyp, class acctyp>
class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
 public:
  CMML_GPU_Memory();
  ~CMML_GPU_Memory();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double **host_cutsq, int **cg_type,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const double qqrd2e, const double g_ewald);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2
  UCL_D_Vec<numtyp4> lj1;
  /// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
  UCL_D_Vec<numtyp4> lj3;
  /// Special LJ values [0-3] and Special Coul values [4-7]
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

  numtyp _cut_coulsq, _qqrd2e, _g_ewald;

 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
};

#endif

@@ -0,0 +1,5 @@
This is a stripped down and customized version
of the CUDPP (CUDA Data Parallel Primitives) library for
use with the GPU package in LAMMPS.
Don't use it for anything else; get the real thing
from http://code.google.com/p/cudpp/ instead!

@@ -0,0 +1,337 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include <cudpp_globals.h>
#include "cudpp_radixsort.h"
#include "cta/scan_cta.cu"
#include <cudpp.h>
#include <stdio.h>

#include <cudpp_util.h>
#include <math.h>
#include "sharedmem.h"


#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif

/**
 * @file
 * sort_cta.cu
 *
 * @brief CUDPP CTA-level sort routines
 */

/** \addtogroup cudpp_cta
 * @{
 */

/** @name Radix Sort Functions
 * @{
 */


typedef unsigned int uint;

/**
 * @brief Flips bits of single-precision floating-point number (parameterized by doFlip)
 *
 * Flips a float for sorting: finds the SIGN of the fp number.
 * If it's 1 (negative float), it flips all bits.
 * If it's 0 (positive float), it flips the sign only.
 * @param[in] f floating-point input (passed as unsigned int)
 * @see floatUnflip
 **/
template <bool doFlip>
__device__ uint floatFlip(uint f)
{
    if (doFlip)
    {
        uint mask = -int(f >> 31) | 0x80000000;
        return f ^ mask;
    }
    else
        return f;
}
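// Worked example of the flip (values chosen for illustration):
//   floatFlip(0x3F800000 /*  1.0f */) -> 0xBF800000
//   floatFlip(0xBF800000 /* -1.0f */) -> 0x407FFFFF
//   floatFlip(0x80000000 /* -0.0f */) -> 0x7FFFFFFF
//   floatFlip(0x00000000 /*  0.0f */) -> 0x80000000
// After flipping, plain unsigned comparison orders the keys as
// -1.0f < -0.0f < 0.0f < 1.0f, so a radix sort over the flipped bits
// sorts the underlying floats correctly (NaNs excepted).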
/**
 * @brief Reverses bit-flip of single-precision floating-point number (parameterized by doFlip)
 *
 * Flips a float back (inverts floatFlip). The sign was flipped above, so:
 * if the sign is 1 (negative), it flips the sign bit back;
 * if the sign is 0 (positive), it flips all bits back.
 * @param[in] f floating-point input (passed as unsigned int)
 * @see floatFlip
 **/
template <bool doFlip>
__device__ uint floatUnflip(uint f)
{
    if (doFlip)
    {
        uint mask = ((f >> 31) - 1) | 0x80000000;
        return f ^ mask;
    }
    else
        return f;
}

/**
 * @brief Scans one warp quickly, optimized for 32-element warps, using shared memory
 *
 * Scans each warp in parallel ("warp-scan"), one element per thread.
 * Uses 2 elements of shared memory per thread (64 elements per warp).
 *
 * @param[in] val This thread's input element to the scan
 * @param[in,out] sData Shared scan workspace (2 * WARP_SIZE elements per warp)
 **/
template<class T, int maxlevel>
__device__ T scanwarp(T val, volatile T* sData)
{
    // The following is the same as 2 * WARP_SIZE * warpId + threadInWarp =
    // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE - 1))
    int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
    sData[idx] = 0;
    idx += WARP_SIZE;
    T t = sData[idx] = val;  __EMUSYNC;

#ifdef __DEVICE_EMULATION__
    t = sData[idx -  1]; __EMUSYNC;
    sData[idx] += t;     __EMUSYNC;
    t = sData[idx -  2]; __EMUSYNC;
    sData[idx] += t;     __EMUSYNC;
    t = sData[idx -  4]; __EMUSYNC;
    sData[idx] += t;     __EMUSYNC;
    t = sData[idx -  8]; __EMUSYNC;
    sData[idx] += t;     __EMUSYNC;
    t = sData[idx - 16]; __EMUSYNC;
    sData[idx] += t;     __EMUSYNC;
#else
    if (0 <= maxlevel) { sData[idx] = t = t + sData[idx - 1]; } __EMUSYNC;
    if (1 <= maxlevel) { sData[idx] = t = t + sData[idx - 2]; } __EMUSYNC;
    if (2 <= maxlevel) { sData[idx] = t = t + sData[idx - 4]; } __EMUSYNC;
    if (3 <= maxlevel) { sData[idx] = t = t + sData[idx - 8]; } __EMUSYNC;
    if (4 <= maxlevel) { sData[idx] = t = t + sData[idx -16]; } __EMUSYNC;
#endif
    return sData[idx] - val;  // convert inclusive -> exclusive
}
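// Index-layout example for the padding trick above (WARP_SIZE == 32):
// thread 35 (warp 1, lane 3) computes idx = 2*35 - 3 = 67 = 64*1 + 3, zeroes
// slot 67, then after idx += WARP_SIZE writes its value to slot 99.  Each
// warp thus owns a 64-slot region whose first 32 slots hold zero padding,
// so reads such as sData[idx - 16] always land inside the warp's own region
// and never need a bounds check or branch.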
/**
 * @brief Scans 4*CTA_SIZE unsigned ints in a block
 *
 * scan4 scans 4*CTA_SIZE elements in a block (4 per
 * thread), using a warp-scan algorithm
 *
 * @param[in] idata 4-vector of integers to scan
 **/
__device__ uint4 scan4(uint4 idata)
{
    extern __shared__ uint ptr[];

    uint idx = threadIdx.x;

    uint4 val4 = idata;
    uint sum[3];
    sum[0] = val4.x;
    sum[1] = val4.y + sum[0];
    sum[2] = val4.z + sum[1];

    uint val = val4.w + sum[2];

    val = scanwarp<uint, 4>(val, ptr);
    __syncthreads();

    if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
    {
        ptr[idx >> 5] = val + val4.w + sum[2];
    }
    __syncthreads();

#ifndef __DEVICE_EMULATION__
    if (idx < WARP_SIZE)
#endif
    {
        ptr[idx] = scanwarp<uint, 2>(ptr[idx], ptr);
    }
    __syncthreads();

    val += ptr[idx >> 5];

    val4.x = val;
    val4.y = val + sum[0];
    val4.z = val + sum[1];
    val4.w = val + sum[2];

    return val4;
}

/**
 * @brief Computes output position for each thread given predicate; trues come first then falses
 *
 * Rank is the core of the radix sort loop.  Given a predicate, it
 * computes the output position for each thread in an ordering where all
 * True threads come first, followed by all False threads.
 * This version handles 4 predicates per thread; hence, "rank4".
 *
 * @param[in] preds true/false values for each of the 4 elements in this thread
 *
 * @todo is the description of "preds" correct?
 **/
template <int ctasize>
__device__ uint4 rank4(uint4 preds)
{
    uint4 address = scan4(preds);

    __shared__ uint numtrue;
    if (threadIdx.x == ctasize-1)
    {
        numtrue = address.w + preds.w;
    }
    __syncthreads();

    uint4 rank;
    uint idx = threadIdx.x << 2;
    rank.x = (preds.x) ? address.x : numtrue + idx     - address.x;
    rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
    rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
    rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;

    return rank;
}
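// Worked example for one 8-element block (2 threads, 4 elements each;
// numbers are illustrative only):
//   preds   = [1 0 1 1 | 0 1 0 0]    (1 = "true", i.e. radix bit is 0)
//   address = [0 1 1 2 | 3 3 4 4]    (exclusive scan of preds)
//   numtrue = last thread's address.w + preds.w = 4 + 0 = 4
// True elements take rank = address: 0, 1, 2, 3.  False element e takes
// rank = numtrue + e - address[e]: element 1 -> 4, 4 -> 5, 6 -> 6, 7 -> 7.
// All trues therefore precede all falses, and the ordering is stable.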
/**
 * @brief Sorts one block
 *
 * Uses rank to sort one bit at a time: sorts a block according
 * to bits startbit -> nbits + startbit
 * @param[in,out] key
 * @param[in,out] value
 **/
template<uint nbits, uint startbit>
__device__ void radixSortBlock(uint4 &key, uint4 &value)
{
    extern __shared__ uint sMem1[];
    for(uint shift = startbit; shift < (startbit + nbits); ++shift)
    {
        uint4 lsb;
        lsb.x = !((key.x >> shift) & 0x1);
        lsb.y = !((key.y >> shift) & 0x1);
        lsb.z = !((key.z >> shift) & 0x1);
        lsb.w = !((key.w >> shift) & 0x1);

        uint4 r = rank4<256>(lsb);

#if 1
        // This arithmetic strides the ranks across 4 SORT_CTA_SIZE regions
        sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x;
        sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y;
        sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z;
        sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w;
        __syncthreads();

        // The above allows us to read without 4-way bank conflicts:
        key.x = sMem1[threadIdx.x];
        key.y = sMem1[threadIdx.x +     SORT_CTA_SIZE];
        key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE];
        key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE];

        __syncthreads();

        sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = value.x;
        sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = value.y;
        sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = value.z;
        sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = value.w;
        __syncthreads();

        value.x = sMem1[threadIdx.x];
        value.y = sMem1[threadIdx.x +     SORT_CTA_SIZE];
        value.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE];
        value.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE];
#else
        sMem1[r.x] = key.x;
        sMem1[r.y] = key.y;
        sMem1[r.z] = key.z;
        sMem1[r.w] = key.w;
        __syncthreads();

        // This access has 4-way bank conflicts
        key = sMem[threadIdx.x];

        __syncthreads();

        sMem1[r.x] = value.x;
        sMem1[r.y] = value.y;
        sMem1[r.z] = value.z;
        sMem1[r.w] = value.w;
        __syncthreads();

        value = sMem[threadIdx.x];
#endif

        __syncthreads();
    }
}

/**
 * @brief Sorts one block. Key-only version.
 *
 * Uses rank to sort one bit at a time: sorts a block according
 * to bits startbit -> nbits + startbit
 * @param[in,out] key
 **/
template<uint nbits, uint startbit>
__device__ void radixSortBlockKeysOnly(uint4 &key)
{
    extern __shared__ uint sMem1[];
    for(uint shift = startbit; shift < (startbit + nbits); ++shift)
    {
        uint4 lsb;
        lsb.x = !((key.x >> shift) & 0x1);
        lsb.y = !((key.y >> shift) & 0x1);
        lsb.z = !((key.z >> shift) & 0x1);
        lsb.w = !((key.w >> shift) & 0x1);

        uint4 r = rank4<256>(lsb);

#if 1
        // This arithmetic strides the ranks across 4 CTA_SIZE regions
        sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x;
        sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y;
        sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z;
        sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w;
        __syncthreads();

        // The above allows us to read without 4-way bank conflicts:
        key.x = sMem1[threadIdx.x];
        key.y = sMem1[threadIdx.x +     SORT_CTA_SIZE];
        key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE];
        key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE];
#else
        sMem1[r.x] = key.x;
        sMem1[r.y] = key.y;
        sMem1[r.z] = key.z;
        sMem1[r.w] = key.w;
        __syncthreads();

        // This access has 4-way bank conflicts
        key = sMem[threadIdx.x];
#endif

        __syncthreads();
    }
}

/** @} */ // end radix sort functions
/** @} */ // end cudpp_cta

@@ -0,0 +1,619 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * scan_cta.cu
 *
 * @brief CUDPP CTA-level scan routines
 */

/** \defgroup cudpp_cta CUDPP CTA-Level API
 * The CUDPP CTA-Level API contains functions that run on the GPU
 * device.  These are CUDA \c __device__ functions that are called
 * from within other CUDA device functions (typically
 * \link cudpp_kernel CUDPP Kernel-Level API\endlink functions).
 * They are called CTA-level functions because they typically process
 * s_data "owned" by each CTA within shared memory, and are agnostic of
 * any other CTAs that may be running (or how many CTAs are running),
 * other than to compute appropriate global memory addresses.
 * @{
 */

/** @name Scan Functions
 * @{
 */

#include <cudpp_globals.h>
#include <cudpp_util.h>
#include <math.h>
#include <cudpp.h>

/**
 * @brief Macro to insert necessary __syncthreads() in device emulation mode
 */
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif

/**
 * @brief Template class containing compile-time parameters to the scan functions
 *
 * ScanTraits is passed as a template parameter to all scan functions.  By
 * using these compile-time functions we can enable generic code while
 * maintaining the highest performance.  This is crucial for the performance
 * of low-level workhorse algorithms like scan.
 *
 * @param T The datatype of the scan
 * @param oper The ::CUDPPOperator to use for the scan (add, max, etc.)
 * @param multiRow True if this is a multi-row scan
 * @param unroll True if scan inner loops should be unrolled
 * @param sums True if each block should write its sum to the d_blockSums array (false for single-block scans)
 * @param backward True if this is a backward scan
 * @param fullBlock True if all blocks in this scan are full (CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements)
 * @param exclusive True for exclusive scans, false for inclusive scans
 */
template <class T, CUDPPOperator oper, bool backward, bool exclusive,
          bool multiRow, bool sums, bool fullBlock>
class ScanTraits
{
public:

    //! Returns true if this is a backward scan
    static __device__ bool isBackward() { return backward; };
    //! Returns true if this is an exclusive scan
    static __device__ bool isExclusive() { return exclusive; };
    //! Returns true if this a multi-row scan.
    static __device__ bool isMultiRow() { return multiRow; };
    //! Returns true if this scan writes the sum of each block to the d_blockSums array (multi-block scans)
    static __device__ bool writeSums() { return sums; };
    //! Returns true if this is a full scan -- all blocks process CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements
    static __device__ bool isFullBlock() { return fullBlock; };


    //! The operator function used for the scan
    static __device__ T op(const T a, const T b)
    {
        return Operator<T, oper>::op(a, b);
    }

    //! The identity value used by the scan
    static __device__ T identity() { return Operator<T, oper>::identity(); }
};
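// Example instantiation (hypothetical, for illustration only): a forward,
// exclusive, single-row float sum-scan over full blocks that also writes
// per-block sums would be parameterized as
//   ScanTraits<float, CUDPP_ADD, false, true, false, true, true>
// matching the template order <T, oper, backward, exclusive, multiRow,
// sums, fullBlock> declared above.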
//! This is used to insert syncthreads to avoid perf loss caused by 128-bit
//! load overlap that happens on G80.  This gives about a 15% boost on scans on
//! G80.
//! @todo Parameterize this in case this perf detail changes on future GPUs.
#define DISALLOW_LOADSTORE_OVERLAP 1

/**
 * @brief Handles loading input s_data from global memory to shared memory
 * (vec4 version)
 *
 * Load a chunk of 8*blockDim.x elements from global memory into a
 * shared memory array.  Each thread loads two T4 elements (where
 * T4 is, e.g. int4 or float4), computes the scan of those two vec4s in
 * thread local arrays (in registers), and writes the two total sums of the
 * vec4s into shared memory, where they will be cooperatively scanned with
 * the other partial sums by all threads in the CTA.
 *
 * @param[out] s_out The output (shared) memory array
 * @param[out] threadScan0 Intermediate per-thread partial sums array 1
 * @param[out] threadScan1 Intermediate per-thread partial sums array 2
 * @param[in] d_in The input (device) memory array
 * @param[in] numElements The number of elements in the array being scanned
 * @param[in] iDataOffset the offset of the input array in global memory for this
 * thread block
 * @param[out] ai The shared memory address for the thread's first element
 * (returned for reuse)
 * @param[out] bi The shared memory address for the thread's second element
 * (returned for reuse)
 * @param[out] aiDev The device memory address for this thread's first element
 * (returned for reuse)
 * @param[out] biDev The device memory address for this thread's second element
 * (returned for reuse)
 */
template <class T, class traits>
__device__ void loadSharedChunkFromMem4(T        *s_out,
                                        T        threadScan0[4],
                                        T        threadScan1[4],
                                        const T  *d_in,
                                        int      numElements,
                                        int      iDataOffset,
                                        int      &ai,
                                        int      &bi,
                                        int      &aiDev,
                                        int      &biDev)
{
    int thid = threadIdx.x;
    aiDev = iDataOffset + thid;
    biDev = aiDev + blockDim.x;

    // convert to 4-vector
    typename typeToVector<T,4>::Result  tempData;
    typename typeToVector<T,4>::Result* inData = (typename typeToVector<T,4>::Result*)d_in;

    ai = thid;
    bi = thid + blockDim.x;

    // read into tempData;
    if (traits::isBackward())
    {
        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[aiDev];
            threadScan0[3] = tempData.w;
            threadScan0[2] = traits::op(tempData.z, threadScan0[3]);
            threadScan0[1] = traits::op(tempData.y, threadScan0[2]);
            threadScan0[0] = s_out[ai]
                           = traits::op(tempData.x, threadScan0[1]);
        }
        else
        {
            threadScan0[3] = traits::identity();
            threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[3]);
            threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[2]);
            threadScan0[0] = s_out[ai]
                           = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan0[1]);
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[biDev];
            threadScan1[3] = tempData.w;
            threadScan1[2] = traits::op(tempData.z, threadScan1[3]);
            threadScan1[1] = traits::op(tempData.y, threadScan1[2]);
            threadScan1[0] = s_out[bi]
                           = traits::op(tempData.x, threadScan1[1]);
        }
        else
        {
            threadScan1[3] = traits::identity();
            threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[3]);
            threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[2]);
            threadScan1[0] = s_out[bi]
                           = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan1[1]);
        }
        __syncthreads();

        // reverse s_data in shared memory
        if (ai < CTA_SIZE)
        {
            unsigned int leftIdx = ai;
            unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;

            if (leftIdx < rightIdx)
            {
                T tmp           = s_out[leftIdx];
                s_out[leftIdx]  = s_out[rightIdx];
                s_out[rightIdx] = tmp;
            }
        }
        __syncthreads();
    }
    else
    {
        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[aiDev];
            threadScan0[0] = tempData.x;
            threadScan0[1] = traits::op(tempData.y, threadScan0[0]);
            threadScan0[2] = traits::op(tempData.z, threadScan0[1]);
            threadScan0[3] = s_out[ai]
                           = traits::op(tempData.w, threadScan0[2]);
        }
        else
        {
            threadScan0[0] = (i < numElements) ? d_in[i] : traits::identity();
            threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[0]);
            threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[1]);
            threadScan0[3] = s_out[ai]
                           = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan0[2]);
        }


#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[biDev];
            threadScan1[0] = tempData.x;
            threadScan1[1] = traits::op(tempData.y, threadScan1[0]);
            threadScan1[2] = traits::op(tempData.z, threadScan1[1]);
            threadScan1[3] = s_out[bi]
                           = traits::op(tempData.w, threadScan1[2]);
        }
        else
        {
            threadScan1[0] = (i < numElements) ? d_in[i] : traits::identity();
            threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[0]);
            threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[1]);
            threadScan1[3] = s_out[bi]
                           = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan1[2]);
        }
        __syncthreads();
    }
}


/**
 * @brief Handles storing result s_data from shared memory to global memory
 * (vec4 version)
 *
 * Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory
 * into a device memory array.  Each thread reads two elements from shared
 * memory, adds them to the intermediate sums computed in
 * loadSharedChunkFromMem4(), and writes two T4 elements (where
 * T4 is, e.g. int4 or float4) to global memory.
 *
 * @param[out] d_out The output (device) memory array
 * @param[in] threadScan0 Intermediate per-thread partial sums array 1
 * (contents computed in loadSharedChunkFromMem4())
 * @param[in] threadScan1 Intermediate per-thread partial sums array 2
 * (contents computed in loadSharedChunkFromMem4())
 * @param[in] s_in The input (shared) memory array
 * @param[in] numElements The number of elements in the array being scanned
 * @param[in] oDataOffset the offset of the output array in global memory
 * for this thread block
 * @param[in] ai The shared memory address for the thread's first element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] bi The shared memory address for the thread's second element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] aiDev The device memory address for this thread's first element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] biDev The device memory address for this thread's second element
 * (computed in loadSharedChunkFromMem4())
 */
template <class T, class traits>
__device__ void storeSharedChunkToMem4(T   *d_out,
                                       T   threadScan0[4],
                                       T   threadScan1[4],
                                       T   *s_in,
                                       int numElements,
                                       int oDataOffset,
                                       int ai,
                                       int bi,
                                       int aiDev,
                                       int biDev)
{
    // Convert to 4-vector
    typename typeToVector<T,4>::Result tempData;
    typename typeToVector<T,4>::Result* outData = (typename typeToVector<T,4>::Result*)d_out;

    // write results to global memory
    if (traits::isBackward())
    {
        if (ai < CTA_SIZE)
        {

            unsigned int leftIdx = ai;
            unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;

            if (leftIdx < rightIdx)
            {
                T tmp          = s_in[leftIdx];
                s_in[leftIdx]  = s_in[rightIdx];
                s_in[rightIdx] = tmp;
            }
        }
        __syncthreads();

        T temp = s_in[ai];

        if (traits::isExclusive())
        {
            tempData.w = temp;
            tempData.z = traits::op(temp, threadScan0[3]);
            tempData.y = traits::op(temp, threadScan0[2]);
            tempData.x = traits::op(temp, threadScan0[1]);
        }
        else
        {
            tempData.w = traits::op(temp, threadScan0[3]);
            tempData.z = traits::op(temp, threadScan0[2]);
            tempData.y = traits::op(temp, threadScan0[1]);
            tempData.x = traits::op(temp, threadScan0[0]);
        }

        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[aiDev] = tempData;
        }
        else
        {
            if (i < numElements) { d_out[i] = tempData.x;
            if (i+1 < numElements) { d_out[i+1] = tempData.y;
            if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        temp = s_in[bi];

        if (traits::isExclusive())
        {
            tempData.w = temp;
            tempData.z = traits::op(temp, threadScan1[3]);
            tempData.y = traits::op(temp, threadScan1[2]);
            tempData.x = traits::op(temp, threadScan1[1]);
        }
        else
        {
            tempData.w = traits::op(temp, threadScan1[3]);
            tempData.z = traits::op(temp, threadScan1[2]);
            tempData.y = traits::op(temp, threadScan1[1]);
            tempData.x = traits::op(temp, threadScan1[0]);
        }

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[biDev] = tempData;
        }
        else
        {
            if (i < numElements) { d_out[i] = tempData.x;
            if (i+1 < numElements) { d_out[i+1] = tempData.y;
            if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
        }
    }
    else
    {
        T temp;
        temp = s_in[ai];

        if (traits::isExclusive())
        {
            tempData.x = temp;
            tempData.y = traits::op(temp, threadScan0[0]);
            tempData.z = traits::op(temp, threadScan0[1]);
            tempData.w = traits::op(temp, threadScan0[2]);
        }
        else
        {
            tempData.x = traits::op(temp, threadScan0[0]);
            tempData.y = traits::op(temp, threadScan0[1]);
            tempData.z = traits::op(temp, threadScan0[2]);
            tempData.w = traits::op(temp, threadScan0[3]);
        }

        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[aiDev] = tempData;
        }
        else
        {
            // we can't use vec4 because the original array isn't a multiple of
            // 4 elements
            if ( i    < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        temp = s_in[bi];

        if (traits::isExclusive())
        {
            tempData.x = temp;
            tempData.y = traits::op(temp, threadScan1[0]);
            tempData.z = traits::op(temp, threadScan1[1]);
            tempData.w = traits::op(temp, threadScan1[2]);
        }
        else
        {
            tempData.x = traits::op(temp, threadScan1[0]);
            tempData.y = traits::op(temp, threadScan1[1]);
            tempData.z = traits::op(temp, threadScan1[2]);
            tempData.w = traits::op(temp, threadScan1[3]);
        }

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[biDev] = tempData;
        }
        else
        {
            // we can't use vec4 because the original array isn't a multiple of
            // 4 elements
            if ( i    < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }
    }
}

/** @brief Scan all warps of a CTA without synchronization
 *
 * The warp-scan algorithm breaks a block of data into warp-sized chunks, and
 * scans the chunks independently with a warp of threads each.  Because warps
 * execute instructions in SIMD fashion, there is no need to synchronize in
 * order to share data within a warp (only across warps).  Also, in SIMD the
 * most efficient algorithm is a step-efficient algorithm.  Therefore, within
 * each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps
 * to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the
 * work-efficient tree-based algorithm described by Guy Blelloch [1990] that
 * takes 2 * log(N) steps and is in general more complex to implement.
 * Previous versions of CUDPP used the Blelloch algorithm.  For current GPUs,
 * the warp size is 32, so this takes five steps per warp.
 *
 * Each thread is responsible for a single element of the array to be scanned.
 * Each thread inputs a single value to the scan via \a val and returns
 * its own scanned result element.  The threads of each warp cooperate
 * via the shared memory array \a s_data to scan WARP_SIZE elements.
 *
 * Template parameter \a maxlevel allows this warpscan to be performed on
 * partial warps.  For example, if only the first 8 elements of each warp need
 * to be scanned, then warpscan only performs log2(8)=3 steps rather than 5.
 *
 * The computation uses 2 * WARP_SIZE elements of shared memory per warp to
 * enable warps to offset beyond their input data and receive the identity
 * element without using any branch instructions.
 *
 * \note s_data is declared volatile here to prevent the compiler from
 * optimizing away writes to shared memory, and ensure correct intrawarp
 * communication in the absence of __syncthreads.
 *
 * @return The result of the warp scan for the current thread
 * @param[in] val The current thread's input to the scan
 * @param[in,out] s_data A pointer to a temporary shared array of 2*CTA_SIZE
 * elements used to compute the warp scans
 */
template<class T, class traits, int maxlevel>
__device__ T warpscan(T val, volatile T* s_data)
{
    // The following is the same as 2 * 32 * warpId + threadInWarp =
    // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE-1))
    int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE-1));
    s_data[idx] = traits::identity();
    idx += WARP_SIZE;
    T t = s_data[idx] = val;  __EMUSYNC;

    // This code is needed because the warp size of device emulation
    // is only 1 thread, so sync-less cooperation within a warp doesn't
    // work.
#ifdef __DEVICE_EMULATION__
    t = s_data[idx -  1]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t);  __EMUSYNC;
    t = s_data[idx -  2]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t);  __EMUSYNC;
    t = s_data[idx -  4]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t);  __EMUSYNC;
    t = s_data[idx -  8]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t);  __EMUSYNC;
    t = s_data[idx - 16]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t);  __EMUSYNC;
#else
    if (0 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 1]); }
    if (1 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 2]); }
    if (2 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 4]); }
    if (3 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 8]); }
    if (4 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -16]); }
#endif

    return s_data[idx-1];  // convert inclusive -> exclusive
}
/** @brief Perform a full CTA scan using the warp-scan algorithm
 *
 * As described in the comment for warpscan(), the warp-scan algorithm breaks
 * a block of data into warp-sized chunks, and scans the chunks independently
 * with a warp of threads each.  To complete the scan, each warp <i>j</i> then
 * writes its last element to element <i>j</i> of a temporary shared array.
 * Then a single warp exclusive-scans these "warp sums".  Finally, each thread
 * adds the result of the warp sum scan to the result of the scan from the
 * first pass.
 *
 * Because we scan 2*CTA_SIZE elements per CTA (two per thread), we have to
 * call warpscan twice.
 *
 * @param x The first input value for the current thread
 * @param y The second input value for the current thread
 * @param s_data Temporary shared memory space of 2*CTA_SIZE elements for
 * performing the scan
 */
template <class T, class traits>
__device__ void scanWarps(T x, T y,
                          T *s_data)
{
    T val  = warpscan<T, traits, 4>(x, s_data);
    __syncthreads();
    T val2 = warpscan<T, traits, 4>(y, s_data);

    int idx = threadIdx.x;

    if ((idx & 31)==31)
    {
        s_data[idx >> 5]                = traits::op(val, x);
        s_data[(idx + blockDim.x) >> 5] = traits::op(val2, y);
    }
    __syncthreads();

#ifndef __DEVICE_EMULATION__
    if (idx < 32)
#endif
    {
        s_data[idx] = warpscan<T,traits,(LOG_CTA_SIZE-LOG_WARP_SIZE+1)>(s_data[idx], s_data);
    }
    __syncthreads();

    val  = traits::op(val, s_data[idx >> 5]);

    val2 = traits::op(val2, s_data[(idx + blockDim.x) >> 5]);

    __syncthreads();

    s_data[idx] = val;
    s_data[idx+blockDim.x] = val2;
}
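// Illustrative trace of the two-level flow above, for CTA_SIZE = 128:
// each thread warp-scans x and then y; the last lane of each warp (lane 31)
// deposits its inclusive warp total (op(val, x)) into s_data[0..3] for the
// x pass and s_data[4..7] for the y pass; a single warp then scans those
// eight warp sums; finally every thread folds the scanned warp sum for its
// warp (s_data[idx >> 5]) back into its own partial result.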
/**
 * @brief CTA-level scan routine; scans s_data in shared memory in each thread block
 *
 * This function is the main CTA-level scan function.  It may be called by other
 * CUDA __global__ or __device__ functions.  This function scans 2 * CTA_SIZE elements.
 * Each thread is responsible for one element in each half of the input array.
 * \note This code is intended to be run on a CTA of 128 threads.  Other sizes are
 * untested.
 *
 * @param[in] s_data The array to be scanned in shared memory
 * @param[out] d_blockSums Array of per-block sums
 * @param[in] blockSumIndex Location in \a d_blockSums to which to write this block's sum
 */
template <class T, class traits>
__device__ void scanCTA(T            *s_data,
                        T            *d_blockSums,
                        unsigned int blockSumIndex)
{
    T val  = s_data[threadIdx.x];
    T val2 = s_data[threadIdx.x + blockDim.x];
    __syncthreads();

    scanWarps<T,traits>(val, val2, s_data);
    __syncthreads();

    if (traits::writeSums() && threadIdx.x == blockDim.x - 1)
    {
        d_blockSums[blockSumIndex] = traits::op(val2, s_data[threadIdx.x + blockDim.x]);
    }


#ifdef __DEVICE_EMULATION__
    // must sync in emulation mode when doing backward scans, because otherwise the
    // shared memory array will get reversed before the block sums are read!
    if (traits::isBackward())
        __syncthreads();
#endif
}


/** @} */ // end scan functions
/** @} */ // end cudpp_cta

@@ -0,0 +1,417 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * cudpp.cpp
 *
 * @brief Main library source file.  Implements wrappers for public
 * interface.
 *
 * Main library source file.  Implements wrappers for public
 * interface.  These wrappers call application-level operators.
 * As this grows we may decide to partition into multiple source
 * files.
 */

/**
 * \defgroup publicInterface CUDPP Public Interface
 * The CUDA public interface comprises the functions, structs, and enums
 * defined in cudpp.h.  Public interface functions call functions in the
 * \link cudpp_app Application-Level\endlink interface.  The public
 * interface functions include Plan Interface functions and Algorithm
 * Interface functions.  Plan Interface functions are used for creating
 * CUDPP Plan objects which contain configuration details, intermediate
 * storage space, and in the case of cudppSparseMatrix(), data.  The
 * Algorithm Interface is the set of functions that do the real work
 * of CUDPP, such as cudppScan() and cudppSparseMatrixVectorMultiply().
 *
 * @{
 */

/** @name Algorithm Interface
 * @{
 */

#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"
//#include "cudpp_rand.h"

/**
 * @brief Performs a scan operation of numElements on its input in
 * GPU memory (d_in) and places the output in GPU memory
 * (d_out), with the scan parameters specified in the plan pointed to by
 * planHandle.
 *
 * The input to a scan operation is an input array, a binary associative
 * operator (like + or max), and an identity element for that operator
 * (+'s identity is 0).  The output of scan is the same size as its input.
 * Informally, the output at each element is the result of operator
 * applied to each input that comes before it.  For instance, the
 * output of sum-scan at each element is the sum of all the input
 * elements before that input.
 *
 * More formally, for associative operator
 * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly,
 * <var>out<sub>i</sub></var> = <var>in<sub>0</sub></var>
 * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly
 * <var>in<sub>1</sub></var>
 * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly ...
 * @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly
 * <var>in<sub>i-1</sub></var>.
 *
 * CUDPP supports "exclusive" and "inclusive" scans.  For the ADD operator,
 * an exclusive scan computes the sum of all input elements before the
 * current element, while an inclusive scan computes the sum of all input
 * elements up to and including the current element.
 *
 * Before calling scan, create an internal plan using cudppPlan().
 *
 * After you are finished with the scan plan, clean up with cudppDestroyPlan().
 *
 * @param[in] planHandle Handle to plan for this scan
 * @param[out] d_out output of scan, in GPU memory
 * @param[in] d_in input to scan, in GPU memory
 * @param[in] numElements number of elements to scan
 *
 * @see cudppPlan, cudppDestroyPlan
 */
CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
                      void        *d_out,
                      const void  *d_in,
                      size_t      numElements)
{
    CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != NULL)
    {
        cudppScanDispatch(d_out, d_in, numElements, 1, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}
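/* A quick worked example of the operator semantics documented above
   (illustrative values): for input [3 1 7 0 4] under the ADD operator,
     exclusive scan -> [0 3 4 11 11]
     inclusive scan -> [3 4 11 11 15]
   The exclusive result at i is the sum of all inputs before i; the
   inclusive result also folds in the element at i. */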
|
||||
|
||||
/**
|
||||
* @brief Performs a segmented scan operation of numElements on its input in
|
||||
* GPU memory (d_idata) and places the output in GPU memory
|
||||
* (d_out), with the scan parameters specified in the plan pointed to by
|
||||
* planHandle.
|
||||
|
||||
* The input to a segmented scan operation is an input array of data,
|
||||
* an input array of flags which demarcate segments, a binary associative
|
||||
* operator (like + or max), and an identity element for that operator
|
||||
* (+'s identity is 0). The array of flags is the same length as the input
|
||||
* with 1 marking the the first element of a segment and 0 otherwise. The
|
||||
* output of segmented scan is the same size as its input. Informally, the
|
||||
* output at each element is the result of operator applied to each input
|
||||
* that comes before it in that segment. For instance, the output of
|
||||
* segmented sum-scan at each element is the sum of all the input elements
|
||||
* before that input in that segment.
|
||||
*
|
||||
* More formally, for associative operator
|
||||
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly,
|
||||
* <var>out<sub>i</sub></var> = <var>in<sub>k</sub></var>
|
||||
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly
|
||||
* <var>in<sub>k+1</sub></var>
|
||||
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly ...
|
||||
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly
|
||||
* <var>in<sub>i-1</sub></var>.
|
||||
* <i>k</i> is the index of the first element of the segment in which <i>i</i> lies
|
||||
*
|
||||
* We support both "exclusive" and "inclusive" variants. For a segmented sum-scan,
|
||||
* the exclusive variant computes the sum of all input elements before the
|
||||
* current element in that segment, while the inclusive variant computes the
|
||||
* sum of all input elements up to and including the current element, in
|
||||
* that segment.
|
||||
*
|
||||
* Before calling segmented scan, create an internal plan using cudppPlan().
|
||||
*
|
||||
* After you are finished with the scan plan, clean up with cudppDestroyPlan().
|
||||
* @param[in] planHandle Handle to plan for this scan
|
||||
* @param[out] d_out output of segmented scan, in GPU memory
|
||||
* @param[in] d_idata input data to segmented scan, in GPU memory
|
||||
* @param[in] d_iflags input flags to segmented scan, in GPU memory
|
||||
* @param[in] numElements number of elements to perform segmented scan on
|
||||
*
|
||||
* @see cudppPlan, cudppDestroyPlan
|
||||
|
||||
CUDPP_DLL
|
||||
CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle,
|
||||
void *d_out,
|
||||
const void *d_idata,
|
||||
const unsigned int *d_iflags,
|
||||
size_t numElements)
|
||||
{
|
||||
CUDPPSegmentedScanPlan *plan =
|
||||
(CUDPPSegmentedScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
|
||||
if (plan != NULL)
|
||||
{
|
||||
cudppSegmentedScanDispatch(d_out, d_idata, d_iflags, numElements, plan);
|
||||
return CUDPP_SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
|
||||
}
|
||||
}
|
||||
*/
|
||||
/**
|
||||
* @brief Performs numRows parallel scan operations of numElements
|
||||
* each on its input (d_in) and places the output in d_out,
|
||||
* with the scan parameters set by config. Exactly like cudppScan
|
||||
* except that it runs on multiple rows in parallel.
|
||||
*
|
||||
* Note that to achieve good performance with cudppMultiScan one should
|
||||
* allocate the device arrays passed to it so that all rows are aligned
|
||||
* to the correct boundaries for the architecture the app is running on.
|
||||
* The easy way to do this is to use cudaMallocPitch() to allocate a
|
||||
* 2D array on the device. Use the \a rowPitch parameter to cudppPlan()
|
||||
* to specify this pitch. The easiest way is to pass the device pitch
|
||||
* returned by cudaMallocPitch to cudppPlan() via \a rowPitch.
|
||||
*
|
||||
* @param[in] planHandle handle to CUDPPScanPlan
|
||||
* @param[out] d_out output of scan, in GPU memory
|
||||
* @param[in] d_in input to scan, in GPU memory
|
||||
* @param[in] numElements number of elements (per row) to scan
|
||||
* @param[in] numRows number of rows to scan in parallel
|
||||
*
|
||||
* @see cudppScan, cudppPlan
|
||||

CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
                           void        *d_out,
                           const void  *d_in,
                           size_t      numElements,
                           size_t      numRows)
{
    CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != NULL)
    {
        cudppScanDispatch(d_out, d_in, numElements, numRows, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}
*/
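
// Illustrative sketch (not part of the original source): allocating pitched
// rows for cudppMultiScan(). cudaMallocPitch() returns a pitch in bytes,
// while cudppPlan() expects \a rowPitch in elements; the names below are
// hypothetical.
//
//     float *d_data;
//     size_t pitchBytes;
//     cudaMallocPitch((void**)&d_data, &pitchBytes,
//                     numElements * sizeof(float), numRows);
//     CUDPPHandle plan;
//     cudppPlan(&plan, config, numElements, numRows,
//               pitchBytes / sizeof(float));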

/**
 * @brief Given an array \a d_in and an array of 1/0 flags in \a
 * d_isValid, returns a compacted array in \a d_out containing only
 * the "valid" values from \a d_in.
 *
 * Takes as input an array of elements in GPU memory
 * (\a d_in) and an equal-sized unsigned int array in GPU memory
 * (\a d_isValid) that indicates which of those input elements are
 * valid. The output is a packed array, in GPU memory, of only those
 * elements marked as valid.
 *
 * Internally, uses cudppScan.
 *
 * Example:
 * \code
 * d_in      = [ a b c d e f ]
 * d_isValid = [ 1 0 1 1 0 1 ]
 * d_out     = [ a c d f ]
 * \endcode
 *
 * @todo [MJH] We need to evaluate whether cudppCompact should be a core member
 * of the public interface. It's not clear to me that what the user always
 * wants is a final compacted array. Often one just wants the array of indices
 * to which each input element should go in the output. The split() routine used
 * in radix sort might make more sense to expose.
 *
 * @param[in] planHandle handle to CUDPPCompactPlan
 * @param[out] d_out compacted output
 * @param[out] d_numValidElements set during cudppCompact to the number of
 * elements with valid flags set in the d_isValid input array
 * @param[in] d_in input to compact
 * @param[in] d_isValid which elements in d_in are valid
 * @param[in] numElements number of elements in d_in

CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle        planHandle,
                         void               *d_out,
                         size_t             *d_numValidElements,
                         const void         *d_in,
                         const unsigned int *d_isValid,
                         size_t             numElements)
{
    CUDPPCompactPlan *plan = (CUDPPCompactPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != NULL)
    {
        cudppCompactDispatch(d_out, d_numValidElements, d_in, d_isValid,
                             numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
    }
}
*/
/**
 * @brief Sorts key-value pairs or keys only
 *
 * Takes as input an array of keys in GPU memory
 * (d_keys) and an optional array of corresponding values,
 * and outputs sorted arrays of keys and (optionally) values in place.
 * Key-value and key-only sort is selected through the configuration of
 * the plan, using the options CUDPP_OPTION_KEYS_ONLY and
 * CUDPP_OPTION_KEY_VALUE_PAIRS.
 *
 * Supported key types are CUDPP_FLOAT and CUDPP_UINT. Values can be
 * any 32-bit type (internally, values are treated only as a payload
 * and cast to unsigned int).
 *
 * @todo Determine if we need to provide an "out of place" sort interface.
 *
 * @param[in] planHandle handle to CUDPPSortPlan
 * @param[out] d_keys keys by which key-value pairs will be sorted
 * @param[in] d_values values to be sorted
 * @param[in] keyBits the number of least significant bits in each element
 * of d_keys to sort by
 * @param[in] numElements number of elements in d_keys and d_values
 *
 * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
 */
CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
                      void        *d_keys,
                      void        *d_values,
                      int         keyBits,
                      size_t      numElements)
{
    CUDPPRadixSortPlan *plan = (CUDPPRadixSortPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != NULL)
    {
        cudppRadixSortDispatch(d_keys, d_values, numElements, keyBits, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
    }
}
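
// Usage sketch (illustrative, not part of the original source; the device
// pointers and sizes are assumed to exist). The operator field is not
// meaningful for sorting; CUDPP_ADD below is only a placeholder:
//
//     CUDPPConfiguration cfg = { CUDPP_SORT_RADIX, CUDPP_ADD, CUDPP_UINT,
//                                CUDPP_OPTION_KEY_VALUE_PAIRS };
//     CUDPPHandle sortPlan;
//     cudppPlan(&sortPlan, cfg, maxElements, 1, 0);
//     cudppSort(sortPlan, d_keys, d_values, 32, numElements);
//     cudppDestroyPlan(sortPlan);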

/** @brief Perform matrix-vector multiply y = A*x for arbitrary sparse matrix A and vector x
 *
 * Given a matrix object handle (which has been initialized using cudppSparseMatrix()),
 * this function multiplies the input vector \a d_x by the matrix referred to by
 * \a sparseMatrixHandle, returning the result in \a d_y.
 *
 * @param sparseMatrixHandle Handle to a sparse matrix object created with cudppSparseMatrix()
 * @param d_y The output vector, y
 * @param d_x The input vector, x
 *
 * @see cudppSparseMatrix, cudppDestroySparseMatrix

CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
                                            void        *d_y,
                                            const void  *d_x)
{
    CUDPPSparseMatrixVectorMultiplyPlan *plan =
        (CUDPPSparseMatrixVectorMultiplyPlan*)CUDPPPlanManager::GetPlan(sparseMatrixHandle);

    if (plan != NULL)
    {
        cudppSparseMatrixVectorMultiplyDispatch(d_y, d_x, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
    }
}
*/
/**
 * @brief Rand puts \a numElements random 32-bit elements into \a d_out
 *
 * Outputs \a numElements random values to \a d_out. \a d_out must be of
 * type unsigned int, allocated in device memory.
 *
 * The algorithm used for the random number generation is stored in \a planHandle.
 * Depending on the specification of the pseudo-random number generator (PRNG),
 * the generator may have one or more seeds. To set the seed, use cudppRandSeed().
 *
 * @todo Currently only MD5 PRNG is supported. We may provide more rand routines in
 * the future.
 *
 * @param[in] planHandle Handle to plan for rand
 * @param[out] d_out output of rand, in GPU memory. Should be an array of unsigned integers.
 * @param[in] numElements number of elements in d_out.
 *
 * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm

CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle, void *d_out, size_t numElements)
{
    CUDPPRandPlan *plan = (CUDPPRandPlan*)CUDPPPlanManager::GetPlan(planHandle);
    if (plan != NULL)
    {
        // dispatch the rand algorithm here
        cudppRandDispatch(d_out, numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
*/

/** @brief Sets the seed used for rand
 *
 * The seed is crucial to any random number generator as it allows a
 * sequence of random numbers to be replicated. Since there may be
 * multiple different rand algorithms in CUDPP, cudppRandSeed
 * uses \a planHandle to determine which seed to set. Each rand
 * algorithm has its own unique set of seeds depending on what
 * the algorithm needs.
 *
 * @param[in] planHandle the handle to the plan which specifies which rand seed to set
 * @param[in] seed the value to which the internal CUDPP seed will be set

CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed)
{
    CUDPPRandPlan *plan = (CUDPPRandPlan*)CUDPPPlanManager::GetPlan(planHandle);
    // switch on the plan to figure out which seed to update
    switch (plan->m_config.algorithm)
    {
    case CUDPP_RAND_MD5:
        plan->m_seed = seed;
        break;
    default:
        break;
    }

    return CUDPP_SUCCESS;
} // end cudppRandSeed
*/
/** @} */ // end Algorithm Interface
/** @} */ // end of publicInterface group

// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
@@ -0,0 +1,525 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * cudpp.h
 *
 * @brief Main library header file. Defines public interface.
 *
 * The CUDPP public interface is a C-only interface to enable
 * linking with code written in other languages (e.g. C, C++,
 * and Fortran). While the internals of CUDPP are not limited
 * to C (C++ features are used), the public interface is
 * entirely C (thus it is declared extern "C").
 */

/**
 * \mainpage
 *
 * \section introduction Introduction
 *
 * CUDPP is the CUDA Data Parallel Primitives Library. CUDPP is a
 * library of data-parallel algorithm primitives such as
 * parallel-prefix-sum ("scan"), parallel sort, and parallel reduction.
 * Primitives such as these are important building blocks for a wide
 * variety of data-parallel algorithms, including sorting, stream
 * compaction, and building data structures such as trees and
 * summed-area tables.
 *
 * \section overview Overview Presentation
 *
 * A brief set of slides that describe the features, design principles,
 * applications and impact of CUDPP is available here:
 * <a href="http://cudpp.googlecode.com/svn/trunk/cudpp/doc/CUDPP_slides.pdf">CUDPP Presentation</a>.
 *
 * \section homepage Homepage
 * Homepage for CUDPP: http://code.google.com/p/cudpp
 *
 * Announcements and discussion of CUDPP are hosted on the
 * <a href="http://groups.google.com/group/cudpp?hl=en">CUDPP Google Group</a>.
 *
 * \section getting-started Getting Started with CUDPP
 *
 * You may want to start by browsing the \link publicInterface CUDPP Public
 * Interface\endlink. For information on building CUDPP, see
 * \ref building-cudpp "Building CUDPP".
 *
 * The "apps" subdirectory included with CUDPP has a few source code samples
 * that use CUDPP:
 * - \ref example_simpleCUDPP "simpleCUDPP", a simple example of using
 *   cudppScan()
 * - satGL, an example of using cudppMultiScan() to generate a summed-area
 *   table (SAT) of a scene rendered in real time. The SAT is then used to simulate
 *   depth of field blur.
 * - cudpp_testrig, a comprehensive test application for all the functionality
 *   of CUDPP
 *
 * We have also provided a code walkthrough of the
 * \ref example_simpleCUDPP "simpleCUDPP" example.
 *
 * \section getting-help Getting Help and Reporting Problems
 *
 * To get help using CUDPP, please use the
 * <a href="http://groups.google.com/group/cudpp?hl=en">CUDPP Google Group</a>.
 *
 * To report CUDPP bugs or request features, you may use either the above
 * CUDPP Google Group, or you can file an issue directly using
 * <a href="http://code.google.com/p/cudpp/issues/list">Google Code</a>.
 *
 * \section release-notes Release Notes
 *
 * For specific release details see the \ref changelog "Change Log".
 *
 * This release (1.1.1) is a bugfix release to CUDPP 1.1 that includes
 * fixes to support CUDA 3.0 and the new NVIDIA Fermi architecture,
 * including GeForce 400 series and Tesla 20 series GPUs. It also has
 * bug fixes for 64-bit OSes.
 *
 * \section opSys Operating System Support
 *
 * This release (1.1.1) has been thoroughly tested on the following OSes.
 * - Windows XP (32-bit) (CUDA 2.2, 3.0)
 * - Windows 7 (64-bit) (CUDA 3.0)
 * - Redhat Enterprise Linux 5 (64-bit) (CUDA 3.0)
 * - Mac OS X 10.6 (Snow Leopard, 64-bit) (CUDA 3.0)
 *
 * We expect CUDPP to build and run correctly on other flavors of Linux
 * and Windows, but these are not actively tested by the developers at
 * this time.
 *
 * Notes: CUDPP is not compatible with CUDA 2.1; a bug in the 2.1
 * compiler causes it to crash when building CUDPP. Also, starting with
 * CUDPP 1.1.1, we are no longer testing CUDA device emulation, because
 * it is deprecated in CUDA 3.0 and will be removed from future CUDA
 * versions.
 *
 * \section cuda CUDA
 * CUDPP is implemented in
 * <a href="http://developer.nvidia.com/cuda">CUDA C/C++</a>. It requires the
 * CUDA Toolkit version 2.2 or later. Please see the NVIDIA
 * <a href="http://developer.nvidia.com/cuda">CUDA</a> homepage to download
 * CUDA as well as the CUDA Programming Guide and CUDA SDK, which includes many
 * CUDA code examples. Some of the samples in the CUDA SDK (including
 * "marchingCubes", "lineOfSight", and "radixSort") also use CUDPP.
 *
 * \section design-goals Design Goals
 * Design goals for CUDPP include:
 *
 * - Performance. We aim to provide best-of-class performance for our
 *   primitives. We welcome suggestions and contributions that will improve
 *   CUDPP performance. We also want to provide primitives that can be easily
 *   benchmarked, and compared against other implementations on GPUs and other
 *   processors.
 * - Modularity. We want our primitives to be easily included in other
 *   applications. To that end we have made the following design decisions:
 *   - CUDPP is provided as a library that other applications can link against.
 *   - CUDPP calls run on the GPU on GPU data. Thus they can be used
 *     as standalone calls on the GPU (on GPU data initialized by the
 *     calling application) and, more importantly, as GPU components in larger
 *     CPU/GPU applications.
 *   - CUDPP is implemented as 4 layers:
 *     -# The \link publicInterface Public Interface\endlink is the external
 *        library interface, which is the intended entry point for most
 *        applications. The public interface calls into the
 *        \link cudpp_app Application-Level API\endlink.
 *     -# The \link cudpp_app Application-Level API\endlink comprises functions
 *        callable from CPU code. These functions execute code jointly on the
 *        CPU (host) and the GPU by calling into the
 *        \link cudpp_kernel Kernel-Level API\endlink below them.
 *     -# The \link cudpp_kernel Kernel-Level API\endlink comprises functions
 *        that run entirely on the GPU across an entire grid of thread blocks.
 *        These functions may call into the \link cudpp_cta CTA-Level API\endlink
 *        below them.
 *     -# The \link cudpp_cta CTA-Level API\endlink comprises functions that run
 *        entirely on the GPU within a single Cooperative Thread Array (CTA,
 *        aka thread block). These are low-level functions that implement core
 *        data-parallel algorithms, typically by processing data within shared
 *        (CUDA \c __shared__) memory.
 *
 * Programmers may use any of the lower three CUDPP layers in their own
 * programs by building the source directly into their application. However,
 * the typical usage of CUDPP is to link to the library and invoke functions in
 * the CUDPP \link publicInterface Public Interface\endlink, as in the
 * \ref example_simpleCUDPP "simpleCUDPP", satGL, and cudpp_testrig application
 * examples included in the CUDPP distribution.
 *
 * In the future, if and when CUDA supports building device-level libraries, we
 * hope to enhance CUDPP to ease the use of CUDPP internal algorithms at all
 * levels.
 *
 * \subsection uses Use Cases
 * We expect the normal use of CUDPP will be in one of two ways:
 * -# Linking the CUDPP library against another application.
 * -# Running our "test" application, cudpp_testrig, that exercises
 *    CUDPP functionality.
 *
 * \section references References
 * The following publications describe work incorporated in CUDPP.
 *
 * - Mark Harris, Shubhabrata Sengupta, and John D. Owens. "Parallel Prefix Sum (Scan) with CUDA". In Hubert Nguyen, editor, <i>GPU Gems 3</i>, chapter 39, pages 851–876. Addison Wesley, August 2007. http://graphics.idav.ucdavis.edu/publications/print_pub?pub_id=916
 * - Shubhabrata Sengupta, Mark Harris, Yao Zhang, and John D. Owens. "Scan Primitives for GPU Computing". In <i>Graphics Hardware 2007</i>, pages 97–106, August 2007. http://graphics.idav.ucdavis.edu/publications/print_pub?pub_id=915
 * - Shubhabrata Sengupta, Mark Harris, and Michael Garland. "Efficient parallel scan algorithms for GPUs". NVIDIA Technical Report NVR-2008-003, December 2008. http://mgarland.org/papers.html#segscan-tr
 * - Nadathur Satish, Mark Harris, and Michael Garland. "Designing Efficient Sorting Algorithms for Manycore GPUs". In <i>Proceedings of the 23rd IEEE International Parallel & Distributed Processing Symposium</i>, May 2009. http://mgarland.org/papers.html#gpusort
 * - Stanley Tzeng, Li-Yi Wei. "Parallel White Noise Generation on a GPU via Cryptographic Hash". In <i>Proceedings of the 2008 Symposium on Interactive 3D Graphics and Games</i>, pages 79–87, February 2008. http://research.microsoft.com/apps/pubs/default.aspx?id=70502
 *
 * Many researchers are using CUDPP in their work, and there are many publications
 * that have used it \ref cudpp_refs "(references)". If your work uses CUDPP, please
 * let us know by sending us a reference (preferably in BibTeX format) to your work.
 *
 * \section citing Citing CUDPP
 *
 * If you make use of CUDPP primitives in your work and want to cite
 * CUDPP (thanks!), we would prefer for you to cite the appropriate
 * papers above, since they form the core of CUDPP. To be more specific,
 * the GPU Gems paper describes (unsegmented) scan, multi-scan for
 * summed-area tables, and stream compaction. The NVIDIA technical report
 * describes the current scan and segmented scan algorithms used in the
 * library, and the Graphics Hardware paper describes an earlier
 * implementation of segmented scan, quicksort, and sparse matrix-vector
 * multiply. The IPDPS paper describes the radix sort used in CUDPP, and
 * the I3D paper describes the random number generation algorithm.
 *
 * \section credits Credits
 * \subsection developers CUDPP Developers
 * - <a href="http://www.markmark.net">Mark Harris</a>, NVIDIA Corporation
 * - <a href="http://www.ece.ucdavis.edu/~jowens/">John D. Owens</a>, University of California, Davis
 * - <a href="http://graphics.cs.ucdavis.edu/~shubho/">Shubho Sengupta</a>, University of California, Davis
 * - Stanley Tzeng, University of California, Davis
 * - <a href="http://www.ece.ucdavis.edu/~yaozhang/">Yao Zhang</a>, University of California, Davis
 * - <a href="http://www.ece.ucdavis.edu/~aaldavid/">Andrew Davidson</a>, University of California, Davis (formerly Louisiana State University)
 *
 * \subsection contributors Other CUDPP Contributors
 * - <a href="http://www.eecs.berkeley.edu/~nrsatish/">Nadathur Satish</a>, University of California, Berkeley
 *
 * \subsection acknowledgments Acknowledgments
 *
 * Thanks to Jim Ahrens, Timo Aila, Nathan Bell, Ian Buck, Guy Blelloch,
 * Jeff Bolz, Michael Garland, Jeff Inman, Eric Lengyel, Samuli Laine,
 * David Luebke, Pat McCormick, and Richard Vuduc for their contributions
 * during the development of this library.
 *
 * CUDPP Developers from UC Davis thank their funding agencies:
 * - Department of Energy Early Career Principal Investigator Award
 *   DE-FG02-04ER25609
 * - SciDAC Institute for Ultrascale Visualization (http://www.iusv.org/)
 * - Los Alamos National Laboratory
 * - National Science Foundation (grant 0541448)
 * - Generous hardware donations from NVIDIA
 *
 * \section license-overview CUDPP Copyright and Software License
 * CUDPP is copyright The Regents of the University of California, Davis campus
 * and NVIDIA Corporation. The library, examples, and all source code are
 * released under the BSD license, designed to encourage reuse of this software
 * in other projects, both commercial and non-commercial. For details, please
 * see the \ref license page.
 *
 * Note that prior to release 1.1 of CUDPP, the license used was a modified
 * BSD license. With release 1.1, this license was replaced with the pure BSD
 * license to facilitate the use of open source hosting of the code.
 */

/**
 * @page license CUDPP License
 *
 * \section licenseBSD CUDPP License
 *
 * CUDPP is released under the
 * <a href="http://www.opensource.org/licenses/bsd-license.php">BSD license</a>.
 *
 * @include license.txt
 *
 */

/**
 * @page changelog CUDPP Change Log
 *
 * @include changelog.txt
 */

/**
 * @page cudpp_refs Publications that use CUDPP
 *
 * @htmlinclude doc/bib/cudpp_refs.html
 */

/**
 * @page cudpp_refs_bib Bibliography for publications that use CUDPP
 *
 * @htmlinclude doc/bib/cudpp_refs_bib.html
 */

/**
 * @page building-cudpp Building CUDPP
 *
 * CUDPP has currently been tested on Windows XP, Windows Vista, Mac OS X,
 * and Linux. See \ref release-notes for release-specific platform support.
 *
 * \section build-win32 Building CUDPP on Windows XP
 *
 * CUDPP can be built using either MSVC 8 (2005) or MSVC 9 (2008). To
 * build, open cudpp/cudpp.sln. Then you can build the library
 * using the "build" command as you would with any other workspace. There are
 * four configurations: debug, release, emudebug, and emurelease. The first
 * two are self-explanatory. The latter two are built to use CUDA device
 * emulation, meaning they will be run (slowly) on the CPU.
 *
 * \section build-linux Building CUDPP on Linux and Mac OS X
 *
 * CUDPP can be built using standard g++ and Make tools on Linux, by typing
 * "make" in the "cudpp/" subdirectory. Before building CUDPP, you should
 * first build the CUDA Utility Library (libcutil) by typing "make; make dbg=1"
 * in the "common/" subdirectory. This will generate libcutil.a and
 * libcutilD.a.
 *
 * The makefiles for CUDPP and all sample applications take the optional
 * arguments "emu=1" and "dbg=1". The former builds CUDPP for device emulation,
 * and the latter for debugging. The two flags can be combined. "verbose=1"
 * can be used to see all compiler output.
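 * For example, "make dbg=1 verbose=1" builds an unoptimized debug library
 * while echoing the full compiler command lines.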
 *
 * \section build-apps Building CUDPP Sample Applications
 *
 * The sample applications in the "apps/" subdirectory can be built exactly
 * like CUDPP is: either by opening the appropriate .sln/.vcproj file in MSVC
 * on Windows, or using "make" on Linux.
 *
 * On some Linux installations you will get linker errors relating to "-lXi"
 * and "-lXmu". To fix this, you will need to install libXi and libXmu. On
 * Debian and Ubuntu, for example, you can simply run
 * "sudo apt-get install libxi-dev libxmu-dev".
 *
 */

#ifndef __CUDPP_H__
#define __CUDPP_H__

#include <stdlib.h> // for size_t

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief CUDPP Result codes returned by CUDPP API functions.
 */
enum CUDPPResult
{
    CUDPP_SUCCESS = 0,                 /**< No error. */
    CUDPP_ERROR_INVALID_HANDLE,        /**< Specified handle (for example,
                                            to a plan) is invalid. */
    CUDPP_ERROR_ILLEGAL_CONFIGURATION, /**< Specified configuration is
                                            illegal. For example, an
                                            invalid or illogical
                                            combination of options. */
    CUDPP_ERROR_UNKNOWN = 9999         /**< Unknown or untraceable error. */
};

/**
 * @brief Options for configuring CUDPP algorithms.
 *
 * @see CUDPPConfiguration, cudppPlan, CUDPPAlgorithm
 */
enum CUDPPOption
{
    CUDPP_OPTION_FORWARD = 0x1,          /**< Algorithms operate forward:
                                          *   from start to end of input
                                          *   array */
    CUDPP_OPTION_BACKWARD = 0x2,         /**< Algorithms operate backward:
                                          *   from end to start of array */
    CUDPP_OPTION_EXCLUSIVE = 0x4,        /**< Exclusive (for scans) - scan
                                          *   includes all elements up to (but
                                          *   not including) the current
                                          *   element */
    CUDPP_OPTION_INCLUSIVE = 0x8,        /**< Inclusive (for scans) - scan
                                          *   includes all elements up to and
                                          *   including the current element */
    CUDPP_OPTION_CTA_LOCAL = 0x10,       /**< Algorithm performed only on
                                          *   the CTAs (blocks) with no
                                          *   communication between blocks.
                                          *   @todo Currently ignored. */
    CUDPP_OPTION_KEYS_ONLY = 0x20,       /**< No associated value to a key
                                          *   (for global radix sort) */
    CUDPP_OPTION_KEY_VALUE_PAIRS = 0x40, /**< Each key has an associated value */
};
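
/* Illustrative note (not part of the original header): these options are bit
 * flags meant to be combined with bitwise OR, e.g. a forward exclusive scan
 * sets CUDPPConfiguration::options to
 * (CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE). */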


/**
 * @brief Datatypes supported by CUDPP algorithms.
 *
 * @see CUDPPConfiguration, cudppPlan
 */
enum CUDPPDatatype
{
    CUDPP_CHAR,  //!< Character type (C char)
    CUDPP_UCHAR, //!< Unsigned character (byte) type (C unsigned char)
    CUDPP_INT,   //!< Integer type (C int)
    CUDPP_UINT,  //!< Unsigned integer type (C unsigned int)
    CUDPP_FLOAT  //!< Float type (C float)
};

/**
 * @brief Operators supported by CUDPP algorithms (currently scan and
 * segmented scan).
 *
 * These are all binary associative operators.
 *
 * @see CUDPPConfiguration, cudppPlan
 */
enum CUDPPOperator
{
    CUDPP_ADD,      //!< Addition of two operands
    CUDPP_MULTIPLY, //!< Multiplication of two operands
    CUDPP_MIN,      //!< Minimum of two operands
    CUDPP_MAX       //!< Maximum of two operands
};

/**
 * @brief Algorithms supported by CUDPP. Used to create appropriate plans using
 * cudppPlan.
 *
 * @see CUDPPConfiguration, cudppPlan
 */
enum CUDPPAlgorithm
{
    CUDPP_SCAN,              //!< Scan or prefix-sum
    CUDPP_SEGMENTED_SCAN,    //!< Segmented scan
    CUDPP_COMPACT,           //!< Stream compact
    CUDPP_REDUCE,            //!< Parallel reduction (NOTE: currently unimplemented)
    CUDPP_SORT_RADIX,        //!< Radix sort
    CUDPP_SPMVMULT,          //!< Sparse matrix-dense vector multiplication
    CUDPP_RAND_MD5,          //!< Pseudorandom number generator using the MD5 hash algorithm
    CUDPP_ALGORITHM_INVALID, //!< Placeholder at end of enum
};

/**
 * @brief Configuration struct used to specify algorithm, datatype,
 * operator, and options when creating a plan for CUDPP algorithms.
 *
 * @see cudppPlan
 */
struct CUDPPConfiguration
{
    CUDPPAlgorithm algorithm; //!< The algorithm to be used
    CUDPPOperator  op;        //!< The numerical operator to be applied
    CUDPPDatatype  datatype;  //!< The datatype of the input arrays
    unsigned int   options;   //!< Options to configure the algorithm
};

#define CUDPP_INVALID_HANDLE 0xC0DABAD1
typedef size_t CUDPPHandle;

/* To use CUDPP as a static library, #define CUDPP_STATIC_LIB before
 * including cudpp.h
 */
#define CUDPP_STATIC_LIB
#ifndef CUDPP_DLL
    #ifdef _WIN32
        #ifdef CUDPP_STATIC_LIB
            #define CUDPP_DLL
        #else
            #ifdef BUILD_DLL
                #define CUDPP_DLL __declspec(dllexport)
            #else
                #define CUDPP_DLL __declspec(dllimport)
            #endif
        #endif
    #else
        #define CUDPP_DLL
    #endif
#endif

// Plan allocation (for scan, sort, and compact)

CUDPP_DLL
CUDPPResult cudppPlan(CUDPPHandle        *planHandle,
                      CUDPPConfiguration config,
                      size_t             n,
                      size_t             rows,
                      size_t             rowPitch);

CUDPP_DLL
CUDPPResult cudppDestroyPlan(CUDPPHandle plan);

// Scan and sort algorithms

CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
                      void        *d_out,
                      const void  *d_in,
                      size_t      numElements);

CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
                           void        *d_out,
                           const void  *d_in,
                           size_t      numElements,
                           size_t      numRows);

CUDPP_DLL
CUDPPResult cudppSegmentedScan(CUDPPHandle        planHandle,
                               void               *d_out,
                               const void         *d_idata,
                               const unsigned int *d_iflags,
                               size_t             numElements);

CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle        planHandle,
                         void               *d_out,
                         size_t             *d_numValidElements,
                         const void         *d_in,
                         const unsigned int *d_isValid,
                         size_t             numElements);

CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
                      void        *d_keys,
                      void        *d_values,
                      int         keybits,
                      size_t      numElements);

// Sparse matrix allocation

CUDPP_DLL
CUDPPResult cudppSparseMatrix(CUDPPHandle        *sparseMatrixHandle,
                              CUDPPConfiguration config,
                              size_t             n,
                              size_t             rows,
                              const void         *A,
                              const unsigned int *h_rowIndices,
                              const unsigned int *h_indices);

CUDPP_DLL
CUDPPResult cudppDestroySparseMatrix(CUDPPHandle sparseMatrixHandle);

// Sparse matrix-vector algorithms

CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
                                            void        *d_y,
                                            const void  *d_x);

// random number generation algorithms
CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle, void *d_out, size_t numElements);

CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed);

#ifdef __cplusplus
}
#endif

#endif

// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
@@ -0,0 +1,66 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * cudpp_globals.h
 *
 * @brief Global declarations defining machine characteristics of GPU target.
 * These are currently set for best performance on G8X GPUs. The optimal
 * parameters may change on future GPUs. In the future, we hope to make
 * CUDPP a self-tuning library.
 */

#ifndef __CUDPP_GLOBALS_H__
#define __CUDPP_GLOBALS_H__

const int NUM_BANKS = 16;              /**< Number of shared memory banks */
const int LOG_NUM_BANKS = 4;           /**< log_2(NUM_BANKS) */
const int CTA_SIZE = 128;              /**< Number of threads in a CTA */
const int WARP_SIZE = 32;              /**< Number of threads in a warp */
const int LOG_CTA_SIZE = 7;            /**< log_2(CTA_SIZE) */
const int LOG_WARP_SIZE = 5;           /**< log_2(WARP_SIZE) */
const int LOG_SIZEOF_FLOAT = 2;        /**< log_2(sizeof(float)) */
const int SCAN_ELTS_PER_THREAD = 8;    /**< Number of elements per scan thread */
const int SEGSCAN_ELTS_PER_THREAD = 8; /**< Number of elements per segmented scan thread */

const int maxSharedMemoryPerBlock = 16384; /**< Number of bytes of shared
                                                memory in each block */
const int maxThreadsPerBlock = CTA_SIZE;   /**< Maximum number of
                                                threads in a CTA */

/**
 * @brief Macro to insert necessary __syncthreads() in device emulation mode
 */
#ifdef __DEVICE_EMULATION__
    #define __EMUSYNC __syncthreads()
#else
    #define __EMUSYNC
#endif


#define AVOID_BANK_CONFLICTS /**< Set if, by default, we want our
                              *   shared memory allocation to perform
                              *   additional computation to avoid bank
                              *   conflicts */

#ifdef AVOID_BANK_CONFLICTS
    #define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#else
    #define CONFLICT_FREE_OFFSET(index) (0)
#endif
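
// Illustrative sketch (not part of the original header): the offset pads
// shared-memory indices so that power-of-two strided accesses land in
// distinct banks. A kernel might allocate extra padding and index like so:
//
//     __shared__ float temp[CTA_SIZE + (CTA_SIZE >> LOG_NUM_BANKS)];
//     int ai = threadIdx.x;
//     temp[ai + CONFLICT_FREE_OFFSET(ai)] = g_idata[ai];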

#endif // __CUDPP_GLOBALS_H__

// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
@@ -0,0 +1,94 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_maximal_launch.h"

inline size_t min(size_t x, size_t y)
{
    return (x <= y) ? x : y;
}

inline size_t max(size_t x, size_t y)
{
    return (x >= y) ? x : y;
}

// computes the number of f-sized groups needed to cover x, i.e. ceil(x/f)
// (note: this returns the count of groups, not a multiple of f itself)
inline size_t multiple(size_t x, size_t f)
{
    return ((x + (f-1)) / f);
}

// MS Excel-style CEIL() function
// Rounds x up to nearest multiple of f
inline size_t ceiling(size_t x, size_t f)
{
    return multiple(x, f) * f;
}

extern "C"
|
||||
size_t maxBlocks(cudaFuncAttributes &attribs,
|
||||
cudaDeviceProp &devprop,
|
||||
size_t bytesDynamicSharedMem,
|
||||
size_t threadsPerBlock)
|
||||
{
|
||||
|
||||
// Determine the maximum number of CTAs that can be run simultaneously for each kernel
|
||||
// This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
|
||||
const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
|
||||
const unsigned int warpAllocationMultiple = 2;
|
||||
const unsigned int smemAllocationUnit = 512; // in bytes
|
||||
const unsigned int maxThreadsPerSM = (devprop.major < 2 && devprop.minor < 2) ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
|
||||
const unsigned int maxBlocksPerSM = 8;
|
||||
|
||||
// Number of warps (round up to nearest whole multiple of warp size)
|
||||
size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
|
||||
// Round up to warp allocation multiple
|
||||
numWarps = ceiling(numWarps, warpAllocationMultiple);
|
||||
|
||||
// Number of regs is regs per thread times number of warps times warp size
|
||||
size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
|
||||
// Round up to multiple of register allocation unit size
|
||||
regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);
|
||||
|
||||
size_t smemBytes = attribs.sharedSizeBytes + bytesDynamicSharedMem;
|
||||
size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);
|
||||
|
||||
size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
|
||||
size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
|
||||
size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;
|
||||
|
||||
return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
size_t maxBlocksFromPointer(void* kernel,
|
||||
size_t bytesDynamicSharedMem,
|
||||
size_t threadsPerBlock)
|
||||
{
|
||||
cudaDeviceProp devprop;
|
||||
int deviceID = -1;
|
||||
cudaError_t err = cudaGetDevice(&deviceID);
|
||||
if (err == cudaSuccess)
|
||||
{
|
||||
err = cudaGetDeviceProperties(&devprop, deviceID);
|
||||
if (err != cudaSuccess)
|
||||
return -1;
|
||||
|
||||
cudaFuncAttributes attr;
|
||||
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
|
||||
if (err != cudaSuccess)
|
||||
return -1;
|
||||
|
||||
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
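
// Usage sketch (illustrative; myKernel is a hypothetical __global__
// function): ask how many 128-thread CTAs of myKernel, using no dynamic
// shared memory, can be resident on the current device at once:
//
//     size_t nBlocks = maxBlocksFromPointer((void*)myKernel, 0, 128);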

@@ -0,0 +1,37 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef _MAXIMAL_LAUNCH_H_
#define _MAXIMAL_LAUNCH_H_

#include "cuda_runtime.h"

extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs,
                 cudaDeviceProp     &devprop,
                 size_t             bytesDynamicSharedMem,
                 size_t             threadsPerBlock);

extern "C"
size_t maxBlocksFromPointer(void*  kernel,
                            size_t bytesDynamicSharedMem,
                            size_t threadsPerBlock);

#ifdef __cplusplus

template <typename T>
size_t maxBlocks(T      kernel,
                 size_t bytesDynamicSharedMem,
                 size_t threadsPerBlock)
{
    return maxBlocksFromPointer((void*)kernel, bytesDynamicSharedMem, threadsPerBlock);
}
#endif

#endif // _MAXIMAL_LAUNCH_H_
@@ -0,0 +1,459 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date: 2007-11-19 13:58:06 +0000 (Mon, 19 Nov 2007) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"

#include <assert.h>

CUDPPPlanManager* CUDPPPlanManager::m_instance = NULL;

CUDPPResult validateOptions(CUDPPConfiguration config, size_t /*numElements*/, size_t numRows, size_t /*rowPitch*/)
{
    CUDPPResult ret = CUDPP_SUCCESS;
    if ((config.options & CUDPP_OPTION_BACKWARD) && (config.options & CUDPP_OPTION_FORWARD))
        ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION;
    if ((config.options & CUDPP_OPTION_EXCLUSIVE) && (config.options & CUDPP_OPTION_INCLUSIVE))
        ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION;

    if (config.algorithm == CUDPP_COMPACT && numRows > 1)
        ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION; //!< @todo: add support for multi-row cudppCompact

    return ret;
}

/** @addtogroup publicInterface
 * @{
 */

/** @name Plan Interface
 * @{
 */

/** @brief Create a CUDPP plan
 *
 * A plan is a data structure containing state and intermediate storage space
 * that CUDPP uses to execute algorithms on data. A plan is created by
 * passing to cudppPlan() a CUDPPConfiguration that specifies the algorithm,
 * operator, datatype, and options. The size of the data must also be passed
 * to cudppPlan(), in the \a numElements, \a numRows, and \a rowPitch
 * arguments. These sizes are used to allocate internal storage space at the
 * time the plan is created. The CUDPP planner may use the sizes, options,
 * and information about the present hardware to choose optimal settings.
 *
 * Note that \a numElements is the maximum size of the array to be processed
 * with this plan. That means that a plan may be re-used to process (for
 * example, to sort or scan) smaller arrays.
 *
 * @param[out] planHandle A pointer to an opaque handle to the internal plan
 * @param[in] config The configuration struct specifying algorithm and options
 * @param[in] numElements The maximum number of elements to be processed
 * @param[in] numRows The number of rows (for 2D operations) to be processed
 * @param[in] rowPitch The pitch of the rows of input data, in elements
 */
CUDPP_DLL
CUDPPResult cudppPlan(CUDPPHandle        *planHandle,
                      CUDPPConfiguration config,
                      size_t             numElements,
                      size_t             numRows,
                      size_t             rowPitch)
{
    CUDPPResult result = CUDPP_SUCCESS;

    CUDPPPlan *plan;

    result = validateOptions(config, numElements, numRows, rowPitch);
    if (result != CUDPP_SUCCESS)
    {
        *planHandle = CUDPP_INVALID_HANDLE;
        return result;
    }

    switch (config.algorithm)
    {
    case CUDPP_SCAN:
        {
            plan = new CUDPPScanPlan(config, numElements, numRows, rowPitch);
            break;
        }
//    case CUDPP_COMPACT:
//        {
//            plan = new CUDPPCompactPlan(config, numElements, numRows, rowPitch);
//            break;
//        }
    case CUDPP_SORT_RADIX:
    //case CUDPP_SORT_RADIX_GLOBAL:
        {
            plan = new CUDPPRadixSortPlan(config, numElements);
            break;
        }
/*  case CUDPP_SEGMENTED_SCAN:
        {
            plan = new CUDPPSegmentedScanPlan(config, numElements);
            break;
        }
    //new rand plan
    case CUDPP_RAND_MD5:
        {
            plan = new CUDPPRandPlan(config, numElements);
            break;
        }
    case CUDPP_REDUCE:*/
    default:
        //! @todo: implement cudppReduce()
        return CUDPP_ERROR_ILLEGAL_CONFIGURATION;
        break;
    }

    *planHandle = CUDPPPlanManager::AddPlan(plan);
    if (CUDPP_INVALID_HANDLE == *planHandle)
        return CUDPP_ERROR_UNKNOWN;
    else
        return CUDPP_SUCCESS;
}
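
// End-to-end sketch of the plan lifecycle (illustrative, not part of the
// original source; the sizes and device pointers below are assumed to
// exist):
//
//     CUDPPConfiguration cfg = { CUDPP_SCAN, CUDPP_ADD, CUDPP_FLOAT,
//                                CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE };
//     CUDPPHandle scanPlan;
//     if (cudppPlan(&scanPlan, cfg, maxElements, 1, 0) == CUDPP_SUCCESS)
//     {
//         cudppScan(scanPlan, d_out, d_in, numElements);
//         cudppDestroyPlan(scanPlan);
//     }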

/** @brief Destroy a CUDPP Plan
 *
 * Deletes the plan referred to by \a planHandle and all associated internal
 * storage.
 *
 * @param[in] planHandle The CUDPPHandle to the plan to be destroyed
 */
CUDPP_DLL
CUDPPResult cudppDestroyPlan(CUDPPHandle planHandle)
{
    if (CUDPPPlanManager::RemovePlan(planHandle) == false)
        return CUDPP_ERROR_INVALID_HANDLE;
    else
        return CUDPP_SUCCESS;
}

/** @brief Create a CUDPP Sparse Matrix Object
 *
 * The sparse matrix plan is a data structure containing state and intermediate storage space
 * that CUDPP uses to perform sparse matrix dense vector multiply. This plan is created by
 * passing to CUDPPSparseMatrixVectorMultiplyPlan() a CUDPPConfiguration that specifies the
 * algorithm (sparse matrix-dense vector multiply) and datatype, along with the sparse matrix
 * itself in CSR format. The number of non-zero elements in the sparse matrix must also be passed
 * as \a numNonZeroElements. This is used to allocate internal storage space at the time the
 * sparse matrix plan is created.
 *
 * @param[out] sparseMatrixHandle A pointer to an opaque handle to the sparse matrix object
 * @param[in] config The configuration struct specifying algorithm and options
 * @param[in] numNonZeroElements The number of non zero elements in the sparse matrix
 * @param[in] numRows This is the number of rows in y, x and A for y = A * x
 * @param[in] A The matrix data
 * @param[in] h_rowIndices An array containing the index of the start of each row in \a A
 * @param[in] h_indices An array containing the index of each nonzero element in \a A

CUDPP_DLL
CUDPPResult cudppSparseMatrix(CUDPPHandle        *sparseMatrixHandle,
                              CUDPPConfiguration config,
                              size_t             numNonZeroElements,
                              size_t             numRows,
                              const void         *A,
                              const unsigned int *h_rowIndices,
                              const unsigned int *h_indices)
{
    CUDPPResult result = CUDPP_SUCCESS;

    CUDPPPlan *sparseMatrix;

    if ((config.algorithm != CUDPP_SPMVMULT) ||
        (numNonZeroElements <= 0) || (numRows <= 0))
    {
        result = CUDPP_ERROR_ILLEGAL_CONFIGURATION;
    }

    if (result != CUDPP_SUCCESS)
    {
        *sparseMatrixHandle = CUDPP_INVALID_HANDLE;
        return result;
    }

    sparseMatrix =
        new CUDPPSparseMatrixVectorMultiplyPlan(config, numNonZeroElements, A,
                                                h_rowIndices, h_indices, numRows);

    *sparseMatrixHandle = CUDPPPlanManager::AddPlan(sparseMatrix);
    if (CUDPP_INVALID_HANDLE == *sparseMatrixHandle)
        return CUDPP_ERROR_UNKNOWN;
    else
        return CUDPP_SUCCESS;
}
*/
/** @brief Destroy a CUDPP Sparse Matrix Object
 *
 * Deletes the sparse matrix data and plan referred to by \a sparseMatrixHandle
 * and all associated internal storage.
 *
 * @param[in] sparseMatrixHandle The CUDPPHandle to the matrix object to be destroyed

CUDPP_DLL
CUDPPResult cudppDestroySparseMatrix(CUDPPHandle sparseMatrixHandle)
{
    return cudppDestroyPlan(sparseMatrixHandle);
}
*/
/** @} */ // end Plan Interface
/** @} */ // end publicInterface


/** @brief Plan base class constructor
 *
 * @param[in] config The configuration struct specifying algorithm and options
 * @param[in] numElements The maximum number of elements to be processed
 * @param[in] numRows The number of rows (for 2D operations) to be processed
 * @param[in] rowPitch The pitch of the rows of input data, in elements
 */
CUDPPPlan::CUDPPPlan(CUDPPConfiguration config,
                     size_t             numElements,
                     size_t             numRows,
                     size_t             rowPitch)
    : m_config(config),
      m_numElements(numElements),
      m_numRows(numRows),
      m_rowPitch(rowPitch)
{
}

/** @brief Scan Plan constructor
 *
 * @param[in] config The configuration struct specifying algorithm and options
 * @param[in] numElements The maximum number of elements to be scanned
 * @param[in] numRows The maximum number of rows (for 2D operations) to be scanned
 * @param[in] rowPitch The pitch of the rows of input data, in elements
 */
CUDPPScanPlan::CUDPPScanPlan(CUDPPConfiguration config,
                             size_t             numElements,
                             size_t             numRows,
                             size_t             rowPitch)
    : CUDPPPlan(config, numElements, numRows, rowPitch),
      m_blockSums(0),
      m_rowPitches(0),
      m_numEltsAllocated(0),
      m_numRowsAllocated(0),
      m_numLevelsAllocated(0)
{
    allocScanStorage(this);
}

/** @brief CUDPP scan plan destructor */
CUDPPScanPlan::~CUDPPScanPlan()
{
    freeScanStorage(this);
}

/** @brief SegmentedScan Plan constructor
 *
 * @param[in] config The configuration struct specifying options
 * @param[in] numElements The maximum number of elements to be scanned

CUDPPSegmentedScanPlan::CUDPPSegmentedScanPlan(CUDPPConfiguration config,
                                               size_t             numElements)
    : CUDPPPlan(config, numElements, 1, 0),
      m_blockSums(0),
      m_blockFlags(0),
      m_blockIndices(0),
      m_numEltsAllocated(0),
      m_numLevelsAllocated(0)
{
    allocSegmentedScanStorage(this);
}
*/
/** @brief SegmentedScan plan destructor
CUDPPSegmentedScanPlan::~CUDPPSegmentedScanPlan()
{
    freeSegmentedScanStorage(this);
}
*/
/** @brief Compact Plan constructor
 *
 * @param[in] config The configuration struct specifying options
 * @param[in] numElements The maximum number of elements to be compacted
 * @param[in] numRows The number of rows (for 2D operations) to be compacted
 * @param[in] rowPitch The pitch of the rows of input data, in elements

CUDPPCompactPlan::CUDPPCompactPlan(CUDPPConfiguration config,
                                   size_t             numElements,
                                   size_t             numRows,
                                   size_t             rowPitch)
    : CUDPPPlan(config, numElements, numRows, rowPitch),
      m_d_outputIndices(0)
{
    assert(numRows == 1); //!< @todo Add support for multirow compaction

    CUDPPConfiguration scanConfig =
    {
        CUDPP_SCAN,
        CUDPP_ADD,
        CUDPP_UINT,
        (config.options & CUDPP_OPTION_BACKWARD) ?
            CUDPP_OPTION_BACKWARD | CUDPP_OPTION_EXCLUSIVE :
            CUDPP_OPTION_FORWARD  | CUDPP_OPTION_EXCLUSIVE
    };
    m_scanPlan = new CUDPPScanPlan(scanConfig, numElements, numRows, rowPitch);

    allocCompactStorage(this);
}
*/
/** @brief Compact plan destructor
CUDPPCompactPlan::~CUDPPCompactPlan()
{
    delete m_scanPlan;
    freeCompactStorage(this);
}
*/
/** @brief Sort Plan constructor
 *
 * @param[in] config The configuration struct specifying algorithm and options
 * @param[in] numElements The maximum number of elements to be sorted
 */
/*CUDPPSortPlan::CUDPPSortPlan(CUDPPConfiguration config, size_t numElements)
    : CUDPPPlan(config, numElements, 1, 0),
      m_scanPlan(0),
      m_d_temp(0),
      m_d_tempAddress(0)
{
    CUDPPConfiguration scanConfig =
    {
        CUDPP_SCAN,
        CUDPP_ADD,
        CUDPP_UINT,
        CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE
    };

    //if (config.algorithm == CUDPP_SORT_RADIX_GLOBAL)
    {
        m_scanPlan = new CUDPPScanPlan(scanConfig, numElements, 1, 0);
    }

    allocSortStorage(this);
}*/

/** @brief Sort plan destructor */
/*CUDPPSortPlan::~CUDPPSortPlan()
{
    delete m_scanPlan;
    freeSortStorage(this);
}*/

CUDPPRadixSortPlan::CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements)
    : CUDPPPlan(config, numElements, 1, 0),
      m_scanPlan(0),
      m_tempKeys(0),
      m_tempValues(0),
      m_counters(0),
      m_countersSum(0),
      m_blockOffsets(0)
{
    size_t numBlocks2 = ((numElements % (SORT_CTA_SIZE * 2)) == 0) ?
        (numElements / (SORT_CTA_SIZE * 2)) : (numElements / (SORT_CTA_SIZE * 2) + 1);

    CUDPPConfiguration scanConfig =
    {
        CUDPP_SCAN,
        CUDPP_ADD,
        CUDPP_UINT,
        CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE
    };

    if (m_config.options == CUDPP_OPTION_KEYS_ONLY)
        m_bKeysOnly = true;
    else
        m_bKeysOnly = false;

    m_scanPlan = new CUDPPScanPlan(scanConfig, numBlocks2*16, 1, 0);

    allocRadixSortStorage(this);
}
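
// Annotation (not in the original source): numBlocks2 above is
// ceil(numElements / (2 * SORT_CTA_SIZE)), i.e. the number of sort blocks
// when each block processes two tiles; sizing the scan plan at
// numBlocks2 * 16 appears to reflect the 16 per-block bucket counters
// produced when keys are processed in 4-bit radix digits.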

CUDPPRadixSortPlan::~CUDPPRadixSortPlan()
{
    delete m_scanPlan;
    freeRadixSortStorage(this);
}

/** @brief SparseMatrixVectorMultiply Plan constructor
 *
 * @param[in] config The configuration struct specifying options
 * @param[in] numNonZeroElements The number of non-zero elements in sparse matrix
 * @param[in] A Array of non-zero matrix elements
 * @param[in] rowIndex Array of indices of the first element of each row
 *                     in the "flattened" version of the sparse matrix
 * @param[in] index Array of indices of non-zero elements in the matrix
 * @param[in] numRows The number of rows in the sparse matrix

CUDPPSparseMatrixVectorMultiplyPlan::CUDPPSparseMatrixVectorMultiplyPlan(
                                             CUDPPConfiguration config,
                                             size_t             numNonZeroElements,
                                             const void         *A,
                                             const unsigned int *rowIndex,
                                             const unsigned int *index,
                                             size_t             numRows
                                             )
    : CUDPPPlan(config, numNonZeroElements, 1, 0),
      m_segmentedScanPlan(0),
      m_d_prod(0),
      m_d_flags(0),
      m_d_rowFinalIndex(0),
      m_rowFinalIndex(0),
      m_numRows(numRows),
      m_numNonZeroElements(numNonZeroElements)
{
    CUDPPConfiguration segScanConfig =
    {
        CUDPP_SEGMENTED_SCAN,
        CUDPP_ADD,
        config.datatype,
        (CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE)
    };
    m_segmentedScanPlan = new CUDPPSegmentedScanPlan(segScanConfig, m_numNonZeroElements);

    // Generate an array of the indices of the last element of each row
    // in the "flattened" version of the sparse matrix
    m_rowFinalIndex = new unsigned int [m_numRows];
    for (unsigned int i=0; i < m_numRows; ++i)
    {
        if (i < m_numRows-1)
            m_rowFinalIndex[i] = rowIndex[i+1];
        else
            m_rowFinalIndex[i] = (unsigned int)numNonZeroElements;
    }

    allocSparseMatrixVectorMultiplyStorage(this, A, rowIndex, index);
}
*/
/** @brief Sparse matrix-vector plan destructor
CUDPPSparseMatrixVectorMultiplyPlan::~CUDPPSparseMatrixVectorMultiplyPlan()
{
    freeSparseMatrixVectorMultiplyStorage(this);
    delete m_segmentedScanPlan;
    delete [] m_rowFinalIndex;
}
*/
/** @brief CUDPP Rand Plan Constructor
 * @param[in] config The configuration struct specifying options
 * @param[in] num_elements The number of elements to generate random bits for

CUDPPRandPlan::CUDPPRandPlan(CUDPPConfiguration config, size_t num_elements)
    : CUDPPPlan(config, num_elements, 1, 0),
      m_seed(0)
{
}
*/
@@ -0,0 +1,158 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef __CUDPP_PLAN_H__
#define __CUDPP_PLAN_H__

typedef void* KernelPointer;

extern "C" size_t getNumCTAs(KernelPointer kernel);
extern "C" void compNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock);

template <typename T>
size_t numCTAs(T kernel)
{
    return getNumCTAs((KernelPointer)kernel);
}

template <typename T>
void computeNumCTAs(T kernel, unsigned int bytesDynamicSharedMem, size_t threadsPerBlock)
{
    compNumCTAs((KernelPointer)kernel, bytesDynamicSharedMem, threadsPerBlock);
}

/** @brief Base class for CUDPP Plan data structures
 *
 * CUDPPPlan and its subclasses provide the internal (i.e. not visible to the
 * library user) infrastructure for planning algorithm execution. They
 * own intermediate storage for CUDPP algorithms as well as, in some cases,
 * information about optimal execution configuration for the present hardware.
 *
 */
class CUDPPPlan
{
public:
    CUDPPPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
    virtual ~CUDPPPlan() {}

    // Note anything passed to functions compiled by NVCC must be public
    CUDPPConfiguration m_config;      //!< @internal Options structure
    size_t             m_numElements; //!< @internal Maximum number of input elements
    size_t             m_numRows;     //!< @internal Maximum number of input rows
    size_t             m_rowPitch;    //!< @internal Pitch of input rows in elements
};

/** @brief Plan class for scan algorithm
|
||||
*
|
||||
*/
|
||||
class CUDPPScanPlan : public CUDPPPlan
|
||||
{
|
||||
public:
|
||||
CUDPPScanPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
|
||||
virtual ~CUDPPScanPlan();
|
||||
|
||||
void **m_blockSums; //!< @internal Intermediate block sums array
|
||||
size_t *m_rowPitches; //!< @internal Pitch of each row in elements (for cudppMultiScan())
|
||||
size_t m_numEltsAllocated; //!< @internal Number of elements allocated (maximum scan size)
|
||||
size_t m_numRowsAllocated; //!< @internal Number of rows allocated (for cudppMultiScan())
|
||||
size_t m_numLevelsAllocated; //!< @internal Number of levels allocated (in _scanBlockSums)
|
||||
};
|
||||
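// Sizing note for m_blockSums (a hedged sketch, assuming each CTA scans a
// fixed CHUNK of elements, e.g. 256): the recursive scan needs one level of
// block sums per division by CHUNK, which is what m_numLevelsAllocated
// records.
//
//   size_t levels = 0;
//   for (size_t n = numElements; n > 1; n = (n + CHUNK - 1) / CHUNK)
//       ++levels; // one block-sums array per level above the leaves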
|
||||
/** @brief Plan class for segmented scan algorithm
|
||||
*
|
||||
*/
|
||||
class CUDPPSegmentedScanPlan : public CUDPPPlan
|
||||
{
|
||||
public:
|
||||
CUDPPSegmentedScanPlan(CUDPPConfiguration config, size_t numElements);
|
||||
virtual ~CUDPPSegmentedScanPlan();
|
||||
|
||||
void **m_blockSums; //!< @internal Intermediate block sums array
|
||||
unsigned int **m_blockFlags; //!< @internal Intermediate block flags array
|
||||
unsigned int **m_blockIndices; //!< @internal Intermediate block indices array
|
||||
size_t m_numEltsAllocated; //!< @internal Number of elements allocated (maximum scan size)
|
||||
size_t m_numLevelsAllocated; //!< @internal Number of levels allocated (in _scanBlockSums)
|
||||
};
|
||||
|
||||
/** @brief Plan class for compact algorithm
|
||||
*
|
||||
*/
|
||||
class CUDPPCompactPlan : public CUDPPPlan
|
||||
{
|
||||
public:
|
||||
CUDPPCompactPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
|
||||
virtual ~CUDPPCompactPlan();
|
||||
|
||||
CUDPPScanPlan *m_scanPlan; //!< @internal Compact performs a scan of type unsigned int using this plan
|
||||
unsigned int* m_d_outputIndices; //!< @internal Output address of compacted elements; this is the result of scan
|
||||
|
||||
};
|
||||
|
||||
class CUDPPRadixSortPlan : public CUDPPPlan
|
||||
{
|
||||
public:
|
||||
CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements);
|
||||
virtual ~CUDPPRadixSortPlan();
|
||||
|
||||
bool m_bKeysOnly;
|
||||
bool m_bManualCoalesce;
|
||||
bool m_bUsePersistentCTAs;
|
||||
unsigned int m_persistentCTAThreshold[2];
|
||||
unsigned int m_persistentCTAThresholdFullBlocks[2];
|
||||
CUDPPScanPlan *m_scanPlan; //!< @internal Sort performs a scan of type unsigned int using this plan
|
||||
unsigned int m_keyBits;
|
||||
mutable void *m_tempKeys; //!< @internal Intermediate storage for keys
|
||||
mutable void *m_tempValues; //!< @internal Intermediate storage for values
|
||||
unsigned int *m_counters; //!< @internal Counter for each radix
|
||||
unsigned int *m_countersSum; //!< @internal Prefix sum of radix counters
|
||||
unsigned int *m_blockOffsets; //!< @internal Global offsets of each radix in each block
|
||||
|
||||
};
|
||||
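// A hedged sketch of how these members cooperate in one radix pass (the
// actual kernels live elsewhere; 4-bit digits are assumed, which matches
// SORT_CTA_SIZE = 16 * 16 in the radix sort header added later in this
// change):
//
//   for (unsigned int startBit = 0; startBit < m_keyBits; startBit += 4)
//   {
//       // 1. histogram each block's digits into m_counters
//       // 2. prefix-sum the counters via m_scanPlan into m_countersSum
//       // 3. scatter keys (and values) to m_tempKeys/m_tempValues using
//       //    m_countersSum and m_blockOffsets
//   }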
|
||||
/** @brief Plan class for sparse-matrix dense-vector multiply
|
||||
*
|
||||
*/
|
||||
class CUDPPSparseMatrixVectorMultiplyPlan : public CUDPPPlan
|
||||
{
|
||||
public:
|
||||
CUDPPSparseMatrixVectorMultiplyPlan(CUDPPConfiguration config, size_t numNZElts,
|
||||
const void *A,
|
||||
const unsigned int *rowindx,
|
||||
const unsigned int *indx, size_t numRows);
|
||||
virtual ~CUDPPSparseMatrixVectorMultiplyPlan();
|
||||
|
||||
CUDPPSegmentedScanPlan *m_segmentedScanPlan; //!< @internal Performs a segmented scan of type T using this plan
|
||||
void *m_d_prod; //!< @internal Vector of products (of an element in A and its corresponding (that is,
|
||||
//! belonging to the same row) element in x; this is the input and output of
|
||||
//! segmented scan
|
||||
unsigned int *m_d_flags; //!< @internal Vector of flags where a flag is set if an element of A is the first element
|
||||
//! of its row; this is the flags vector for segmented scan
|
||||
unsigned int *m_d_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
|
||||
//! which is the last element of that row. Resides in GPU memory.
|
||||
unsigned int *m_d_rowIndex; //!< @internal Vector of row start indices, which for each row specifies an index in A
|
||||
//! which is the first element of that row. Resides in GPU memory.
|
||||
unsigned int *m_d_index; //!< @internal Vector of column numbers, one for each element in A
|
||||
void *m_d_A; //!< @internal The A matrix
|
||||
unsigned int *m_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
|
||||
//! which is the last element of that row. Resides in CPU memory.
|
||||
size_t m_numRows; //!< Number of rows
|
||||
size_t m_numNonZeroElements; //!< Number of non-zero elements
|
||||
};
|
||||
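// The members above encode a CSR-like layout. A host-side sketch of how the
// segmented-scan flags could be derived from the row pointers (this mirrors
// the commented-out constructor earlier in this change; `flags` is assumed
// zero-initialized):
//
//   // rowIndex[i] = index in A of the first element of row i
//   for (unsigned int i = 0; i < numRows; ++i)
//       flags[rowIndex[i]] = 1; // mark each row head; all others stay 0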
|
||||
/** @brief Plan class for random number generator
|
||||
*
|
||||
*/
|
||||
class CUDPPRandPlan : public CUDPPPlan
|
||||
{
|
||||
public:
|
||||
CUDPPRandPlan(CUDPPConfiguration config, size_t num_elements);
|
||||
|
||||
unsigned int m_seed; //!< @internal the seed for the random number generator
|
||||
};
|
||||
#endif // __CUDPP_PLAN_H__
|
|
@ -0,0 +1,155 @@
|
|||
// -------------------------------------------------------------
|
||||
// cuDPP -- CUDA Data Parallel Primitives library
|
||||
// -------------------------------------------------------------
|
||||
// $Revision: 3572$
|
||||
// $Date: 2007-11-19 13:58:06 +0000 (Mon, 19 Nov 2007) $
|
||||
// -------------------------------------------------------------
|
||||
// This source code is distributed under the terms of license.txt
|
||||
// in the root directory of this source distribution.
|
||||
// -------------------------------------------------------------
|
||||
#include "cudpp.h"
|
||||
#include "cudpp_plan.h"
|
||||
#include "cudpp_plan_manager.h"
|
||||
#include "cudpp_maximal_launch.h"
|
||||
|
||||
typedef void* KernelPointer;
|
||||
|
||||
extern "C" size_t getNumCTAs(KernelPointer kernel)
|
||||
{
|
||||
return CUDPPPlanManager::numCTAs(kernel);
|
||||
}
|
||||
extern "C" void compNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock)
|
||||
{
|
||||
CUDPPPlanManager::computeNumCTAs(kernel, bytesDynamicSharedMem, threadsPerBlock);
|
||||
}
|
||||
|
||||
//! @internal Instantiate the plan manager singleton object
|
||||
void CUDPPPlanManager::Instantiate()
|
||||
{
|
||||
if (NULL == m_instance)
|
||||
m_instance = new CUDPPPlanManager;
|
||||
}
|
||||
|
||||
//! @internal Destroy the plan manager singleton object
|
||||
void CUDPPPlanManager::Destroy()
|
||||
{
|
||||
if (NULL != m_instance)
|
||||
{
|
||||
delete m_instance;
|
||||
m_instance = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Plan Manager destructor
|
||||
* Destroys all plans as well as the plan manager.
|
||||
*/
|
||||
CUDPPPlanManager::~CUDPPPlanManager()
|
||||
{
|
||||
std::map<CUDPPHandle,CUDPPPlan*>::iterator it;
|
||||
|
||||
for (it = m_instance->plans.begin(); it != m_instance->plans.end(); it++)
|
||||
{
|
||||
CUDPPPlan* plan = it->second;
|
||||
delete plan;
|
||||
plan = NULL;
|
||||
}
|
||||
m_instance->plans.clear();
|
||||
|
||||
m_instance->numCTAsTable.clear();
|
||||
}
|
||||
|
||||
/** @brief Add a plan to the plan manager
|
||||
*
|
||||
* @returns a valid CUDPPHandle if the plan was successfully added, or
|
||||
* CUDPP_INVALID_HANDLE otherwise
|
||||
* @param[in] plan The plan to add
|
||||
*/
|
||||
CUDPPHandle CUDPPPlanManager::AddPlan(CUDPPPlan* plan)
|
||||
{
|
||||
Instantiate();
|
||||
|
||||
std::pair<std::map<CUDPPHandle, CUDPPPlan*>::iterator, bool> ret;
|
||||
|
||||
CUDPPHandle handle = (CUDPPHandle)m_instance->plans.size();
|
||||
ret = m_instance->plans.insert(std::pair<CUDPPHandle,CUDPPPlan*>(handle, plan));
|
||||
if (ret.second == true)
|
||||
return handle;
|
||||
else
|
||||
return CUDPP_INVALID_HANDLE;
|
||||
}
|
||||
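// Handle lifecycle in brief (hedged sketch; `somePlan` is hypothetical):
//
//   CUDPPHandle h = CUDPPPlanManager::AddPlan(somePlan); // handle == map size
//   CUDPPPlan*  p = CUDPPPlanManager::GetPlan(h);        // lookup, or NULL
//   CUDPPPlanManager::RemovePlan(h);                     // deletes the plan
//
// Because the handle is taken from plans.size() at insertion time, an
// AddPlan() that follows a RemovePlan() can collide with a still-live handle,
// in which case the insert fails and CUDPP_INVALID_HANDLE is returned.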
|
||||
/** @brief Remove a plan from the plan manager
|
||||
*
|
||||
* @returns true if the plan was successfully removed, false otherwise
|
||||
* @param[in] handle The handle to the plan to remove
|
||||
*/
|
||||
bool CUDPPPlanManager::RemovePlan(CUDPPHandle handle)
|
||||
{
|
||||
if (m_instance == NULL)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
std::map<CUDPPHandle,CUDPPPlan*>::iterator it;
|
||||
it = m_instance->plans.find(handle);
|
||||
|
||||
if (it != m_instance->plans.end())
|
||||
{
|
||||
CUDPPPlan* plan = it->second;
|
||||
delete plan;
|
||||
plan = NULL;
|
||||
m_instance->plans.erase(it);
|
||||
|
||||
if (0 == m_instance->plans.size())
|
||||
{
|
||||
Destroy();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Get a plan from the plan manager by handle
|
||||
*
|
||||
* @returns A pointer to the plan if found, or NULL otherwise
|
||||
* @param handle The handle to the requested plan
|
||||
*/
|
||||
CUDPPPlan* CUDPPPlanManager::GetPlan(CUDPPHandle handle)
|
||||
{
|
||||
if (m_instance == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
std::map<CUDPPHandle, CUDPPPlan*>::iterator it;
|
||||
it = m_instance->plans.find(handle);
|
||||
if (it != m_instance->plans.end())
|
||||
{
|
||||
return it->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
size_t CUDPPPlanManager::numCTAs(KernelPointer kernel)
|
||||
{
|
||||
if (m_instance == NULL)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
return m_instance->numCTAsTable[kernel];
|
||||
}
|
||||
|
||||
void CUDPPPlanManager::computeNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock)
|
||||
{
|
||||
Instantiate();
|
||||
|
||||
m_instance->numCTAsTable[kernel] = maxBlocks(kernel, bytesDynamicSharedMem, threadsPerBlock);
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
// -------------------------------------------------------------
|
||||
// cuDPP -- CUDA Data Parallel Primitives library
|
||||
// -------------------------------------------------------------
|
||||
// $Revision: 3572$
|
||||
// $Date$
|
||||
// -------------------------------------------------------------
|
||||
// This source code is distributed under the terms of license.txt
|
||||
// in the root directory of this source distribution.
|
||||
// -------------------------------------------------------------
|
||||
#ifndef __CUDPP_PLAN_MANAGER_H__
|
||||
#define __CUDPP_PLAN_MANAGER_H__
|
||||
|
||||
#include <map>
|
||||
|
||||
class CUDPPPlan;
|
||||
typedef void* KernelPointer;
|
||||
|
||||
/** @brief Singleton manager class for CUDPPPlan objects
|
||||
*
|
||||
* This class manages all active plans in CUDPP. It is a singleton class,
|
||||
* meaning that only one instance can exist. It is created automatically the
|
||||
* first time AddPlan() is called, and destroyed when the last plan is removed
|
||||
* using RemovePlan().
|
||||
*/
|
||||
class CUDPPPlanManager
|
||||
{
|
||||
public:
|
||||
static CUDPPHandle AddPlan(CUDPPPlan* plan);
|
||||
static bool RemovePlan(CUDPPHandle handle);
|
||||
static CUDPPPlan* GetPlan(CUDPPHandle handle);
|
||||
|
||||
static size_t numCTAs(KernelPointer kernel);
|
||||
static void computeNumCTAs(KernelPointer kernel,
|
||||
size_t bytesDynamicSharedMem,
|
||||
size_t threadsPerBlock);
|
||||
|
||||
protected:
|
||||
static CUDPPPlanManager* m_instance;
|
||||
std::map<CUDPPHandle, CUDPPPlan*> plans;
|
||||
std::map<void*, size_t> numCTAsTable;
|
||||
|
||||
private:
|
||||
|
||||
|
||||
//! @internal Instantiate the plan manager singleton object
|
||||
static void Instantiate();
|
||||
//! @internal Destroy the plan manager singleton object
|
||||
static void Destroy();
|
||||
|
||||
private:
|
||||
CUDPPPlanManager() {}
|
||||
CUDPPPlanManager(const CUDPPPlanManager&) {}
|
||||
~CUDPPPlanManager();
|
||||
};
|
||||
|
||||
#endif // __CUDPP_PLAN_MANAGER_H__
|
|
@ -0,0 +1,34 @@
|
|||
// -------------------------------------------------------------
|
||||
// cuDPP -- CUDA Data Parallel Primitives library
|
||||
// -------------------------------------------------------------
|
||||
// $Revision$
|
||||
// $Date$
|
||||
// -------------------------------------------------------------
|
||||
// This source code is distributed under the terms of license.txt
|
||||
// in the root directory of this source distribution.
|
||||
// -------------------------------------------------------------
|
||||
#ifndef __RADIXSORT_H__
|
||||
#define __RADIXSORT_H__
|
||||
|
||||
#define SORT_CTA_SIZE 256 // This CTA_SIZE must equal 16 * the number of radices (16 * 16 here: 4-bit digits give 16 radices)
|
||||
|
||||
#include "cudpp_globals.h"
|
||||
#include "cudpp.h"
|
||||
#include "cudpp_plan.h"
|
||||
|
||||
|
||||
extern "C"
|
||||
void allocRadixSortStorage(CUDPPRadixSortPlan* plan);
|
||||
|
||||
extern "C"
|
||||
void freeRadixSortStorage(CUDPPRadixSortPlan* plan);
|
||||
|
||||
extern "C"
|
||||
void cudppRadixSortDispatch(void *keys,
|
||||
void *values,
|
||||
size_t numElements,
|
||||
int keyBits,
|
||||
const CUDPPRadixSortPlan *plan);
|
||||
|
||||
|
||||
#endif // __RADIXSORT_H__
|
|
@ -0,0 +1,36 @@
|
|||
// -------------------------------------------------------------
|
||||
// cuDPP -- CUDA Data Parallel Primitives library
|
||||
// -------------------------------------------------------------
|
||||
// $Revision$
|
||||
// $Date$
|
||||
// -------------------------------------------------------------
|
||||
// This source code is distributed under the terms of license.txt
|
||||
// in the root directory of this source distribution.
|
||||
// -------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @file
|
||||
* cudpp_scan.h
|
||||
*
|
||||
* @brief Scan functionality header file - contains CUDPP interface (not public)
|
||||
*/
|
||||
|
||||
#ifndef _CUDPP_SCAN_H_
|
||||
#define _CUDPP_SCAN_H_
|
||||
|
||||
class CUDPPScanPlan;
|
||||
|
||||
extern "C"
|
||||
void allocScanStorage(CUDPPScanPlan *plan);
|
||||
|
||||
extern "C"
|
||||
void freeScanStorage(CUDPPScanPlan *plan);
|
||||
|
||||
extern "C"
|
||||
void cudppScanDispatch(void *d_out,
|
||||
const void *d_in,
|
||||
size_t numElements,
|
||||
size_t numRows,
|
||||
const CUDPPScanPlan *plan);
|
||||
|
||||
#endif // _CUDPP_SCAN_H_
|
|
@ -0,0 +1,363 @@
|
|||
// -------------------------------------------------------------
|
||||
// cuDPP -- CUDA Data Parallel Primitives library
|
||||
// -------------------------------------------------------------
|
||||
// $Revision$
|
||||
// $Date$
|
||||
// -------------------------------------------------------------
|
||||
// This source code is distributed under the terms of license.txt in
|
||||
// the root directory of this source distribution.
|
||||
// -------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @file
|
||||
* cudpp_util.h
|
||||
*
|
||||
* @brief C++ utility functions and classes used internally to cuDPP
|
||||
*/
|
||||
|
||||
#ifndef __CUDPP_UTIL_H__
|
||||
#define __CUDPP_UTIL_H__
|
||||
|
||||
#ifdef WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cudpp.h>
|
||||
#include <limits.h>
|
||||
#include <float.h>
#include <math.h>
|
||||
|
||||
#if (CUDA_VERSION >= 3000)
|
||||
#define LAUNCH_BOUNDS(x) __launch_bounds__((x))
|
||||
#define LAUNCH_BOUNDS_MINBLOCKS(x, y) __launch_bounds__((x),(y))
|
||||
#else
|
||||
#define LAUNCH_BOUNDS(x)
|
||||
#define LAUNCH_BOUNDS_MINBLOCKS(x, y)
|
||||
#endif
|
||||
|
||||
|
||||
/** @brief Determine if \a n is a power of two.
|
||||
* @param n Value to be checked to see if it is a power of two
|
||||
* @returns True if \a n is a power of two, false otherwise
|
||||
*/
|
||||
inline bool
|
||||
isPowerOfTwo(int n)
|
||||
{
|
||||
return ((n&(n-1))==0);
|
||||
}
|
||||
|
||||
/** @brief Determine if an integer \a n is a multiple of an integer \a f.
|
||||
* @param n Multiple
|
||||
* @param f Factor
|
||||
* @returns True if \a n is a multiple of \a f, false otherwise
|
||||
*/
|
||||
inline bool
|
||||
isMultiple(int n, int f)
|
||||
{
|
||||
if (isPowerOfTwo(f))
|
||||
return ((n&(f-1))==0);
|
||||
else
|
||||
return (n%f==0);
|
||||
}
|
||||
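// Worked examples of the two bit tricks above: a power of two has a single
// set bit, so n & (n-1) clears it to zero, and for a power-of-two factor f
// the remainder n % f equals the low bits n & (f-1).
//
//   isPowerOfTwo(64);   // 64 & 63 == 0        -> true
//   isPowerOfTwo(48);   // 48 & 47 == 32 != 0  -> false
//   isMultiple(48, 16); // 48 & 15 == 0        -> true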
|
||||
/** @brief Compute the smallest power of two greater than or equal to \a n.
|
||||
* @param n Input value
|
||||
* @returns The smallest power of two greater than or equal to \a n
|
||||
*/
|
||||
inline int
|
||||
ceilPow2(int n)
|
||||
{
|
||||
double log2n = log2((double)n);
|
||||
if (isPowerOfTwo(n))
|
||||
return n;
|
||||
else
|
||||
return 1 << (int)ceil(log2n);
|
||||
}
|
||||
|
||||
/** @brief Compute the largest power of two less than or equal to \a n.
|
||||
* @param n Input value
|
||||
* @returns The largest power of two less than or equal to \a n.
|
||||
*/
|
||||
inline int
|
||||
floorPow2(int n)
|
||||
{
|
||||
#ifdef WIN32
|
||||
// method 2
|
||||
return 1 << (int)_logb((float)n);
|
||||
#else
|
||||
// method 3
|
||||
int exp;
|
||||
frexp((float)n, &exp);
|
||||
return 1 << (exp - 1);
|
||||
#endif
|
||||
}
|
||||
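// Example values for the two helpers above (powers of two map to themselves
// in both directions):
//
//   ceilPow2(100);  // -> 128
//   floorPow2(100); // -> 64
//   ceilPow2(64);   // -> 64
//   floorPow2(64);  // -> 64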
|
||||
/** @brief Returns the maximum value for type \a T.
|
||||
*
|
||||
* Implemented using template specialization on \a T.
|
||||
*/
|
||||
template <class T>
|
||||
__host__ __device__ inline T getMax() { return 0; }
|
||||
/** @brief Returns the minimum value for type \a T.
|
||||
*
|
||||
* Implemented using template specialization on \a T.
|
||||
*/
|
||||
template <class T>
|
||||
__host__ __device__ inline T getMin() { return 0; }
|
||||
// type specializations for the above
|
||||
// getMax
|
||||
template <> __host__ __device__ inline int getMax() { return INT_MAX; }
|
||||
template <> __host__ __device__ inline unsigned int getMax() { return UINT_MAX; }
|
||||
template <> __host__ __device__ inline float getMax() { return FLT_MAX; }
|
||||
template <> __host__ __device__ inline char getMax() { return CHAR_MAX; }
|
||||
template <> __host__ __device__ inline unsigned char getMax() { return UCHAR_MAX; }
|
||||
// getMin
|
||||
template <> __host__ __device__ inline int getMin() { return INT_MIN; }
|
||||
template <> __host__ __device__ inline unsigned int getMin() { return 0; }
|
||||
template <> __host__ __device__ inline float getMin() { return -FLT_MAX; }
|
||||
template <> __host__ __device__ inline char getMin() { return CHAR_MIN; }
|
||||
template <> __host__ __device__ inline unsigned char getMin() { return (unsigned char)0; }
|
||||
|
||||
/** @brief Returns the maximum of three values.
|
||||
* @param a First value.
|
||||
* @param b Second value.
|
||||
* @param c Third value.
|
||||
* @returns The maximum of \a a, \a b and \a c.
|
||||
*/
|
||||
template<class T>
|
||||
inline T max3(T a, T b, T c)
|
||||
{
|
||||
return (a > b) ? ((a > c)? a : c) : ((b > c) ? b : c);
|
||||
}
|
||||
|
||||
/** @brief Utility template struct for generating small vector types from scalar types
|
||||
*
|
||||
* Given a base scalar type (\c int, \c float, etc.) and a vector length (1 through 4) as
|
||||
* template parameters, this struct defines a vector type (\c float3, \c int4, etc.) of the
|
||||
* specified length and base type. For example:
|
||||
* \code
|
||||
* template <class T>
|
||||
* __device__ void myKernel(T *data)
|
||||
* {
|
||||
* typename typeToVector<T,4>::Result myVec4; // create a vec4 of type T
|
||||
* myVec4 = ((typename typeToVector<T,4>::Result*)data)[0]; // load first element of data as a vec4
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* This functionality is implemented using template specialization. Currently specializations
|
||||
* for int, float, and unsigned int vectors of lengths 2-4 are defined. Note that this results
|
||||
* in types being generated at compile time -- there is no runtime cost. typeToVector is used by
|
||||
* the optimized scan \c __device__ functions in scan_cta.cu.
|
||||
*/
|
||||
template <typename T, int N>
|
||||
struct typeToVector
|
||||
{
|
||||
typedef T Result;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct typeToVector<int, 4>
|
||||
{
|
||||
typedef int4 Result;
|
||||
};
|
||||
template<>
|
||||
struct typeToVector<unsigned int, 4>
|
||||
{
|
||||
typedef uint4 Result;
|
||||
};
|
||||
template<>
|
||||
struct typeToVector<float, 4>
|
||||
{
|
||||
typedef float4 Result;
|
||||
};
|
||||
template<>
|
||||
struct typeToVector<int, 3>
|
||||
{
|
||||
typedef int3 Result;
|
||||
};
|
||||
template<>
|
||||
struct typeToVector<unsigned int, 3>
|
||||
{
|
||||
typedef uint3 Result;
|
||||
};
|
||||
template<>
|
||||
struct typeToVector<float, 3>
|
||||
{
|
||||
typedef float3 Result;
|
||||
};
|
||||
template<>
|
||||
struct typeToVector<int, 2>
|
||||
{
|
||||
typedef int2 Result;
|
||||
};
|
||||
template<>
|
||||
struct typeToVector<unsigned int, 2>
|
||||
{
|
||||
typedef uint2 Result;
|
||||
};
|
||||
template<>
|
||||
struct typeToVector<float, 2>
|
||||
{
|
||||
typedef float2 Result;
|
||||
};
|
||||
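// A concrete instance of the pattern documented above, assuming `data` is a
// 16-byte-aligned device pointer to float:
//
//   typedef typeToVector<float, 4>::Result vec_t; // vec_t is float4
//   vec_t v = ((vec_t*)data)[threadIdx.x];        // one coalesced vec4 load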
|
||||
/** @brief Templatized operator class used by scan and segmented scan
|
||||
*
|
||||
* This Operator class is used to allow generic support of binary
|
||||
* associative operators in scan. It defines two member functions,
|
||||
* op() and identity(), that are used in place of + and 0 (for
|
||||
* example) in the scan and segmented scan code. Because this is
|
||||
* template code, all decisions in the code are made at compile
|
||||
* time, resulting in optimal operator code. Currently the operators
|
||||
* CUDPP_ADD, CUDPP_MULTIPLY, CUDPP_MIN, and CUDPP_MAX are supported.
|
||||
* Operator is implemented using template specialization for the
|
||||
* types \c int, \c unsigned int, and \c float.
|
||||
*/
|
||||
template <typename T, CUDPPOperator oper>
|
||||
class Operator
|
||||
{
|
||||
public:
|
||||
/** Applies the operator to operands \a a and \a b.
|
||||
* @param a First operand
|
||||
* @param b Second operand
|
||||
* @returns a OP b, where OP is defined by ::CUDPPOperator \a oper.
|
||||
*/
|
||||
static __device__ T op(const T a, const T b)
|
||||
{
|
||||
switch (oper)
|
||||
{
|
||||
default:
case CUDPP_ADD:
|
||||
return a + b;
|
||||
case CUDPP_MULTIPLY:
|
||||
return a * b;
|
||||
case CUDPP_MIN:
|
||||
return min(a, b);
|
||||
case CUDPP_MAX:
|
||||
return max(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the identity element defined for type \a T */
|
||||
static __device__ T identity() { return 0; }
|
||||
};
|
||||
|
||||
// specializations for different types
|
||||
template <CUDPPOperator oper>
|
||||
class Operator <int, oper>
|
||||
{
|
||||
public:
|
||||
static __device__ int op(const int a, const int b)
|
||||
{
|
||||
switch (oper)
|
||||
{
|
||||
default:
|
||||
case CUDPP_ADD:
|
||||
return a + b;
|
||||
case CUDPP_MULTIPLY:
|
||||
return a * b;
|
||||
case CUDPP_MIN:
|
||||
return min(a, b);
|
||||
case CUDPP_MAX:
|
||||
return max(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ int identity()
|
||||
{
|
||||
switch (oper)
|
||||
{
|
||||
default:
|
||||
case CUDPP_ADD:
|
||||
return 0;
|
||||
case CUDPP_MULTIPLY:
|
||||
return 1;
|
||||
case CUDPP_MIN:
|
||||
return INT_MAX;
|
||||
case CUDPP_MAX:
|
||||
return INT_MIN;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <CUDPPOperator oper>
|
||||
class Operator <unsigned int, oper>
|
||||
{
|
||||
public:
|
||||
static __device__ unsigned int op(const unsigned int a, const unsigned int b)
|
||||
{
|
||||
switch (oper)
|
||||
{
|
||||
default:
|
||||
case CUDPP_ADD:
|
||||
return a + b;
|
||||
case CUDPP_MULTIPLY:
|
||||
return a * b;
|
||||
case CUDPP_MIN:
|
||||
return min(a, b);
|
||||
case CUDPP_MAX:
|
||||
return max(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ unsigned int identity()
|
||||
{
|
||||
switch (oper)
|
||||
{
|
||||
default:
|
||||
case CUDPP_ADD:
|
||||
return 0;
|
||||
case CUDPP_MULTIPLY:
|
||||
return 1;
|
||||
case CUDPP_MIN:
|
||||
return UINT_MAX;
|
||||
case CUDPP_MAX:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <CUDPPOperator oper>
|
||||
class Operator <float, oper>
|
||||
{
|
||||
public:
|
||||
static __device__ float op(const float a, const float b)
|
||||
{
|
||||
switch (oper)
|
||||
{
|
||||
default:
|
||||
case CUDPP_ADD:
|
||||
return a + b;
|
||||
case CUDPP_MULTIPLY:
|
||||
return a * b;
|
||||
case CUDPP_MIN:
|
||||
return min(a, b);
|
||||
case CUDPP_MAX:
|
||||
return max(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ float identity()
|
||||
{
|
||||
switch (oper)
|
||||
{
|
||||
default:
|
||||
case CUDPP_ADD:
|
||||
return 0.0f;
|
||||
case CUDPP_MULTIPLY:
|
||||
return 1.0f;
|
||||
case CUDPP_MIN:
|
||||
return FLT_MAX;
|
||||
case CUDPP_MAX:
|
||||
return -FLT_MAX;
|
||||
}
|
||||
}
|
||||
};
|
||||
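// How scan code consumes these operators (hedged sketch; T, oper, x and n
// are assumed to come from the enclosing templated __device__ function):
//
//   T sum = Operator<T, oper>::identity(); // e.g. 0 for CUDPP_ADD on int
//   for (int i = 0; i < n; ++i)
//       sum = Operator<T, oper>::op(sum, x[i]);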
|
||||
#endif // __CUDPP_UTIL_H__
|
||||
|
||||
// Leave this at the end of the file
|
||||
// Local Variables:
|
||||
// mode:c++
|
||||
// c-file-style: "NVIDIA"
|
||||
// End:
|
|
@ -0,0 +1,879 @@
|
|||
/*
|
||||
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO USER:
|
||||
*
|
||||
* This source code is subject to NVIDIA ownership rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
||||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
||||
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
||||
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
||||
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
||||
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
||||
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
||||
* OR PERFORMANCE OF THIS SOURCE CODE.
|
||||
*
|
||||
* U.S. Government End Users. This source code is a "commercial item" as
|
||||
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
||||
* "commercial computer software" and "commercial computer software
|
||||
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
||||
* and is provided to the U.S. Government only as a commercial end item.
|
||||
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
||||
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
||||
* source code with only those rights set forth herein.
|
||||
*/
|
||||
|
||||
|
||||
/* CUda UTility Library */
|
||||
|
||||
#ifndef _CUTIL_H_
|
||||
#define _CUTIL_H_
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
# pragma warning( disable : 4996 ) // disable deprecated warning
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// helper typedefs for building DLL
|
||||
#ifdef _WIN32
|
||||
# ifdef BUILD_DLL
|
||||
# define DLL_MAPPING __declspec(dllexport)
|
||||
# else
|
||||
# define DLL_MAPPING __declspec(dllimport)
|
||||
# endif
|
||||
#else
|
||||
# define DLL_MAPPING
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#define CUTIL_API __stdcall
|
||||
#else
|
||||
#define CUTIL_API
|
||||
#endif
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! CUT bool type
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
enum CUTBoolean
|
||||
{
|
||||
CUTFalse = 0,
|
||||
CUTTrue = 1
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Deallocate memory allocated within Cutil
|
||||
//! @param ptr pointer to memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
void CUTIL_API
|
||||
cutFree( void* ptr);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Helper for bank conflict checking (should only be used with the
|
||||
//! CUT_BANK_CHECKER macro)
|
||||
//! @param tidx thread id in x dimension of block
|
||||
//! @param tidy thread id in y dimension of block
|
||||
//! @param tidz thread id in z dimension of block
|
||||
//! @param bdimx block size in x dimension
|
||||
//! @param bdimy block size in y dimension
|
||||
//! @param bdimz block size in z dimension
|
||||
//! @param file name of the source file where the access takes place
|
||||
//! @param line line in the source file where the access takes place
|
||||
//! @param aname name of the array which is accessed
|
||||
//! @param index index into the array
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
void CUTIL_API
|
||||
cutCheckBankAccess( unsigned int tidx, unsigned int tidy, unsigned int tidz,
|
||||
unsigned int bdimx, unsigned int bdimy,
|
||||
unsigned int bdimz, const char* file, const int line,
|
||||
const char* aname, const int index);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename within a hardcoded set of paths
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
char* CUTIL_API
|
||||
cutFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename within a specified directory tree
|
||||
//! @return CUTTrue if the file was found, otherwise CUTFalse
|
||||
//! @param outputPath output buffer that receives the path found
|
||||
//! @param startDir directory tree in which to search
//! @param dirName name of the file to find
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutFindFile(char * outputPath, const char * startDir, const char * dirName);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a directory within a specified directory tree
|
||||
//! @return CUTTrue if the directory was found, otherwise CUTFalse
|
||||
//! @param outputPath output buffer that receives the path found
|
||||
//! @param startDir directory tree in which to search
//! @param dirName name of the directory to find
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutFindDir(char * outputPath, const char * startDir, const char * dirName);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFiled( const char* filename, const double* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFileui( const char* filename,const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutWriteFileub( const char* filename,const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type)
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPPMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPPM4ub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned int as data element type)
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPGMi( const char* file, unsigned int** data,
|
||||
unsigned int* w, unsigned int* h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned short as data element type)
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPGMs( const char* file, unsigned short** data,
|
||||
unsigned int* w, unsigned int* h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with float as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutLoadPGMf( const char* file, float** data,
|
||||
unsigned int* w, unsigned int* h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutSavePGMub( const char* file, unsigned char* data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutSavePPMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned int as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutSavePGMi( const char* file, unsigned int* data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned short as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutSavePGMs( const char* file, unsigned short* data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with float as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutSavePGMf( const char* file, float* data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag_name is given
|
||||
//! @return CUTTrue if command line argument \a flag_name has been given,
|
||||
//! otherwise CUTFalse
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return CUTTrue if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise CUTFalse
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return CUTTrue if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise CUTFalse
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return CUTTrue if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise CUTFalse
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list whose elements are strings
|
||||
//! @return CUTTrue if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise CUTFalse
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
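// Example of the conventions above (hedged; error handling omitted). For a
// program invoked as `app --samples=50 --verbose`:
//
//   int samples = 0;
//   cutGetCmdLineArgumenti(argc, (const char**)argv, "samples", &samples);
//   if (cutCheckCmdLineFlag(argc, (const char**)argv, "verbose"))
//   {
//       // ...
//   }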
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Extended assert
|
||||
//! @return CUTTrue if the condition \a val holds, otherwise CUTFalse
|
||||
//! @param val condition to test
|
||||
//! @param file __FILE__ macro
|
||||
//! @param line __LINE__ macro
|
||||
//! @note This function should be used via the CONDITION(val) macro
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutCheckCondition( int val, const char* file, const int line);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const int epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API
|
||||
cutCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
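
// Illustration (not part of cutil.h): a minimal host-side sketch of the kind
// of relative L2-norm test cutCompareL2fe performs, assuming the usual
// criterion ||reference - data||_2 / ||reference||_2 <= epsilon. The name
// referenceCompareL2 is hypothetical, for exposition only.
#include <math.h>
static int referenceCompareL2(const float* reference, const float* data,
                              unsigned int len, float epsilon)
{
    float errSq = 0.0f, refSq = 0.0f;
    for (unsigned int i = 0; i < len; ++i) {
        float diff = reference[i] - data[i];
        errSq += diff * diff;                  // accumulate squared error
        refSq += reference[i] * reference[i];  // accumulate reference norm
    }
    // relative L2 error; guard against an all-zero reference array
    float norm = (refSq > 0.0f) ? sqrtf(errSq) / sqrtf(refSq) : sqrtf(errSq);
    return norm <= epsilon;                    // nonzero result means "match"
}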

////////////////////////////////////////////////////////////////////////////
//! Timer functionality

////////////////////////////////////////////////////////////////////////////
//! Create a new timer
//! @return CUTTrue if a timer has been created, otherwise CUTFalse
//! @param name  name of the new timer, 0 if the creation failed
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCreateTimer( unsigned int* name);

////////////////////////////////////////////////////////////////////////////
//! Delete a timer
//! @return CUTTrue if a timer has been deleted, otherwise CUTFalse
//! @param name  name of the timer to delete
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutDeleteTimer( unsigned int name);

////////////////////////////////////////////////////////////////////////////
//! Start the timer with name \a name
//! @param name  name of the timer to start
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutStartTimer( const unsigned int name);

////////////////////////////////////////////////////////////////////////////
//! Stop the timer with name \a name. Does not reset.
//! @param name  name of the timer to stop
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutStopTimer( const unsigned int name);

////////////////////////////////////////////////////////////////////////////
//! Resets the timer's counter.
//! @param name  name of the timer to reset.
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutResetTimer( const unsigned int name);

////////////////////////////////////////////////////////////////////////////
//! Returns total execution time in milliseconds for the timer over all
//! runs since the last reset or timer creation.
//! @param name  name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API
cutGetTimerValue( const unsigned int name);

////////////////////////////////////////////////////////////////////////////
//! Return the average time in milliseconds for timer execution as the
//! total time for the timer divided by the number of completed (stopped)
//! runs the timer has made.
//! Excludes the current running time if the timer is currently running.
//! @param name  name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API
cutGetAverageTimerValue( const unsigned int name);
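
// Illustration (not part of cutil.h): a typical create/start/stop/read cycle
// for the timer functions above, as a hedged sketch. doSomeWork() is a
// hypothetical placeholder for the code being timed.
//
//   unsigned int timer = 0;
//   CUT_SAFE_CALL( cutCreateTimer( &timer));
//   for (int run = 0; run < 10; ++run) {
//       CUT_SAFE_CALL( cutStartTimer( timer));
//       doSomeWork();
//       CUT_SAFE_CALL( cutStopTimer( timer));
//   }
//   float total = cutGetTimerValue( timer);        // sum over all runs (ms)
//   float avg   = cutGetAverageTimerValue( timer); // total / completed runs
//   CUT_SAFE_CALL( cutDeleteTimer( timer));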

////////////////////////////////////////////////////////////////////////////
//! Macros

#ifdef _DEBUG

#if __DEVICE_EMULATION__
    // Interface for bank conflict checker
#define CUT_BANK_CHECKER( array, index)                                      \
    (cutCheckBankAccess( threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x,  \
    blockDim.y, blockDim.z,                                                  \
    __FILE__, __LINE__, #array, index ),                                     \
    array[index])
#else
#define CUT_BANK_CHECKER( array, index)  array[index]
#endif

#  define CU_SAFE_CALL_NO_SYNC( call ) do {                                  \
    CUresult err = call;                                                     \
    if( CUDA_SUCCESS != err) {                                               \
        fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n",   \
                err, __FILE__, __LINE__ );                                   \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

#  define CU_SAFE_CALL( call ) do {                                          \
    CU_SAFE_CALL_NO_SYNC(call);                                              \
    CUresult err = cuCtxSynchronize();                                       \
    if( CUDA_SUCCESS != err) {                                               \
        fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n",   \
                err, __FILE__, __LINE__ );                                   \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

#  define CUDA_SAFE_CALL_NO_SYNC( call) do {                                 \
    cudaError err = call;                                                    \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString( err) );              \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

#  define CUDA_SAFE_CALL( call) do {                                         \
    CUDA_SAFE_CALL_NO_SYNC(call);                                            \
    cudaError err = cudaThreadSynchronize();                                 \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                __FILE__, __LINE__, cudaGetErrorString( err) );              \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

#  define CUFFT_SAFE_CALL( call) do {                                        \
    cufftResult err = call;                                                  \
    if( CUFFT_SUCCESS != err) {                                              \
        fprintf(stderr, "CUFFT error in file '%s' in line %i.\n",            \
                __FILE__, __LINE__);                                         \
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

#  define CUT_SAFE_CALL( call)                                               \
    if( CUTTrue != call) {                                                   \
        fprintf(stderr, "Cut error in file '%s' in line %i.\n",              \
                __FILE__, __LINE__);                                         \
        exit(EXIT_FAILURE);                                                  \
    }

    //! Check for CUDA error
#  define CUT_CHECK_ERROR(errorMessage) do {                                 \
    cudaError_t err = cudaGetLastError();                                    \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
    err = cudaThreadSynchronize();                                           \
    if( cudaSuccess != err) {                                                \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
        exit(EXIT_FAILURE);                                                  \
    } } while (0)

    //! Check for malloc error
#  define CUT_SAFE_MALLOC( mallocCall ) do {                                 \
    if( !(mallocCall)) {                                                     \
        fprintf(stderr, "Host malloc failure in file '%s' in line %i\n",     \
                __FILE__, __LINE__);                                         \
        exit(EXIT_FAILURE);                                                  \
    } } while(0)

    //! Check if condition is true (flexible assert)
#  define CUT_CONDITION( val)                                                \
    if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) {           \
        exit(EXIT_FAILURE);                                                  \
    }

#else  // not DEBUG

#define CUT_BANK_CHECKER( array, index)  array[index]

    // void macros for performance reasons
#  define CUT_CHECK_ERROR(errorMessage)
#  define CUT_CHECK_ERROR_GL()
#  define CUT_CONDITION( val)
#  define CU_SAFE_CALL_NO_SYNC( call) call
#  define CU_SAFE_CALL( call) call
#  define CUDA_SAFE_CALL_NO_SYNC( call) call
#  define CUDA_SAFE_CALL( call) call
#  define CUT_SAFE_CALL( call) call
#  define CUFFT_SAFE_CALL( call) call
#  define CUT_SAFE_MALLOC( mallocCall ) mallocCall

#endif
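
// Illustration (not part of cutil.h): how the checked-call macros above are
// typically wrapped around runtime-API calls. The device pointer d_buf, the
// size nbytes, and myKernel are hypothetical names for this sketch.
//
//   float* d_buf = 0;
//   CUDA_SAFE_CALL( cudaMalloc( (void**)&d_buf, nbytes));
//   myKernel<<<grid, block>>>( d_buf);           // any kernel launch
//   CUT_CHECK_ERROR( "myKernel launch failed");  // also syncs in debug builds
//   CUDA_SAFE_CALL( cudaFree( d_buf));
//
// In release builds (_DEBUG undefined) these macros compile down to the bare
// call, so the error checks cost nothing in production.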

#if __DEVICE_EMULATION__

#  define CUT_DEVICE_INIT(ARGC, ARGV)

#else

#  define CUT_DEVICE_INIT(ARGC, ARGV) {                                      \
    int deviceCount;                                                         \
    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));                \
    if (deviceCount == 0) {                                                  \
        fprintf(stderr, "cutil error: no devices supporting CUDA.\n");       \
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
    int dev = 0;                                                             \
    cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);      \
    if (dev > deviceCount-1) dev = deviceCount - 1;                          \
    cudaDeviceProp deviceProp;                                               \
    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev));       \
    if (deviceProp.major < 1) {                                              \
        fprintf(stderr, "cutil error: device does not support CUDA.\n");     \
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
    if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \
        fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);      \
    CUDA_SAFE_CALL(cudaSetDevice(dev));                                      \
}

#endif

#  define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV) {                        \
    cuDevice = 0;                                                            \
    int deviceCount = 0;                                                     \
    CUresult err = cuInit(0);                                                \
    if (CUDA_SUCCESS == err)                                                 \
        CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));                \
    if (deviceCount == 0) {                                                  \
        fprintf(stderr, "cutil error: no devices supporting CUDA\n");        \
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
    int dev = 0;                                                             \
    cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);      \
    if (dev > deviceCount-1) dev = deviceCount - 1;                          \
    CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev));                       \
    char name[100];                                                          \
    cuDeviceGetName(name, 100, cuDevice);                                    \
    if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \
        fprintf(stderr, "Using device %d: %s\n", dev, name);                 \
}

#define CUT_EXIT(argc, argv)                                                 \
    if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {        \
        printf("\nPress ENTER to exit...\n");                                \
        fflush( stdout);                                                     \
        fflush( stderr);                                                     \
        getchar();                                                           \
    }                                                                        \
    exit(EXIT_SUCCESS);


#ifdef __cplusplus
}
#endif  // #ifdef __cplusplus

#endif  // #ifndef _CUTIL_H_

@ -0,0 +1,868 @@

// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

#include "cudpp_radixsort.h"
#include <cudpp_globals.h>
#include "sharedmem.h"
#include "cta/radixsort_cta.cu"

#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif

/**
 * @file
 * radixsort_app.cu
 *
 * @brief CUDPP kernel-level radix sorting routines
 */

/** \addtogroup cudpp_kernel
 * @{
 */

/** @name RadixSort Functions
 * @{
 */


typedef unsigned int uint;

/** @brief An empty kernel used to reset CTA issue hardware
 **/
__global__ void emptyKernel() {}


/** @brief Does special binary arithmetic before sorting floats
 *
 * Uses the floatFlip function to flip bits.
 * @param[in,out] values  Values to be manipulated
 * @param[in] numValues   Number of values to be flipped
 **/
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
flipFloats(uint *values, uint numValues)
{
    uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x;
    if (index < numValues) values[index] = floatFlip<true>(values[index]);
    index += blockDim.x;
    if (index < numValues) values[index] = floatFlip<true>(values[index]);
    index += blockDim.x;
    if (index < numValues) values[index] = floatFlip<true>(values[index]);
    index += blockDim.x;
    if (index < numValues) values[index] = floatFlip<true>(values[index]);
}
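
// Illustration (not in the original file): floatFlip/floatUnflip are defined
// elsewhere in CUDPP; the sketch below shows the standard bit trick such a
// flip is assumed to use so that IEEE-754 floats sort correctly as unsigned
// integers. Positive floats get the sign bit set; negative floats have all
// bits inverted, which reverses their (descending) raw-bit order.
static inline unsigned int floatFlipSketch(unsigned int f)
{
    // mask is 0xFFFFFFFF for negative inputs (sign bit set), 0x80000000 else
    unsigned int mask = (unsigned int)(-(int)(f >> 31)) | 0x80000000u;
    return f ^ mask;
}

static inline unsigned int floatUnflipSketch(unsigned int f)
{
    // invert the transform: values with the sign bit set were positive floats
    unsigned int mask = (f >> 31) ? 0x80000000u : 0xFFFFFFFFu;
    return f ^ mask;
}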

/** @brief Undoes the flips from flipFloats
 *
 * Uses the floatUnflip function to unflip bits.
 * @param[in,out] values  Values to be manipulated
 * @param[in] numValues   Number of values to be unflipped
 **/
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
unflipFloats(uint *values, uint numValues)
{
    uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x;
    if (index < numValues) values[index] = floatUnflip<true>(values[index]);
    index += blockDim.x;
    if (index < numValues) values[index] = floatUnflip<true>(values[index]);
    index += blockDim.x;
    if (index < numValues) values[index] = floatUnflip<true>(values[index]);
    index += blockDim.x;
    if (index < numValues) values[index] = floatUnflip<true>(values[index]);
}


/** @brief Optimization for sorts of WARP_SIZE or fewer elements
 *
 * @param[in,out] keys        Keys to be sorted.
 * @param[in,out] values      Associated values to be sorted (through keys).
 * @param[in]     numElements Number of elements in the sort.
 */
template <bool flip>
__global__
LAUNCH_BOUNDS(WARP_SIZE)
void radixSortSingleWarp(uint *keys,
                         uint *values,
                         uint numElements)
{
    volatile __shared__ uint sKeys[WARP_SIZE];   //remove class distinctions
    volatile __shared__ uint sValues[WARP_SIZE];
    volatile __shared__ uint sFlags[WARP_SIZE];

    sKeys[threadIdx.x]   = floatFlip<flip>(keys[threadIdx.x]);
    sValues[threadIdx.x] = values[threadIdx.x];

    __EMUSYNC; // emulation only

    for(uint i = 1; i < numElements; i++)
    {
        uint key_i = sKeys[i];
        uint val_i = sValues[i];

        sFlags[threadIdx.x] = 0;

        uint temp, tempval;
        if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
        {
            temp = sKeys[threadIdx.x];
            tempval = sValues[threadIdx.x];
            sFlags[threadIdx.x] = 1;

#ifdef __DEVICE_EMULATION__
        }
        __EMUSYNC;
        if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
        {
#endif
            sKeys[threadIdx.x + 1]   = temp;
            sValues[threadIdx.x + 1] = tempval;
            sFlags[threadIdx.x + 1]  = 0;
        }

        if(sFlags[threadIdx.x] == 1 )
        {
            sKeys[threadIdx.x]   = key_i;
            sValues[threadIdx.x] = val_i;
        }

        __EMUSYNC; // emulation only

    }
    keys[threadIdx.x]   = floatUnflip<flip>(sKeys[threadIdx.x]);
    values[threadIdx.x] = sValues[threadIdx.x];
}


/** @brief Optimization for sorts of WARP_SIZE or fewer elements. Keys-only version.
 *
 * @param[in,out] keys        Keys to be sorted
 * @param[in]     numElements Total number of elements to be sorted
 **/
template <bool flip>
__global__
LAUNCH_BOUNDS(WARP_SIZE)
void radixSortSingleWarpKeysOnly(uint *keys,
                                 uint numElements)
{
    volatile __shared__ uint sKeys[WARP_SIZE];
    volatile __shared__ uint sFlags[WARP_SIZE];

    sKeys[threadIdx.x] = floatFlip<flip>(keys[threadIdx.x]);

    __EMUSYNC; // emulation only

    for(uint i = 1; i < numElements; i++)
    {
        uint key_i = sKeys[i];

        sFlags[threadIdx.x] = 0;

        uint temp;
        if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
        {
            temp = sKeys[threadIdx.x];
            sFlags[threadIdx.x] = 1;
#ifdef __DEVICE_EMULATION__
        }
        __EMUSYNC;
        if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
        {
#endif
            sKeys[threadIdx.x + 1]  = temp;
            sFlags[threadIdx.x + 1] = 0;
        }
        if(sFlags[threadIdx.x] == 1 )
        {
            sKeys[threadIdx.x] = key_i;
        }

        __EMUSYNC; // emulation only

    }
    keys[threadIdx.x] = floatUnflip<flip>(sKeys[threadIdx.x]);
}

/** @brief Sorts all blocks of data independently in shared memory.
 *  Each thread block (CTA) sorts one block of 4*CTA_SIZE elements.
 *
 * The radix sort is done in two stages. This stage calls radixSortBlock on each
 * block independently, sorting on the basis of bits (startbit) -> (startbit + nbits).
 *
 * Template parameters are used to generate efficient code for various special cases.
 * For example, we have to handle arrays that are a multiple of the block size (fullBlocks)
 * differently than arrays that are not. "flip" is used to only compile in the
 * float flip code when float keys are used. "loop" is used when persistent CTAs
 * are used.
 *
 * By persistent CTAs we mean that we launch only as many thread blocks as can
 * be resident in the GPU and no more, rather than launching as many threads as
 * we have elements. Persistent CTAs loop over blocks of elements until all work
 * is complete. This can be faster in some cases. In our tests it is faster
 * for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
 *
 * @param[out] keysOut     Output of sorted keys
 * @param[out] valuesOut   Output of associated values
 * @param[in]  keysIn      Input of unsorted keys in GPU
 * @param[in]  valuesIn    Input of associated input values
 * @param[in]  numElements Total number of elements to sort
 * @param[in]  totalBlocks The number of blocks of data to sort
 */
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
radixSortBlocks(uint4* keysOut, uint4* valuesOut,
                uint4* keysIn, uint4* valuesIn,
                uint numElements, uint totalBlocks)
{
    extern __shared__ uint4 sMem[];

    uint4 key, value;

    uint blockId = blockIdx.x;

    while (!loop || blockId < totalBlocks)
    {
        uint i = blockId * blockDim.x + threadIdx.x;
        uint idx = i << 2;

        // handle non-full last block if array is not multiple of 1024 numElements
        if (!fullBlocks && idx+3 >= numElements)
        {
            if (idx >= numElements)
            {
                key   = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
                value = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
            }
            else
            {
                // for non-full block, we handle uint1 values instead of uint4
                uint *keys1   = (uint*)keysIn;
                uint *values1 = (uint*)valuesIn;

                key.x = (idx   < numElements) ? floatFlip<flip>(keys1[idx])   : UINT_MAX;
                key.y = (idx+1 < numElements) ? floatFlip<flip>(keys1[idx+1]) : UINT_MAX;
                key.z = (idx+2 < numElements) ? floatFlip<flip>(keys1[idx+2]) : UINT_MAX;
                key.w = UINT_MAX;

                value.x = (idx   < numElements) ? values1[idx]   : UINT_MAX;
                value.y = (idx+1 < numElements) ? values1[idx+1] : UINT_MAX;
                value.z = (idx+2 < numElements) ? values1[idx+2] : UINT_MAX;
                value.w = UINT_MAX;
            }
        }
        else
        {
            key   = keysIn[i];
            value = valuesIn[i];

            if (flip)
            {
                key.x = floatFlip<flip>(key.x);
                key.y = floatFlip<flip>(key.y);
                key.z = floatFlip<flip>(key.z);
                key.w = floatFlip<flip>(key.w);
            }
        }
        __syncthreads();
        radixSortBlock<nbits, startbit>(key, value);

        // handle non-full last block if array is not multiple of 1024 numElements
        if(!fullBlocks && idx+3 >= numElements)
        {
            if (idx < numElements)
            {
                // for non-full block, we handle uint1 values instead of uint4
                uint *keys1   = (uint*)keysOut;
                uint *values1 = (uint*)valuesOut;

                keys1[idx]   = key.x;
                values1[idx] = value.x;

                if (idx + 1 < numElements)
                {
                    keys1[idx + 1]   = key.y;
                    values1[idx + 1] = value.y;

                    if (idx + 2 < numElements)
                    {
                        keys1[idx + 2]   = key.z;
                        values1[idx + 2] = value.z;
                    }
                }
            }
        }
        else
        {
            keysOut[i]   = key;
            valuesOut[i] = value;
        }

        if (loop)
            blockId += gridDim.x;
        else
            break;
    }
}
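
// Illustration (not in the original file): a hedged sketch of how a
// persistent-CTA launch of radixSortBlocks might look on the host. The grid
// is sized to what the device can keep resident (numResidentBlocks is a
// hypothetical value derived from occupancy), not to the element count, and
// the kernel's internal while-loop walks blockId over all totalBlocks. The
// shared-memory size shown is an assumption for this sketch.
//
//   uint totalBlocks = numElements / (4 * SORT_CTA_SIZE);
//   uint gridSize    = min(totalBlocks, numResidentBlocks);
//   radixSortBlocks<4, 0, true, false, true>
//       <<<gridSize, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
//       (keysOut, valuesOut, keysIn, valuesIn, numElements, totalBlocks);
//
// With loop=true each CTA strides blockId by gridDim.x until it reaches
// totalBlocks, so every data block is processed exactly once regardless of
// the grid size chosen.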

/** @brief Computes the number of keys of each radix in each block and stores the offsets.
 *
 * Given an array with blocks sorted according to a 4-bit radix group, each
 * block counts the number of keys that fall into each radix in the group, and
 * finds the starting offset of each radix in the block. It then writes the radix
 * counts to the counters array, and the starting offsets to the blockOffsets array.
 *
 * Template parameters are used to generate efficient code for various special cases.
 * For example, we have to handle arrays that are a multiple of the block size
 * (fullBlocks) differently than arrays that are not. "loop" is used when persistent
 * CTAs are used.
 *
 * By persistent CTAs we mean that we launch only as many thread blocks as can
 * be resident in the GPU and no more, rather than launching as many threads as
 * we have elements. Persistent CTAs loop over blocks of elements until all work
 * is complete. This can be faster in some cases. In our tests it is faster
 * for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
 *
 * @param[in]  keys         Input keys
 * @param[out] counters     Radix count for each block
 * @param[out] blockOffsets The offset address for each block
 * @param[in]  numElements  Total number of elements
 * @param[in]  totalBlocks  Total number of blocks
 **/
template<uint startbit, bool fullBlocks, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
findRadixOffsets(uint2 *keys,
                 uint  *counters,
                 uint  *blockOffsets,
                 uint   numElements,
                 uint   totalBlocks)
{
    extern __shared__ uint sRadix1[];
    __shared__ uint sStartPointers[16];

    uint blockId = blockIdx.x;

    while (!loop || blockId < totalBlocks)
    {
        uint2 radix2;

        uint i = blockId * blockDim.x + threadIdx.x;

        // handle non-full last block if array is not multiple of 1024 numElements
        if(!fullBlocks && ((i + 1) << 1 ) > numElements )
        {
            // handle uint1 rather than uint2 for non-full blocks
            uint *keys1 = (uint*)keys;
            uint j = i << 1;

            radix2.x = (j < numElements) ? keys1[j] : UINT_MAX;
            j++;
            radix2.y = (j < numElements) ? keys1[j] : UINT_MAX;
        }
        else
        {
            radix2 = keys[i];
        }

        sRadix1[2 * threadIdx.x]     = (radix2.x >> startbit) & 0xF;
        sRadix1[2 * threadIdx.x + 1] = (radix2.y >> startbit) & 0xF;

        // Finds the positions where the sRadix1 entries differ and stores the
        // start index for each radix.
        if(threadIdx.x < 16)
        {
            sStartPointers[threadIdx.x] = 0;
        }
        __syncthreads();

        if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) )
        {
            sStartPointers[sRadix1[threadIdx.x]] = threadIdx.x;
        }
        if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1])
        {
            sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE]] = threadIdx.x + SORT_CTA_SIZE;
        }
        __syncthreads();

        if(threadIdx.x < 16)
        {
            blockOffsets[blockId*16 + threadIdx.x] = sStartPointers[threadIdx.x];
        }
        __syncthreads();

        // Compute the sizes of each radix run in the block.
        if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) )
        {
            sStartPointers[sRadix1[threadIdx.x - 1]] =
                threadIdx.x - sStartPointers[sRadix1[threadIdx.x - 1]];
        }
        if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1] )
        {
            sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]] =
                threadIdx.x + SORT_CTA_SIZE - sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]];
        }

        if(threadIdx.x == SORT_CTA_SIZE - 1)
        {
            sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]] =
                2 * SORT_CTA_SIZE - sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]];
        }
        __syncthreads();

        if(threadIdx.x < 16)
        {
            counters[threadIdx.x * totalBlocks + blockId] =
                sStartPointers[threadIdx.x];
        }

        if (loop)
            blockId += gridDim.x;
        else
            break;
    }
}
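
// Illustration (not in the original file): a sequential reference for what
// findRadixOffsets produces per data block, assuming the block's keys are
// already sorted on the current 4-bit digit. For each radix value it records
// the start of that digit's run and the run's length.
static void findRadixOffsetsReference(const unsigned int* blockKeys,
                                      unsigned int n,        // keys in block
                                      unsigned int startbit,
                                      unsigned int offsets[16],
                                      unsigned int counts[16])
{
    for (int r = 0; r < 16; ++r) { offsets[r] = 0; counts[r] = 0; }
    for (unsigned int i = 0; i < n; ++i) {
        unsigned int radix = (blockKeys[i] >> startbit) & 0xF;
        if (counts[radix] == 0)
            offsets[radix] = i;  // first occurrence = start of this digit's run
        counts[radix]++;
    }
}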

/** @brief Reorders data in the global array.
 *
 * reorderData shuffles data in the array globally after the radix
 * offsets have been found. On compute version 1.1 and earlier GPUs, this code depends
 * on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
 *
 * On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
 * that all writes are coalesced using extra work in the kernel. On later
 * GPUs coalescing rules have been relaxed, so this extra overhead hurts
 * performance. On these GPUs we set manualCoalesce=false and directly store
 * the results.
 *
 * Template parameters are used to generate efficient code for various special cases.
 * For example, we have to handle arrays that are a multiple of the block size
 * (fullBlocks) differently than arrays that are not. "loop" is used when persistent
 * CTAs are used.
 *
 * By persistent CTAs we mean that we launch only as many thread blocks as can
 * be resident in the GPU and no more, rather than launching as many threads as
 * we have elements. Persistent CTAs loop over blocks of elements until all work
 * is complete. This can be faster in some cases. In our tests it is faster
 * for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
 *
 * @param[out] outKeys      Output of sorted keys
 * @param[out] outValues    Output of associated values
 * @param[in]  keys         Input of unsorted keys in GPU
 * @param[in]  values       Input of associated input values
 * @param[in]  blockOffsets The offset address for each block
 * @param[in]  offsets      Address of each radix within each block
 * @param[in]  sizes        Number of elements in a block
 * @param[in]  numElements  Total number of elements
 * @param[in]  totalBlocks  Total number of data blocks to process
 *
 * @todo Args that are const below should be prototyped as const
 **/
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
reorderData(uint  *outKeys,
            uint  *outValues,
            uint2 *keys,
            uint2 *values,
            uint  *blockOffsets,
            uint  *offsets,
            uint  *sizes,
            uint   numElements,
            uint   totalBlocks)
{
    __shared__ uint2 sKeys2[SORT_CTA_SIZE];
    __shared__ uint2 sValues2[SORT_CTA_SIZE];
    __shared__ uint  sOffsets[16];
    __shared__ uint  sBlockOffsets[16];

    uint *sKeys1   = (uint*)sKeys2;
    uint *sValues1 = (uint*)sValues2;

    uint blockId = blockIdx.x;

    while (!loop || blockId < totalBlocks)
    {
        uint i = blockId * blockDim.x + threadIdx.x;

        // handle non-full last block if array is not multiple of 1024 numElements
        if(!fullBlocks && (((i + 1) << 1) > numElements))
        {
            uint *keys1   = (uint*)keys;
            uint *values1 = (uint*)values;
            uint j = i << 1;

            sKeys1[threadIdx.x << 1]   = (j < numElements) ? keys1[j]   : UINT_MAX;
            sValues1[threadIdx.x << 1] = (j < numElements) ? values1[j] : UINT_MAX;
            j++;
            sKeys1[(threadIdx.x << 1) + 1]   = (j < numElements) ? keys1[j]   : UINT_MAX;
            sValues1[(threadIdx.x << 1) + 1] = (j < numElements) ? values1[j] : UINT_MAX;
        }
        else
        {
            sKeys2[threadIdx.x]   = keys[i];
            sValues2[threadIdx.x] = values[i];
        }

        if (!manualCoalesce)
        {
            if(threadIdx.x < 16)
            {
                sOffsets[threadIdx.x]      = offsets[threadIdx.x * totalBlocks + blockId];
                sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
            }
            __syncthreads();

            uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF;
            uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix];

            if (fullBlocks || globalOffset < numElements)
            {
                outKeys[globalOffset]   = floatUnflip<unflip>(sKeys1[threadIdx.x]);
                outValues[globalOffset] = sValues1[threadIdx.x];
            }

            radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF;
            globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix];

            if (fullBlocks || globalOffset < numElements)
            {
                outKeys[globalOffset]   = floatUnflip<unflip>(sKeys1[threadIdx.x + SORT_CTA_SIZE]);
                outValues[globalOffset] = sValues1[threadIdx.x + SORT_CTA_SIZE];
            }
        }
        else
        {
            __shared__ uint sSizes[16];

            if(threadIdx.x < 16)
            {
                sOffsets[threadIdx.x]      = offsets[threadIdx.x * totalBlocks + blockId];
                sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
                sSizes[threadIdx.x]        = sizes[threadIdx.x * totalBlocks + blockId];
            }
            __syncthreads();

            // 1 half-warp is responsible for writing out all values for 1 radix.
            // Loops if there are more than 16 values to be written out.
            // All start indices are rounded down to the nearest multiple of 16, and
            // all end indices are rounded up to the nearest multiple of 16.
            // Thus it can do extra work if the start and end indices are not multiples of 16.
            // This is bounded by a factor of 2 (it can do 2X more work at most).

            const uint halfWarpID     = threadIdx.x >> 4;

            const uint halfWarpOffset = threadIdx.x & 0xF;
            const uint leadingInvalid = sOffsets[halfWarpID] & 0xF;

            uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0;
            uint endPos   = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 -
                            ((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF);
            uint numIterations = endPos - startPos;

            uint outOffset = startPos + halfWarpOffset;
            uint inOffset  = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset;

            for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16)
            {
                if( (outOffset >= sOffsets[halfWarpID]) &&
                    (inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID]))
                {
                    if(blockId < totalBlocks - 1 || outOffset < numElements)
                    {
                        outKeys[outOffset]   = floatUnflip<unflip>(sKeys1[inOffset]);
                        outValues[outOffset] = sValues1[inOffset];
                    }
                }
            }
        }

        if (loop)
        {
            blockId += gridDim.x;
            __syncthreads();
        }
        else
            break;
    }
}
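
// Illustration (not in the original file): a worked example of the
// multiple-of-16 rounding used by the manualCoalesce path above. Suppose a
// radix run starts at global offset 21 and contains 10 elements (so it ends
// just before offset 31):
//
//   startPos = 21 & 0xFFFFFFF0                      = 16  (round start down)
//   endPos   = (21 + 10) + 15 - ((21 + 10 - 1) & 0xF)
//            = 31 + 15 - 14                         = 32  (round end up)
//
// The half-warp therefore issues writes covering [16, 32), one aligned
// 16-element segment per iteration of the loop; the guards inside the loop
// mask off the positions before 21 and past 30, trading at most 2X extra
// (masked) work for fully coalesced stores on compute 1.1 hardware.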

/** @brief Sorts all blocks of data independently in shared memory.
 *  Each thread block (CTA) sorts one block of 4*CTA_SIZE elements.
 *
 * The radix sort is done in two stages. This stage calls radixSortBlock on each
 * block independently, sorting on the basis of bits (startbit) -> (startbit + nbits).
 *
 * Template parameters are used to generate efficient code for various special cases.
 * For example, we have to handle arrays that are a multiple of the block size (fullBlocks)
 * differently than arrays that are not. "flip" is used to only compile in the
 * float flip code when float keys are used. "loop" is used when persistent CTAs
 * are used.
 *
 * By persistent CTAs we mean that we launch only as many thread blocks as can
 * be resident in the GPU and no more, rather than launching as many threads as
 * we have elements. Persistent CTAs loop over blocks of elements until all work
 * is complete. This can be faster in some cases. In our tests it is faster
 * for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
 *
 * @param[out] keysOut     Output of sorted keys in GPU main memory
 * @param[in]  keysIn      Input of unsorted keys in GPU main memory
 * @param[in]  numElements Total number of elements to sort
 * @param[in]  totalBlocks Total number of blocks to sort
 *
 */
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
radixSortBlocksKeysOnly(uint4* keysOut, uint4* keysIn, uint numElements, uint totalBlocks)
{
    extern __shared__ uint4 sMem[];

    uint4 key;

    uint blockId = blockIdx.x;

    while (!loop || blockId < totalBlocks)
    {
        uint i = blockId * blockDim.x + threadIdx.x;
        uint idx = i << 2;

        // handle non-full last block if array is not multiple of 1024 numElements
        if (!fullBlocks && idx+3 >= numElements)
        {
            if (idx >= numElements)
            {
                key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
            }
            else
            {
                // for non-full block, we handle uint1 values instead of uint4
                uint *keys1 = (uint*)keysIn;

                key.x = (idx   < numElements) ? floatFlip<flip>(keys1[idx])   : UINT_MAX;
                key.y = (idx+1 < numElements) ? floatFlip<flip>(keys1[idx+1]) : UINT_MAX;
                key.z = (idx+2 < numElements) ? floatFlip<flip>(keys1[idx+2]) : UINT_MAX;
                key.w = UINT_MAX;
            }
        }
        else
        {
            key = keysIn[i];
            if (flip)
            {
                key.x = floatFlip<flip>(key.x);
                key.y = floatFlip<flip>(key.y);
                key.z = floatFlip<flip>(key.z);
                key.w = floatFlip<flip>(key.w);
            }
        }
        __syncthreads();
        radixSortBlockKeysOnly<nbits, startbit>(key);

        // handle non-full last block if array is not multiple of 1024 numElements
        if(!fullBlocks && idx+3 >= numElements)
        {
            if (idx < numElements)
            {
                // for non-full block, we handle uint1 values instead of uint4
                uint *keys1 = (uint*)keysOut;

                keys1[idx] = key.x;

                if (idx + 1 < numElements)
                {
                    keys1[idx + 1] = key.y;

                    if (idx + 2 < numElements)
                    {
                        keys1[idx + 2] = key.z;
                    }
                }
            }
        }
        else
        {
            keysOut[i] = key;
        }

        if (loop)
            blockId += gridDim.x;
        else
            break;
    }
}

/** @brief Reorders data in the global array.
 *
 * reorderDataKeysOnly shuffles data in the array globally after the radix offsets
 * have been found. On compute version 1.1 and earlier GPUs, this code depends
 * on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
 *
 * On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
 * that all writes are coalesced using extra work in the kernel. On later
 * GPUs coalescing rules have been relaxed, so this extra overhead hurts
 * performance. On these GPUs we set manualCoalesce=false and directly store
 * the results.
 *
 * Template parameters are used to generate efficient code for various special cases.
 * For example, we have to handle arrays that are a multiple of the block size
 * (fullBlocks) differently than arrays that are not. "loop" is used when persistent
 * CTAs are used.
 *
 * By persistent CTAs we mean that we launch only as many thread blocks as can
 * be resident in the GPU and no more, rather than launching as many threads as
 * we have elements. Persistent CTAs loop over blocks of elements until all work
 * is complete. This can be faster in some cases. In our tests it is faster
 * for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
 *
 * @param[out] outKeys      Output result of reorderDataKeysOnly()
 * @param[in]  keys         Keys to be reordered
 * @param[in]  blockOffsets Start offset for each block
 * @param[in]  offsets      Offset of each radix within each block
 * @param[in]  sizes        Number of elements in a block
 * @param[in]  numElements  Total number of elements
 * @param[in]  totalBlocks  Total number of blocks
 */
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
reorderDataKeysOnly(uint  *outKeys,
                    uint2 *keys,
                    uint  *blockOffsets,
                    uint  *offsets,
                    uint  *sizes,
                    uint   numElements,
                    uint   totalBlocks)
{
    __shared__ uint2 sKeys2[SORT_CTA_SIZE];
    __shared__ uint  sOffsets[16];
    __shared__ uint  sBlockOffsets[16];

    uint *sKeys1 = (uint*)sKeys2;

    uint blockId = blockIdx.x;

    while (!loop || blockId < totalBlocks)
    {
        uint i = blockId * blockDim.x + threadIdx.x;

        // handle non-full last block if array is not multiple of 1024 numElements
        if(!fullBlocks && (((i + 1) << 1) > numElements))
        {
            uint *keys1 = (uint*)keys;
            uint j = i << 1;

            sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX;
            j++;
            sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX;
        }
        else
        {
            sKeys2[threadIdx.x] = keys[i];
        }

        if (!manualCoalesce)
        {
            if(threadIdx.x < 16)
            {
                sOffsets[threadIdx.x]      = offsets[threadIdx.x * totalBlocks + blockId];
                sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
            }
            __syncthreads();

            uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF;
            uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix];

            if (fullBlocks || globalOffset < numElements)
            {
                outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x]);
            }

            radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF;
            globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix];

            if (fullBlocks || globalOffset < numElements)
            {
                outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x + SORT_CTA_SIZE]);
            }
        }
        else
        {
            __shared__ uint sSizes[16];

            if(threadIdx.x < 16)
            {
                sOffsets[threadIdx.x]      = offsets[threadIdx.x * totalBlocks + blockId];
                sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
                sSizes[threadIdx.x]        = sizes[threadIdx.x * totalBlocks + blockId];
            }
            __syncthreads();

            // 1 half-warp is responsible for writing out all values for 1 radix.
            // Loops if there are more than 16 values to be written out.
            // All start indices are rounded down to the nearest multiple of 16, and
            // all end indices are rounded up to the nearest multiple of 16.
            // Thus it can do extra work if the start and end indices are not multiples of 16.
            // This is bounded by a factor of 2 (it can do 2X more work at most).

            const uint halfWarpID     = threadIdx.x >> 4;

            const uint halfWarpOffset = threadIdx.x & 0xF;
            const uint leadingInvalid = sOffsets[halfWarpID] & 0xF;

            uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0;
            uint endPos   = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 -
                            ((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF);
            uint numIterations = endPos - startPos;

            uint outOffset = startPos + halfWarpOffset;
            uint inOffset  = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset;

            for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16)
            {
                if( (outOffset >= sOffsets[halfWarpID]) &&
                    (inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID]))
                {
                    if(blockId < totalBlocks - 1 || outOffset < numElements)
                    {
                        outKeys[outOffset] = floatUnflip<unflip>(sKeys1[inOffset]);
                    }
                }
            }
        }

        if (loop)
        {
            blockId += gridDim.x;
            __syncthreads();
        }
        else
            break;
    }
}

/** @} */ // end radixsort functions
/** @} */ // end cudpp_kernel

@ -0,0 +1,113 @@

// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * scan_kernel.cu
 *
 * @brief CUDPP kernel-level scan routines
 */

/** \defgroup cudpp_kernel CUDPP Kernel-Level API
 * The CUDPP Kernel-Level API contains functions that run on the GPU
 * device across a grid of Cooperative Thread Arrays (CTA, aka Thread
 * Block). These kernels are declared \c __global__ so that they
 * must be invoked from host (CPU) code. They generally invoke GPU
 * \c __device__ routines in the CUDPP \link cudpp_cta CTA-Level API\endlink.
 * Kernel-Level API functions are used by CUDPP
 * \link cudpp_app Application-Level\endlink functions to implement their
 * functionality.
 * @{
 */

/** @name Scan Functions
 * @{
 */

#include <cudpp_globals.h>
#include "cta/scan_cta.cu"
#include "sharedmem.h"

/**
 * @brief Main scan kernel
 *
 * This __global__ device function performs one level of a multiblock scan on
 * an arbitrary-dimensioned array in \a d_in, returning the result in \a d_out
 * (which may point to the same array). The same function may be used for
 * single or multi-row scans. To perform a multirow scan, pass the width of
 * each row of the input array (in elements) in \a dataRowPitch, and the width of
 * the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke
 * with a thread block grid with height greater than 1.
 *
 * This function performs one level of a recursive, multiblock scan. At the
 * app level, this function is called by cudppScan and cudppMultiScan and used
 * in combination with vectorAddUniform4() to produce a complete scan.
 *
 * Template parameter \a T is the datatype of the array to be scanned.
 * Template parameter \a traits is the ScanTraits struct containing
 * compile-time options for the scan, such as whether it is forward or
 * backward, exclusive or inclusive, multi- or single-row, etc.
 *
 * @param[out] d_out The output (scanned) array
 * @param[in]  d_in The input array to be scanned
 * @param[out] d_blockSums The array of per-block sums
 * @param[in]  numElements The number of elements to scan
 * @param[in]  dataRowPitch The width of each row of \a d_in in elements
 *             (for multi-row scans)
 * @param[in]  blockSumRowPitch The width of each row of \a d_blockSums in elements
 *             (for multi-row scans)
 */
template<class T, class traits>
__global__ void scan4(T            *d_out,
                      const T      *d_in,
                      T            *d_blockSums,
                      int           numElements,
                      unsigned int  dataRowPitch,
                      unsigned int  blockSumRowPitch)
{
    SharedMemory<T> smem;
    T* temp = smem.getPointer();

    int devOffset, ai, bi, aiDev, biDev;
    T threadScan0[4], threadScan1[4];

    unsigned int blockN = numElements;
    unsigned int blockSumIndex = blockIdx.x;

    if (traits::isMultiRow())
    {
        //int width = __mul24(gridDim.x, blockDim.x) << 1;
        int yIndex = __umul24(blockDim.y, blockIdx.y) + threadIdx.y;
        devOffset = __umul24(dataRowPitch, yIndex);
        blockN += (devOffset << 2);
        devOffset += __umul24(blockIdx.x, blockDim.x << 1);
        blockSumIndex += __umul24(blockSumRowPitch << 2, yIndex);
    }
    else
    {
        devOffset = __umul24(blockIdx.x, (blockDim.x << 1));
    }

    // load data into shared memory
    loadSharedChunkFromMem4<T, traits>
        (temp, threadScan0, threadScan1, d_in,
         blockN, devOffset, ai, bi, aiDev, biDev);

    scanCTA<T, traits>(temp, d_blockSums, blockSumIndex);

    // write results to device memory
    storeSharedChunkToMem4<T, traits>
        (d_out, threadScan0, threadScan1, temp,
         blockN, devOffset, ai, bi, aiDev, biDev);

}
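
// Illustration (not in the original file): a hedged sketch of the recursion
// the comment above describes. scanArrayRecursive, CTA_SIZE, and
// sharedMemSize are hypothetical names for this sketch; the real drivers are
// cudppScan/cudppMultiScan at the app level.
//
//   void scanArrayRecursive(T* out, const T* in, T** blockSums,
//                           int numElements, int level)
//   {
//       int numBlocks = max(1, numElements / (4 * CTA_SIZE));
//       // one level of the multiblock scan; per-block totals land in
//       // blockSums[level]
//       scan4<T, traits><<<numBlocks, CTA_SIZE, sharedMemSize>>>
//           (out, in, blockSums[level], numElements, 0, 0);
//       if (numBlocks > 1) {
//           // scan the block sums themselves, then add each scanned block
//           // sum back into its block to stitch the partial scans together
//           scanArrayRecursive(blockSums[level], blockSums[level],
//                              blockSums, numBlocks, level + 1);
//           vectorAddUniform4<T, CUDPP_ADD, 8><<<numBlocks, CTA_SIZE>>>
//               (out, blockSums[level], numElements, 0, 0, 0, 0);
//       }
//   }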

/** @} */ // end scan functions
/** @} */ // end cudpp_kernel

@ -0,0 +1,469 @@

// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * vector_kernel.cu
 *
 * @brief CUDA kernel methods for basic operations on vectors.
 *
 * CUDA kernel methods for basic operations on vectors.
 *
 * Examples:
 * - vectorAddConstant(): d_vector + constant
 * - vectorAddUniform():  d_vector + uniform (per-block constants)
 * - vectorAddVector():   d_vector + d_vector
 */

// MJH: these functions assume there are 2N elements for N threads.
// Is this always going to be a good idea? There may be cases where
// we have as many threads as elements, but for large problems
// we are probably limited by max CTA size for simple kernels like
// this, so we should process multiple elements per thread.
// We may want to extend these with looping versions that process
// many elements per thread.

#include "cudpp_util.h"
#include "sharedmem.h"
#include "cudpp.h"

/** \addtogroup cudpp_kernel
 * @{
 */

/** @name Vector Functions
 * CUDA kernel methods for basic operations on vectors.
 * @{
 */

/** @brief Adds a constant value to all values in the input d_vector
 *
 * Each thread adds two pairs of elements.
 * @todo Test this function -- it is currently not yet used.
 *
 * @param[in,out] d_vector The array of elements to be modified
 * @param[in] constant  The constant value to be added to elements of
 *                      \a d_vector
 * @param[in] n The number of elements in the d_vector to be modified
 * @param[in] baseIndex An optional offset to the beginning of the
 *                      elements in the input array to be processed
 */
template <class T>
__global__ void vectorAddConstant(T   *d_vector,
                                  T    constant,
                                  int  n,
                                  int  baseIndex)
{
    // Compute this thread's output address
    unsigned int address = baseIndex + threadIdx.x +
        __mul24(blockIdx.x, (blockDim.x << 1));

    // note two adds per thread: one in first half of the block, one in last
    d_vector[address] += constant;
    d_vector[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * constant;
}

/** @brief Add a uniform value to each data element of an array
 *
 * This function reads one value per CTA from \a d_uniforms into shared
 * memory and adds that value to all values "owned" by the CTA in \a
 * d_vector. Each thread adds two pairs of values.
 *
 * @param[out] d_vector The d_vector whose values will have the uniform added
 * @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] numElements The number of elements in \a d_vector to process
 * @param[in] blockOffset an optional offset to the beginning of this block's
 *            data.
 * @param[in] baseIndex an optional offset to the beginning of the array
 *            within \a d_vector.
 */
template <class T>
__global__ void vectorAddUniform(T       *d_vector,
                                 const T *d_uniforms,
                                 int      numElements,
                                 int      blockOffset,
                                 int      baseIndex)
{
    __shared__ T uni;
    // Get this block's uniform value from the uniform array in device memory.
    // We store it in shared memory so that the hardware's shared memory
    // broadcast capability can be used to share it among all threads in each
    // warp in a single cycle.
    if (threadIdx.x == 0)
    {
        uni = d_uniforms[blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset];
    }

    // Compute this thread's output address
    int width = __mul24(gridDim.x, (blockDim.x << 1));

    unsigned int address = baseIndex + __mul24(width, blockIdx.y)
        + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 1));

    __syncthreads();

    // note two adds per thread: one in first half of the block, one in last
    d_vector[address] += uni;
    if (threadIdx.x + blockDim.x < numElements) d_vector[address + blockDim.x] += uni;
}


/** @brief Add a uniform value to each data element of an array (vec4 version)
 *
 * This function reads one value per CTA from \a d_uniforms into shared
 * memory and adds that value to all values "owned" by the CTA in \a d_vector.
 * Each thread adds the uniform value to eight values in \a d_vector.
 *
 * @param[out] d_vector The d_vector whose values will have the uniform added
 * @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] numElements The number of elements in \a d_vector to process
 * @param[in] vectorRowPitch For 2D arrays, the pitch (in elements) of the
 *            rows of \a d_vector.
 * @param[in] uniformRowPitch For 2D arrays, the pitch (in elements) of the
 *            rows of \a d_uniforms.
 * @param[in] blockOffset an optional offset to the beginning of this block's
 *            data.
 * @param[in] baseIndex an optional offset to the beginning of the array
 *            within \a d_vector.
 */
template <class T, CUDPPOperator op, int elementsPerThread>
__global__ void vectorAddUniform4(T       *d_vector,
                                  const T *d_uniforms,
                                  int      numElements,
                                  int      vectorRowPitch,  // width of input array in elements
                                  int      uniformRowPitch, // width of uniform array in elements
                                  int      blockOffset,
                                  int      baseIndex)
{
    __shared__ T uni;
    // Get this block's uniform value from the uniform array in device memory.
    // We store it in shared memory so that the hardware's shared memory
    // broadcast capability can be used to share it among all threads in each
    // warp in a single cycle.
    if (threadIdx.x == 0)
    {
        uni = d_uniforms[blockIdx.x + __umul24(uniformRowPitch, blockIdx.y) + blockOffset];
    }

    // Compute this thread's output address
    //int width = __mul24(gridDim.x,(blockDim.x << 1));

    unsigned int address = baseIndex + __umul24(vectorRowPitch, blockIdx.y)
        + threadIdx.x + __umul24(blockIdx.x, (blockDim.x * elementsPerThread));
    numElements += __umul24(vectorRowPitch, blockIdx.y);

    __syncthreads();

    switch (op)
    {
    case CUDPP_ADD:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] += uni;
            address += blockDim.x;
        }
        break;

    case CUDPP_MULTIPLY:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] *= uni;
            address += blockDim.x;
        }
        break;

    case CUDPP_MAX:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] = max(d_vector[address], uni);
            address += blockDim.x;
        }
        break;

    case CUDPP_MIN:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] = min(d_vector[address], uni);
            address += blockDim.x;
        }
        break;
    default:
        break;
    }
}
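
// Illustration (not in the original file): because the operator is a template
// parameter, the switch above is resolved at compile time and dead branches
// are eliminated. A hedged sketch of instantiating the CUDPP_ADD variant for
// the single-row case (pitches and offsets zero); numBlocks and CTA_SIZE are
// hypothetical launch values:
//
//   vectorAddUniform4<float, CUDPP_ADD, 8><<<numBlocks, CTA_SIZE>>>
//       (d_vector, d_blockSums, numElements, 0, 0, 0, 0);
//
// Each of the numBlocks CTAs broadcasts one element of d_blockSums to its
// threads, and each thread applies it to up to 8 strided elements.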
|
||||
|
||||
/** @brief Adds together two vectors
|
||||
*
|
||||
* Each thread adds two pairs of elements.
|
||||
* @todo Test this function -- it is currently not yet used.
|
||||
*
|
||||
* @param[out] d_vectorA The left operand array and the result
|
||||
* @param[in] d_vectorB The right operand array
|
||||
* @param[in] numElements The number of elements in the vectors to be added.
|
||||
* @param[in] baseIndex An optional offset to the beginning of the
|
||||
* elements in the input arrays to be processed
|
||||
*/
|
||||
template <class T>
|
||||
__global__ void vectorAddVector(T *d_vectorA, // A += B
|
||||
const T *d_vectorB,
|
||||
int numElements,
|
||||
int baseIndex)
|
||||
{
|
||||
// Compute this thread's output address
|
||||
unsigned int address = baseIndex + threadIdx.x +
|
||||
__mul24(blockIdx.x, (blockDim.x << 1));
|
||||
|
||||
// note two adds per thread: one in first half of the block, one in last
|
||||
d_vectorA[address] += d_vectorB[address];
|
||||
d_vectorA[address + blockDim.x] +=
|
||||
(threadIdx.x + blockDim.x < numElements) * d_vectorB[address];
|
||||
}

/** @brief Add a uniform value to data elements of an array (vec4 version)
 *
 * This function reads one value per CTA from \a d_uniforms into shared
 * memory and adds that value to values "owned" by the CTA in \a d_vector.
 * The uniform value is added only to those values "owned" by the CTA which
 * have an index less than d_maxIndex. If d_maxIndex for that CTA is UINT_MAX
 * it adds the uniform to all values "owned" by the CTA.
 * Each thread adds the uniform value to eight values in \a d_vector.
 *
 * @param[out] d_vector The array whose values will have the uniform added
 * @param[in]  d_uniforms The array of uniform values (one per CTA)
 * @param[in]  d_maxIndices The array of maximum indices (one per CTA). This is
 *             the index up to which the uniform is added. If this is UINT_MAX
 *             the uniform is added to all elements of the CTA. This index is
 *             1-based.
 * @param[in]  numElements The number of elements in \a d_vector to process
 * @param[in]  blockOffset An optional offset to the beginning of this block's
 *             data.
 * @param[in]  baseIndex An optional offset to the beginning of the array
 *             within \a d_vector.
 */
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniform4(T *d_vector,
                                           const T *d_uniforms,
                                           const unsigned int *d_maxIndices,
                                           unsigned int numElements,
                                           int blockOffset,
                                           int baseIndex)
{
    __shared__ T uni[2];

    unsigned int blockAddress =
        blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;

    // Get this block's uniform value from the uniform array in device memory.
    // We store it in shared memory so that the hardware's shared memory
    // broadcast capability can be used to share it among all threads in each
    // warp in a single cycle.

    if (threadIdx.x == 0)
    {
        if (blockAddress > 0)
            uni[0] = d_uniforms[blockAddress-1];
        else
            uni[0] = Operator<T, oper>::identity();

        // Tacit assumption that T is four bytes wide
        uni[1] = (T)(d_maxIndices[blockAddress]);
    }

    // Compute this thread's output address
    int width = __mul24(gridDim.x, (blockDim.x << 1));

    unsigned int address = baseIndex + __mul24(width, blockIdx.y)
        + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));

    __syncthreads();

    unsigned int maxIndex = (unsigned int)(uni[1]);

    bool isLastBlock = (blockIdx.x == (gridDim.x-1));

    if (maxIndex < UINT_MAX)
    {
        // maxIndex is a 1-based index
        --maxIndex;
        bool leftLess  = address < maxIndex;
        bool rightLess = (address + 7 * blockDim.x) < maxIndex;

        if (leftLess)
        {
            if (rightLess)
            {
                for (unsigned int i = 0; i < 8; ++i)
                    d_vector[address + i * blockDim.x] =
                        Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
            }
            else
            {
                for (unsigned int i = 0; i < 8; ++i)
                {
                    if (address < maxIndex)
                        d_vector[address] =
                            Operator<T, oper>::op(d_vector[address], uni[0]);

                    address += blockDim.x;
                }
            }
        }
    }
    else
    {
        if (!isLastBlockFull && isLastBlock)
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                if (address < numElements)
                    d_vector[address] =
                        Operator<T, oper>::op(d_vector[address], uni[0]);

                address += blockDim.x;
            }
        }
        else
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                d_vector[address] =
                    Operator<T, oper>::op(d_vector[address], uni[0]);

                address += blockDim.x;
            }
        }
    }
}
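
// CPU reference for the semantics above (an illustrative sketch, not CUDPP
// code; segmentedAddUniformRef and `span` are hypothetical). It applies one
// uniform per block of `span` elements for the CUDPP_ADD case (identity 0),
// honoring the convention that a 1-based maxIndex m means "apply only to
// global indices below m-1", with UINT_MAX meaning "apply to all".
template <class T>
void segmentedAddUniformRef(T *v, const T *uniforms,
                            const unsigned int *maxIndices,
                            unsigned int n, unsigned int span)
{
    for (unsigned int b = 0; b * span < n; ++b)
    {
        T uni = (b > 0) ? uniforms[b-1] : T(0);  // identity of CUDPP_ADD
        unsigned int maxIndex = maxIndices[b];   // 1-based; UINT_MAX = all
        unsigned int end = b * span + span;
        if (end > n) end = n;
        if (maxIndex != UINT_MAX)
        {
            unsigned int m = (maxIndex > 0) ? maxIndex - 1 : 0;
            if (m < end) end = m;                // clamp to below maxIndex
        }
        for (unsigned int i = b * span; i < end; ++i)
            v[i] += uni;
    }
}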

/** @brief Add a uniform value to data elements of an array (vec4 version)
 *
 * This function reads one value per CTA from \a d_uniforms into shared
 * memory and adds that value to values "owned" by the CTA in \a d_vector.
 * The uniform value is added only to those values "owned" by the CTA which
 * have an index greater than d_minIndex. If d_minIndex for that CTA is 0
 * it adds the uniform to all values "owned" by the CTA.
 * Each thread adds the uniform value to eight values in \a d_vector.
 *
 * @param[out] d_vector The array whose values will have the uniform added
 * @param[in]  d_uniforms The array of uniform values (one per CTA)
 * @param[in]  d_minIndices The array of minimum indices (one per CTA). The
 *             uniform is added to the right of this index (that is, to every
 *             index that is greater than this index). If this is 0, the
 *             uniform is added to all elements of the CTA. The index is
 *             1-based so that 0 can unambiguously mean "no flag in this CTA":
 *             if 0 were a valid index, a flag on the first element of a CTA
 *             would be indistinguishable from the absence of a flag, so a
 *             flag on the first element is reported as index 1.
 * @param[in]  numElements The number of elements in \a d_vector to process
 * @param[in]  blockOffset An optional offset to the beginning of this block's
 *             data.
 * @param[in]  baseIndex An optional offset to the beginning of the array
 *             within \a d_vector.
 *
 */
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniformToRight4(T *d_vector,
                                                  const T *d_uniforms,
                                                  const unsigned int *d_minIndices,
                                                  unsigned int numElements,
                                                  int blockOffset,
                                                  int baseIndex)
{
    __shared__ T uni[2];

    unsigned int blockAddress =
        blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;

    // Get this block's uniform value from the uniform array in device memory.
    // We store it in shared memory so that the hardware's shared memory
    // broadcast capability can be used to share it among all threads in each
    // warp in a single cycle.

    if (threadIdx.x == 0)
    {
        // FIXME - blockAddress test here is incompatible with how it is
        // calculated above
        if (blockAddress < (gridDim.x-1))
            uni[0] = d_uniforms[blockAddress+1];
        else
            uni[0] = Operator<T, oper>::identity();

        // Tacit assumption that T is four bytes wide
        uni[1] = (T)(d_minIndices[blockAddress]);
    }

    // Compute this thread's output address
    int width = __mul24(gridDim.x, (blockDim.x << 1));

    unsigned int address = baseIndex + __mul24(width, blockIdx.y)
        + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));

    __syncthreads();

    unsigned int minIndex = (unsigned int)(uni[1]);

    bool isLastBlock = (blockIdx.x == (gridDim.x-1));

    if (minIndex > 0)
    {
        // minIndex is a 1-based index
        --minIndex;
        bool leftInRange  = address > minIndex;
        bool rightInRange = (address + 7 * blockDim.x) > minIndex;

        if (rightInRange)
        {
            if (leftInRange)
            {
                for (unsigned int i = 0; i < 8; ++i)
                    d_vector[address + i * blockDim.x] =
                        Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
            }
            else
            {
                for (unsigned int i = 0; i < 8; ++i)
                {
                    if (address > minIndex)
                        d_vector[address] =
                            Operator<T, oper>::op(d_vector[address], uni[0]);

                    address += blockDim.x;
                }
            }
        }
    }
    else
    {
        if (!isLastBlockFull && isLastBlock)
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                if (address < numElements)
                    d_vector[address] =
                        Operator<T, oper>::op(d_vector[address], uni[0]);

                address += blockDim.x;
            }
        }
        else
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                d_vector[address] =
                    Operator<T, oper>::op(d_vector[address], uni[0]);

                address += blockDim.x;
            }
        }
    }
}
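
// Illustrative launch-side selection of the isLastBlockFull template flag
// (a sketch under assumed launch conventions, not CUDPP code;
// launchSegmentedAddUniform4 and ctaSize are hypothetical). Resolving the
// flag at compile time removes the numElements bounds check from every block
// when the array is an exact multiple of the per-block span.
template <class T>
void launchSegmentedAddUniform4(T *d_v, const T *d_uni,
                                const unsigned int *d_maxIdx,
                                unsigned int n, unsigned int ctaSize)
{
    unsigned int span = 8 * ctaSize;            // eight elements per thread
    unsigned int blocks = (n + span - 1) / span;
    bool full = (n % span) == 0;
    if (full)
        vectorSegmentedAddUniform4<T, CUDPP_ADD, true>
            <<<blocks, ctaSize>>>(d_v, d_uni, d_maxIdx, n, 0, 0);
    else
        vectorSegmentedAddUniform4<T, CUDPP_ADD, false>
            <<<blocks, ctaSize>>>(d_v, d_uni, d_maxIdx, n, 0, 0);
}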

/** @} */ // end d_vector functions
/** @} */ // end cudpp_kernel

@@ -0,0 +1,25 @@
Copyright (c) 2007-2010 The Regents of the University of California, Davis
campus ("The Regents") and NVIDIA Corporation ("NVIDIA"). All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
* Neither the name of the The Regents, nor NVIDIA, nor the names of its
  contributors may be used to endorse or promote products derived from this
  software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@@ -0,0 +1,993 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * radixsort_app.cu
 *
 * @brief CUDPP application-level radix sorting routines
 */

/** @addtogroup cudpp_app
 * @{
 */

/** @name RadixSort Functions
 * @{
 */


#include "cudpp.h"
#include "cudpp_util.h"
#include "cudpp_radixsort.h"
#include "cudpp_scan.h"
#include "kernel/radixsort_kernel.cu"

#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <assert.h>

typedef unsigned int uint;

/** @brief Perform one step of the radix sort. Sorts by nbits key bits per
 * step, starting at startbit.
 *
 * Uses cudppScanDispatch() for the prefix sum of radix counters.
 *
 * @param[in,out] keys Keys to be sorted.
 * @param[in,out] values Associated values to be sorted (through keys).
 * @param[in] plan Configuration information for RadixSort.
 * @param[in] numElements Number of elements in the sort.
 **/
template<uint nbits, uint startbit, bool flip, bool unflip>
void radixSortStep(uint *keys,
                   uint *values,
                   const CUDPPRadixSortPlan *plan,
                   uint numElements)
{
    const uint eltsPerBlock = SORT_CTA_SIZE * 4;
    const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;

    bool fullBlocks = ((numElements % eltsPerBlock) == 0);
    uint numBlocks = (fullBlocks) ?
        (numElements / eltsPerBlock) :
        (numElements / eltsPerBlock + 1);
    uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
        (numElements / eltsPerBlock2) :
        (numElements / eltsPerBlock2 + 1);

    bool loop = numBlocks > 65535;
    uint blocks = loop ? 65535 : numBlocks;
    uint blocksFind = loop ? 65535 : numBlocks2;
    uint blocksReorder = loop ? 65535 : numBlocks2;

    uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0];

    bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);

    if (persist)
    {
        loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);

        blocks = numBlocks;
        blocksFind = numBlocks2;
        blocksReorder = numBlocks2;

        // Run an empty kernel -- this seems to reset some of the CTA scheduling hardware
        // on GT200, resulting in better scheduling and lower run times
        if (startbit > 0)
        {
            emptyKernel<<<numCTAs(emptyKernel), SORT_CTA_SIZE>>>();
        }
    }

    if (fullBlocks)
    {
        if (loop)
        {
            if (persist)
            {
                blocks = flip ? numCTAs(radixSortBlocks<4, 0, true, true, true>) :
                                numCTAs(radixSortBlocks<4, 0, true, false, true>);
            }

            radixSortBlocks<nbits, startbit, true, flip, true>
                <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
        }
        else
        {
            radixSortBlocks<nbits, startbit, true, flip, false>
                <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
        }
    }
    else
    {
        if (loop)
        {
            if (persist)
            {
                blocks = flip ? numCTAs(radixSortBlocks<4, 0, false, true, true>) :
                                numCTAs(radixSortBlocks<4, 0, false, false, true>);
            }

            radixSortBlocks<nbits, startbit, false, flip, true>
                <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
        }
        else
        {
            radixSortBlocks<nbits, startbit, false, flip, false>
                <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
        }
    }

    CUT_CHECK_ERROR("radixSortBlocks");

    if (fullBlocks)
    {
        if (loop)
        {
            if (persist)
            {
                blocksFind = numCTAs(findRadixOffsets<0, true, true>);
            }
            findRadixOffsets<startbit, true, true>
                <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
        }
        else
        {
            findRadixOffsets<startbit, true, false>
                <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
        }
    }
    else
    {
        if (loop)
        {
            if (persist)
            {
                blocksFind = numCTAs(findRadixOffsets<0, false, true>);
            }
            findRadixOffsets<startbit, false, true>
                <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
        }
        else
        {
            findRadixOffsets<startbit, false, false>
                <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
        }
    }

    CUT_CHECK_ERROR("findRadixOffsets");

    cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);

    if (fullBlocks)
    {
        if (plan->m_bManualCoalesce)
        {
            if (loop)
            {
                if (persist)
                {
                    blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) :
                                             numCTAs(reorderData<0, true, true, false, true>);
                }
                reorderData<startbit, true, true, unflip, true>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
            }
            else
            {
                reorderData<startbit, true, true, unflip, false>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
            }
        }
        else
        {
            if (loop)
            {
                if (persist)
                {
                    blocksReorder = unflip ? numCTAs(reorderData<0, true, false, true, true>) :
                                             numCTAs(reorderData<0, true, false, false, true>);
                }
                reorderData<startbit, true, false, unflip, true>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
            }
            else
            {
                reorderData<startbit, true, false, unflip, false>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
            }
        }
    }
    else
    {
        if (plan->m_bManualCoalesce)
        {
            if (loop)
            {
                if (persist)
                {
                    blocksReorder = unflip ?
                        numCTAs(reorderData<0, false, true, true, true>) :
                        numCTAs(reorderData<0, false, true, false, true>);
                }
                reorderData<startbit, false, true, unflip, true>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
            }
            else
            {
                reorderData<startbit, false, true, unflip, false>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
            }
        }
        else
        {
            if (loop)
            {
                if (persist)
                {
                    blocksReorder = unflip ?
                        numCTAs(reorderData<0, false, false, true, true>) :
                        numCTAs(reorderData<0, false, false, false, true>);
                }
                reorderData<startbit, false, false, unflip, true>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
            }
            else
            {
                reorderData<startbit, false, false, unflip, false>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
                     plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
            }
        }
    }

    CUT_CHECK_ERROR("radixSortStep");
}
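
// CPU reference for what one radixSortStep pass computes (an illustrative
// sketch, not CUDPP code; radixSortStepRef is a hypothetical name): histogram
// the 16 possible 4-bit digits, take an exclusive prefix sum of the counts
// (the role played by findRadixOffsets + cudppScanDispatch above), then
// stably scatter keys and values to their new positions (reorderData).
static void radixSortStepRef(uint *keys, uint *values, uint *tmpK, uint *tmpV,
                             uint numElements, uint startbit)
{
    uint count[16] = {0}, offset[16];
    for (uint i = 0; i < numElements; ++i)      // histogram of 4-bit digits
        ++count[(keys[i] >> startbit) & 0xF];
    offset[0] = 0;                              // exclusive scan of counts
    for (uint d = 1; d < 16; ++d)
        offset[d] = offset[d-1] + count[d-1];
    for (uint i = 0; i < numElements; ++i)      // stable scatter
    {
        uint d = (keys[i] >> startbit) & 0xF;
        tmpK[offset[d]] = keys[i];
        tmpV[offset[d]] = values[i];
        ++offset[d];
    }
    for (uint i = 0; i < numElements; ++i)      // copy back in place
    {
        keys[i] = tmpK[i];
        values[i] = tmpV[i];
    }
}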

/**
 * @brief Single-block optimization for sorts of fewer than 4 * CTA_SIZE elements
 *
 * @param[in,out] keys Keys to be sorted.
 * @param[in,out] values Associated values to be sorted (through keys).
 * @param numElements Number of elements in the sort.
 **/
template <bool flip>
void radixSortSingleBlock(uint *keys,
                          uint *values,
                          uint numElements)
{
    bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
    if (fullBlocks)
    {
        radixSortBlocks<32, 0, true, flip, false>
            <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
            ((uint4*)keys, (uint4*)values,
             (uint4*)keys, (uint4*)values,
             numElements, 0);
    }
    else
    {
        radixSortBlocks<32, 0, false, flip, false>
            <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
            ((uint4*)keys, (uint4*)values,
             (uint4*)keys, (uint4*)values,
             numElements, 0);
    }

    if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);

    CUT_CHECK_ERROR("radixSortSingleBlock");
}

/**
 * @brief Main radix sort function
 *
 * Main radix sort function. Sorts in place in the keys and values arrays,
 * but uses the other device arrays as temporary storage. All pointer
 * parameters are device pointers. Uses cudppScan() for the prefix sum of
 * radix counters.
 *
 * @param[in,out] keys Keys to be sorted.
 * @param[in,out] values Associated values to be sorted (through keys).
 * @param[in] plan Configuration information for RadixSort.
 * @param[in] numElements Number of elements in the sort.
 * @param[in] flipBits Set to true if the key datatype is float, so that
 *            negative keys sort correctly via bit flipping.
 * @param[in] keyBits Number of interesting bits in the key
 **/
void radixSort(uint *keys,
               uint *values,
               const CUDPPRadixSortPlan *plan,
               size_t numElements,
               bool flipBits,
               int keyBits)
{
    if (numElements <= WARP_SIZE)
    {
        if (flipBits)
            radixSortSingleWarp<true><<<1, numElements>>>
                (keys, values, numElements);
        else
            radixSortSingleWarp<false><<<1, numElements>>>
                (keys, values, numElements);

        CUT_CHECK_ERROR("radixSortSingleWarp");
        return;
    }
#ifdef __DEVICE_EMULATION__
    printf("bits: %d\n", keyBits);
#endif

    if (numElements <= SORT_CTA_SIZE * 4)
    {
        if (flipBits)
            radixSortSingleBlock<true>(keys, values, numElements);
        else
            radixSortSingleBlock<false>(keys, values, numElements);
        return;
    }

    // flip float bits on the first pass, unflip on the last pass
    if (flipBits)
    {
        radixSortStep<4, 0, true, false>
            (keys, values, plan, numElements);
    }
    else
    {
        radixSortStep<4, 0, false, false>
            (keys, values, plan, numElements);
    }

    if (keyBits > 4)
    {
        radixSortStep<4, 4, false, false>
            (keys, values, plan, numElements);
    }
    if (keyBits > 8)
    {
        radixSortStep<4, 8, false, false>
            (keys, values, plan, numElements);
    }
    if (keyBits > 12)
    {
        radixSortStep<4, 12, false, false>
            (keys, values, plan, numElements);
    }
    if (keyBits > 16)
    {
        radixSortStep<4, 16, false, false>
            (keys, values, plan, numElements);
    }
    if (keyBits > 20)
    {
        radixSortStep<4, 20, false, false>
            (keys, values, plan, numElements);
    }
    if (keyBits > 24)
    {
        radixSortStep<4, 24, false, false>
            (keys, values, plan, numElements);
    }
    if (keyBits > 28)
    {
        if (flipBits) // last pass
        {
            radixSortStep<4, 28, false, true>
                (keys, values, plan, numElements);
        }
        else
        {
            radixSortStep<4, 28, false, false>
                (keys, values, plan, numElements);
        }
    }
}
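
// The flip/unflip template flags refer to the standard float-radix trick:
// IEEE-754 floats order correctly as unsigned integers once the sign bit is
// flipped for positive values and all bits are flipped for negative values.
// A minimal sketch of both directions (floatFlipRef/floatUnflipRef are
// hypothetical names, shown only to document the bit manipulation):
__host__ __device__ inline uint floatFlipRef(uint f)
{
    uint mask = (f & 0x80000000) ? 0xffffffff : 0x80000000;
    return f ^ mask;  // negative: flip all bits; positive: flip sign bit
}

__host__ __device__ inline uint floatUnflipRef(uint f)
{
    uint mask = (f & 0x80000000) ? 0x80000000 : 0xffffffff;
    return f ^ mask;  // inverse mapping: restore the original float bits
}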

/**
 * @brief Wrapper to call main radix sort function. For float configuration.
 *
 * Calls the main radix sort function. For float configuration.
 *
 * @param[in,out] keys Keys to be sorted.
 * @param[in,out] values Associated values to be sorted (through keys).
 * @param[in] plan Configuration information for RadixSort.
 * @param[in] numElements Number of elements in the sort.
 * @param[in] negativeKeys Set to true if the keys may contain negative numbers.
 * @param[in] keyBits Number of interesting bits in the key
 **/
extern "C"
void radixSortFloatKeys(float *keys,
                        uint *values,
                        const CUDPPRadixSortPlan *plan,
                        size_t numElements,
                        bool negativeKeys,
                        int keyBits)
{
    radixSort((uint*)keys, (uint*)values, plan,
              numElements, negativeKeys, keyBits);
}

/** @brief Perform one step of the radix sort. Sorts by nbits key bits per
 * step, starting at startbit.
 *
 * @param[in,out] keys Keys to be sorted.
 * @param[in] plan Configuration information for RadixSort.
 * @param[in] numElements Number of elements in the sort.
 **/
template<uint nbits, uint startbit, bool flip, bool unflip>
void radixSortStepKeysOnly(uint *keys,
                           const CUDPPRadixSortPlan *plan,
                           uint numElements)
{
    const uint eltsPerBlock = SORT_CTA_SIZE * 4;
    const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;

    bool fullBlocks = ((numElements % eltsPerBlock) == 0);
    uint numBlocks = (fullBlocks) ?
        (numElements / eltsPerBlock) :
        (numElements / eltsPerBlock + 1);
    uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
        (numElements / eltsPerBlock2) :
        (numElements / eltsPerBlock2 + 1);

    bool loop = numBlocks > 65535;

    uint blocks = loop ? 65535 : numBlocks;
    uint blocksFind = loop ? 65535 : numBlocks2;
    uint blocksReorder = loop ? 65535 : numBlocks2;

    uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1];

    bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);

    if (persist)
    {
        loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);

        blocks = numBlocks;
        blocksFind = numBlocks2;
        blocksReorder = numBlocks2;
    }

    if (fullBlocks)
    {
        if (loop)
        {
            if (persist)
            {
                blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) :
                                numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>);
            }

            radixSortBlocksKeysOnly<nbits, startbit, true, flip, true>
                <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
        }
        else
        {
            radixSortBlocksKeysOnly<nbits, startbit, true, flip, false>
                <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
        }
    }
    else
    {
        if (loop)
        {
            if (persist)
            {
                blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) :
                                numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>);
            }

            radixSortBlocksKeysOnly<nbits, startbit, false, flip, true>
                <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
        }
        else
        {
            radixSortBlocksKeysOnly<nbits, startbit, false, flip, false>
                <<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
        }
    }

    if (fullBlocks)
    {
        if (loop)
        {
            if (persist)
            {
                blocksFind = numCTAs(findRadixOffsets<0, true, true>);
            }
            findRadixOffsets<startbit, true, true>
                <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
        }
        else
        {
            findRadixOffsets<startbit, true, false>
                <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
        }
    }
    else
    {
        if (loop)
        {
            if (persist)
            {
                blocksFind = numCTAs(findRadixOffsets<0, false, true>);
            }
            findRadixOffsets<startbit, false, true>
                <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
        }
        else
        {
            findRadixOffsets<startbit, false, false>
                <<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
                ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
        }
    }

    cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);

    if (fullBlocks)
    {
        if (plan->m_bManualCoalesce)
        {
            if (loop)
            {
                if (persist)
                {
                    blocksReorder = unflip ?
                        numCTAs(reorderDataKeysOnly<0, true, true, true, true>) :
                        numCTAs(reorderDataKeysOnly<0, true, true, false, true>);
                }
                reorderDataKeysOnly<startbit, true, true, unflip, true>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
                     numElements, numBlocks2);
            }
            else
            {
                reorderDataKeysOnly<startbit, true, true, unflip, false>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
                     numElements, numBlocks2);
            }
        }
        else
        {
            if (loop)
            {
                if (persist)
                {
                    blocksReorder = unflip ?
                        numCTAs(reorderDataKeysOnly<0, true, false, true, true>) :
                        numCTAs(reorderDataKeysOnly<0, true, false, false, true>);
                }
                reorderDataKeysOnly<startbit, true, false, unflip, true>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
                     numElements, numBlocks2);
            }
            else
            {
                reorderDataKeysOnly<startbit, true, false, unflip, false>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
                     numElements, numBlocks2);
            }
        }
    }
    else
    {
        if (plan->m_bManualCoalesce)
        {
            if (loop)
            {
                if (persist)
                {
                    blocksReorder = unflip ?
                        numCTAs(reorderDataKeysOnly<0, false, true, true, true>) :
                        numCTAs(reorderDataKeysOnly<0, false, true, false, true>);
                }
                reorderDataKeysOnly<startbit, false, true, unflip, true>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
                     numElements, numBlocks2);
            }
            else
            {
                reorderDataKeysOnly<startbit, false, true, unflip, false>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
                     numElements, numBlocks2);
            }
        }
        else
        {
            if (loop)
            {
                if (persist)
                {
                    blocksReorder = unflip ?
                        numCTAs(reorderDataKeysOnly<0, false, false, true, true>) :
                        numCTAs(reorderDataKeysOnly<0, false, false, false, true>);
                }
                reorderDataKeysOnly<startbit, false, false, unflip, true>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
                     numElements, numBlocks2);
            }
            else
            {
                reorderDataKeysOnly<startbit, false, false, unflip, false>
                    <<<blocksReorder, SORT_CTA_SIZE>>>
                    (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
                     numElements, numBlocks2);
            }
        }
    }

    CUT_CHECK_ERROR("radixSortStepKeysOnly");
}

/**
 * @brief Optimization for sorts of fewer than 4 * CTA_SIZE elements (keys only).
 *
 * @param[in,out] keys Keys to be sorted.
 * @param numElements Number of elements in the sort.
 **/
template <bool flip>
void radixSortSingleBlockKeysOnly(uint *keys,
                                  uint numElements)
{
    bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
    if (fullBlocks)
    {
        radixSortBlocksKeysOnly<32, 0, true, flip, false>
            <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
            ((uint4*)keys, (uint4*)keys, numElements, 1);
    }
    else
    {
        radixSortBlocksKeysOnly<32, 0, false, flip, false>
            <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
            ((uint4*)keys, (uint4*)keys, numElements, 1);
    }

    if (flip)
        unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);

    CUT_CHECK_ERROR("radixSortSingleBlock");
}

/**
 * @brief Main radix sort function. For keys only configuration.
 *
 * Main radix sort function. Sorts in place in the keys array,
 * but uses the other device arrays as temporary storage. All pointer
 * parameters are device pointers. Uses scan for the prefix sum of
 * radix counters.
 *
 * @param[in,out] keys Keys to be sorted.
 * @param[in] plan Configuration information for RadixSort.
 * @param[in] flipBits Set to true if the key datatype is float, so that
 *            negative keys sort correctly via bit flipping.
 * @param[in] numElements Number of elements in the sort.
 * @param[in] keyBits Number of interesting bits in the key
 **/
extern "C"
void radixSortKeysOnly(uint *keys,
                       const CUDPPRadixSortPlan *plan,
                       bool flipBits,
                       size_t numElements,
                       int keyBits)
{
    if (numElements <= WARP_SIZE)
    {
        if (flipBits)
            radixSortSingleWarpKeysOnly<true><<<1, numElements>>>(keys, numElements);
        else
            radixSortSingleWarpKeysOnly<false><<<1, numElements>>>(keys, numElements);
        return;
    }
    if (numElements <= SORT_CTA_SIZE * 4)
    {
        if (flipBits)
            radixSortSingleBlockKeysOnly<true>(keys, numElements);
        else
            radixSortSingleBlockKeysOnly<false>(keys, numElements);
        return;
    }

    // flip float bits on the first pass, unflip on the last pass
    if (flipBits)
    {
        radixSortStepKeysOnly<4, 0, true, false>(keys, plan, numElements);
    }
    else
    {
        radixSortStepKeysOnly<4, 0, false, false>(keys, plan, numElements);
    }

    if (keyBits > 4)
    {
        radixSortStepKeysOnly<4, 4, false, false>(keys, plan, numElements);
    }
    if (keyBits > 8)
    {
        radixSortStepKeysOnly<4, 8, false, false>(keys, plan, numElements);
    }
    if (keyBits > 12)
    {
        radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements);
    }
    if (keyBits > 16)
    {
        radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements);
    }
    if (keyBits > 20)
    {
        radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements);
    }
    if (keyBits > 24)
    {
        radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements);
    }
    if (keyBits > 28)
    {
        if (flipBits) // last pass
        {
            radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements);
        }
        else
        {
            radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements);
        }
    }
}

/**
 * @brief Wrapper to call main radix sort function. For floats and keys only.
 *
 * Calls the radixSortKeysOnly function setting parameters for floats.
 *
 * @param[in,out] keys Keys to be sorted.
 * @param[in] plan Configuration information for RadixSort.
 * @param[in] negativeKeys Set to true to make flipBits true in
 *            radixSortKeysOnly().
 * @param[in] numElements Number of elements in the sort.
 * @param[in] keyBits Number of interesting bits in the key
 **/
extern "C"
void radixSortFloatKeysOnly(float *keys,
                            const CUDPPRadixSortPlan *plan,
                            bool negativeKeys,
                            size_t numElements,
                            int keyBits)
{
    radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits);
}

extern "C"
void initDeviceParameters(CUDPPRadixSortPlan *plan)
{
    int deviceID = -1;
    if (cudaSuccess == cudaGetDevice(&deviceID))
    {
        cudaDeviceProp devprop;
        cudaGetDeviceProperties(&devprop, deviceID);

        int smVersion = devprop.major * 10 + devprop.minor;

        // sm_12 and later devices don't need help with coalescing in the
        // reorderData kernel
        plan->m_bManualCoalesce = (smVersion < 12);

        // sm_20 and later devices are better off not using persistent CTAs
        plan->m_bUsePersistentCTAs = (smVersion < 20);

        if (plan->m_bUsePersistentCTAs)
        {
            // The following is only true on pre-sm_20 devices (pre-Fermi):
            // Empirically we have found that for some (usually larger) sort
            // sizes it is better to use exactly as many "persistent" CTAs
            // as can fill the GPU, which loop over the "blocks" of work. For
            // smaller arrays it is better to use the typical CUDA approach of
            // launching one CTA per block of work.
            // The 0-element of these two-element arrays is for key-value
            // sorts; the 1-element is for key-only sorts.
            plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 16777216 : 524288;
            plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152 : 524288;
            plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608;
            plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152 : 0;

            // Create a map of function pointers to register counts for more
            // accurate occupancy calculation. We must pass in the dynamic
            // shared memory used by each kernel, since the runtime doesn't
            // know it. Note we only insert the "loop" version of the kernels
            // (the ones with the last template param = true), because those
            // are the only ones that require persistent CTAs that maximally
            // fill the device.
            computeNumCTAs(radixSortBlocks<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
            computeNumCTAs(radixSortBlocks<4, 0, false, true, true>,  4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
            computeNumCTAs(radixSortBlocks<4, 0, true, false, true>,  4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
            computeNumCTAs(radixSortBlocks<4, 0, true, true, true>,   4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);

            computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
            computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>,  4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
            computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>,  4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
            computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>,   4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);

            computeNumCTAs(findRadixOffsets<0, false, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
            computeNumCTAs(findRadixOffsets<0, true, true>,  3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);

            computeNumCTAs(reorderData<0, false, false, false, true>, 0, SORT_CTA_SIZE);
            computeNumCTAs(reorderData<0, false, false, true, true>,  0, SORT_CTA_SIZE);
            computeNumCTAs(reorderData<0, false, true, false, true>,  0, SORT_CTA_SIZE);
            computeNumCTAs(reorderData<0, false, true, true, true>,   0, SORT_CTA_SIZE);
            computeNumCTAs(reorderData<0, true, false, false, true>,  0, SORT_CTA_SIZE);
            computeNumCTAs(reorderData<0, true, false, true, true>,   0, SORT_CTA_SIZE);
            computeNumCTAs(reorderData<0, true, true, false, true>,   0, SORT_CTA_SIZE);
            computeNumCTAs(reorderData<0, true, true, true, true>,    0, SORT_CTA_SIZE);

            computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0, SORT_CTA_SIZE);
            computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>,  0, SORT_CTA_SIZE);
            computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>,  0, SORT_CTA_SIZE);
            computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>,   0, SORT_CTA_SIZE);
            computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>,  0, SORT_CTA_SIZE);
            computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>,   0, SORT_CTA_SIZE);
            computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>,   0, SORT_CTA_SIZE);
            computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>,    0, SORT_CTA_SIZE);

            computeNumCTAs(emptyKernel, 0, SORT_CTA_SIZE);
        }
    }
}
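
// A sketch of what a computeNumCTAs-style helper can do on modern CUDA (an
// illustrative assumption only -- CUDPP's actual helper predates this API and
// tracks per-kernel register counts itself; numPersistentCTAs is a
// hypothetical name). cudaOccupancyMaxActiveBlocksPerMultiprocessor reports
// how many CTAs of a kernel fit per SM given its block size and dynamic
// shared-memory use; multiplying by the SM count gives a grid that exactly
// fills the device.
template <class KernelT>
int numPersistentCTAs(KernelT kernel, size_t dynamicSmemBytes, int ctaSize)
{
    int device, blocksPerSM;
    cudaDeviceProp prop;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop, device);
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kernel,
                                                  ctaSize, dynamicSmemBytes);
    return blocksPerSM * prop.multiProcessorCount;  // CTAs that fill the GPU
}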

/**
 * @brief From the programmer-specified sort configuration,
 *        creates internal memory for performing the sort.
 *
 * @param[in] plan Pointer to CUDPPRadixSortPlan object
 **/
extern "C"
void allocRadixSortStorage(CUDPPRadixSortPlan *plan)
{
    unsigned int numElements = plan->m_numElements;

    unsigned int numBlocks =
        ((numElements % (SORT_CTA_SIZE * 4)) == 0) ?
        (numElements / (SORT_CTA_SIZE * 4)) :
        (numElements / (SORT_CTA_SIZE * 4) + 1);

    switch(plan->m_config.datatype)
    {
    case CUDPP_UINT:
        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
                                  numElements * sizeof(unsigned int)));

        if (!plan->m_bKeysOnly)
            CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
                                      numElements * sizeof(unsigned int)));

        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
                                  WARP_SIZE * numBlocks * sizeof(unsigned int)));

        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
                                  WARP_SIZE * numBlocks * sizeof(unsigned int)));

        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
                                  WARP_SIZE * numBlocks * sizeof(unsigned int)));
        break;

    case CUDPP_FLOAT:
        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
                                  numElements * sizeof(float)));

        if (!plan->m_bKeysOnly)
            CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
                                      numElements * sizeof(float)));

        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
                                  WARP_SIZE * numBlocks * sizeof(float)));

        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
                                  WARP_SIZE * numBlocks * sizeof(float)));

        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
                                  WARP_SIZE * numBlocks * sizeof(float)));
        break;
    }

    initDeviceParameters(plan);
}
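
// Back-of-the-envelope scratch cost of the allocation above (an illustrative
// helper with a hypothetical name, not CUDPP code): temp keys (plus values
// for key-value sorts) and three WARP_SIZE * numBlocks counter arrays of
// 4-byte elements.
static size_t radixSortScratchBytes(size_t numElements, bool keysOnly)
{
    size_t eltsPerBlock = SORT_CTA_SIZE * 4;
    size_t numBlocks = (numElements + eltsPerBlock - 1) / eltsPerBlock;
    size_t bytes = numElements * sizeof(uint);          // m_tempKeys
    if (!keysOnly)
        bytes += numElements * sizeof(uint);            // m_tempValues
    bytes += 3 * WARP_SIZE * numBlocks * sizeof(uint);  // counters/sums/offsets
    return bytes;
}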

/** @brief Deallocates intermediate memory from allocRadixSortStorage.
 *
 * @param[in] plan Pointer to CUDPPRadixSortPlan object
 **/
extern "C"
void freeRadixSortStorage(CUDPPRadixSortPlan *plan)
{
    CUDA_SAFE_CALL(cudaFree(plan->m_tempKeys));
    CUDA_SAFE_CALL(cudaFree(plan->m_tempValues));
    CUDA_SAFE_CALL(cudaFree(plan->m_counters));
    CUDA_SAFE_CALL(cudaFree(plan->m_countersSum));
    CUDA_SAFE_CALL(cudaFree(plan->m_blockOffsets));
}

/** @brief Dispatch function to perform a sort on an array with
 * a specified configuration.
 *
 * This is the dispatch routine which calls radixSort...() with
 * appropriate template parameters and arguments as specified by
 * the plan.
 * @param[in,out] keys Keys to be sorted.
 * @param[in,out] values Associated values to be sorted (through keys).
 * @param[in] numElements Number of elements in the sort.
 * @param[in] keyBits Number of interesting bits in the key
 * @param[in] plan Configuration information for RadixSort.
 **/
extern "C"
void cudppRadixSortDispatch(void *keys,
                            void *values,
                            size_t numElements,
                            int keyBits,
                            const CUDPPRadixSortPlan *plan)
{
    if (plan->m_bKeysOnly)
    {
        switch(plan->m_config.datatype)
        {
        case CUDPP_UINT:
            radixSortKeysOnly((uint*)keys, plan, false,
                              numElements, keyBits);
            break;
        case CUDPP_FLOAT:
            radixSortFloatKeysOnly((float*)keys, plan, true,
                                   numElements, keyBits);
            break;
        }
    }
    else
    {
        switch(plan->m_config.datatype)
        {
        case CUDPP_UINT:
            radixSort((uint*)keys, (uint*)values, plan,
                      numElements, false, keyBits);
            break;
        case CUDPP_FLOAT:
            radixSortFloatKeys((float*)keys, (uint*)values, plan,
                               numElements, true, keyBits);
            break;
        }
    }
}
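
// Hypothetical call site (a sketch, not from this commit): with a plan whose
// configuration selected radix sort over CUDPP_FLOAT key-value pairs, the
// dispatch boils down to one call. d_keys/d_values are assumed device
// pointers already filled by the caller; exampleSortDispatch is a
// hypothetical name.
void exampleSortDispatch(float *d_keys, uint *d_values, size_t n,
                         const CUDPPRadixSortPlan *plan)
{
    // sort all 32 key bits; the plan supplies scratch space and tuning knobs
    cudppRadixSortDispatch((void*)d_keys, (void*)d_values, n, 32, plan);
}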

/** @} */ // end radixsort functions
/** @} */ // end cudpp_app

@@ -0,0 +1,771 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * scan_app.cu
 *
 * @brief CUDPP application-level scan routines
 */

/** \defgroup cudpp_app CUDPP Application-Level API
 * The CUDPP Application-Level API contains functions
 * that run on the host CPU and invoke GPU routines in
 * the CUDPP \link cudpp_kernel Kernel-Level API\endlink.
 * Application-Level API functions are used by
 * CUDPP \link publicInterface Public Interface\endlink
 * functions to implement CUDPP's core functionality.
 * @{
 */

/** @name Scan Functions
 * @{
 */

#include "cudpp.h"
#include "cudpp_util.h"
#include "cudpp_plan.h"
#include "kernel/scan_kernel.cu"
#include "kernel/vector_kernel.cu"


#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <assert.h>

/** @brief Perform recursive scan on arbitrary size arrays
 *
 * This is the CPU-side workhorse function of the scan engine. This function
 * invokes the CUDA kernels which perform the scan on individual blocks.
 *
 * Scans of large arrays must be split (possibly recursively) into a hierarchy
 * of block scans, where each block is scanned by a single CUDA thread block.
 * At each recursive level of the scan, scanArrayRecursive first invokes a
 * kernel to scan all blocks of that level, and if the level has more than one
 * block, it calls itself recursively. On returning from each recursive level,
 * the total sum of each block from the level below is added to all elements
 * of the corresponding block in this level. See "Parallel Prefix Sum (Scan)
 * in CUDA" for more information (see \ref references ).
 *
 * Template parameter \a T is the datatype; \a isBackward specifies backward
 * or forward scan; \a isExclusive specifies exclusive or inclusive scan, and
 * \a op specifies the binary associative operator to be used.
 *
 * @param[out] d_out The output array for the scan results
 * @param[in] d_in The input array to be scanned
 * @param[out] d_blockSums Array of arrays of per-block sums (one array per
 *             recursive level, allocated by allocScanStorage())
 * @param[in] numElements The number of elements in the array to scan
 * @param[in] numRows The number of rows in the array to scan
 * @param[in] rowPitches Array of row pitches (one array per recursive level,
 *            allocated by allocScanStorage())
 * @param[in] level The current recursive level of the scan
 */
template <class T, bool isBackward, bool isExclusive, CUDPPOperator op>
void scanArrayRecursive(T *d_out,
                        const T *d_in,
                        T **d_blockSums,
                        size_t numElements,
                        size_t numRows,
                        const size_t *rowPitches,
                        int level)
{
    unsigned int numBlocks =
        max(1, (unsigned int)ceil((double)numElements / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));

    unsigned int sharedEltsPerBlock = CTA_SIZE * 2;

    unsigned int sharedMemSize = sizeof(T) * sharedEltsPerBlock;

    // divide pitch by four since scan's load/store addresses are for vec4 elements
    unsigned int rowPitch = 1;
    unsigned int blockSumRowPitch = 1;

    if (numRows > 1)
    {
        rowPitch = rowPitches[level] / 4;
        blockSumRowPitch = (numBlocks > 1) ? rowPitches[level+1] / 4 : 0;
    }

    bool fullBlock = (numElements == numBlocks * SCAN_ELTS_PER_THREAD * CTA_SIZE);

    // setup execution parameters
    dim3 grid(numBlocks, numRows, 1);
    dim3 threads(CTA_SIZE, 1, 1);

    // make sure there are no CUDA errors before we start
    CUT_CHECK_ERROR("scanArray before kernels");

    unsigned int traitsCode = 0;
    if (numBlocks > 1) traitsCode |= 1;
    if (numRows > 1)   traitsCode |= 2;
    if (fullBlock)     traitsCode |= 4;

    switch (traitsCode)
    {
    case 0: // single block, single row, non-full block
        scan4<T, ScanTraits<T, op, isBackward, isExclusive, false, false, false> >
            <<< grid, threads, sharedMemSize >>>
            (d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
        break;
    case 1: // multiblock, single row, non-full block
        scan4< T, ScanTraits<T, op, isBackward, isExclusive, false, true, false> >
            <<< grid, threads, sharedMemSize >>>
            (d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
        break;
    case 2: // single block, multirow, non-full block
        scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, false, false> >
            <<< grid, threads, sharedMemSize >>>
            (d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
        break;
    case 3: // multiblock, multirow, non-full block
        scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, true, false> >
            <<< grid, threads, sharedMemSize >>>
            (d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
        break;
    case 4: // single block, single row, full block
        scan4<T, ScanTraits<T, op, isBackward, isExclusive, false, false, true> >
            <<< grid, threads, sharedMemSize >>>
            (d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
        break;
    case 5: // multiblock, single row, full block
        scan4< T, ScanTraits<T, op, isBackward, isExclusive, false, true, true> >
            <<< grid, threads, sharedMemSize >>>
            (d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
        break;
    case 6: // single block, multirow, full block
        scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, false, true> >
            <<< grid, threads, sharedMemSize >>>
            (d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
        break;
    case 7: // multiblock, multirow, full block
        scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, true, true> >
            <<< grid, threads, sharedMemSize >>>
            (d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
        break;
    }

    CUT_CHECK_ERROR("prescan");

    if (numBlocks > 1)
    {
        // After scanning all the sub-blocks, we are mostly done. But
        // now we need to take all of the last values of the
        // sub-blocks and scan those. This will give us a new value
        // that must be added to each block to get the final results.

        scanArrayRecursive<T, isBackward, true, op>
            ((T*)d_blockSums[level], (const T*)d_blockSums[level],
             (T**)d_blockSums, numBlocks, numRows, rowPitches, level + 1); // recursive (CPU) call

        vectorAddUniform4<T, op, SCAN_ELTS_PER_THREAD>
            <<< grid, threads >>>(d_out,
                                  (T*)d_blockSums[level],
                                  numElements,
                                  rowPitch*4,
                                  blockSumRowPitch*4,
                                  0, 0);
        CUT_CHECK_ERROR("vectorAddUniform");
    }
}
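
// CPU reference of the same recursive structure (an illustrative sketch for
// the forward exclusive CUDPP_ADD case; scanRecursiveRef and blockSize are
// hypothetical): scan each block, collect per-block totals, scan the totals,
// then add each scanned total back into its block -- the same
// kernel / recursion / uniform-add sequence as above.
#include <vector>
#include <algorithm>
static void scanRecursiveRef(std::vector<float> &a, size_t blockSize)
{
    size_t numBlocks = (a.size() + blockSize - 1) / blockSize;
    std::vector<float> blockSums(numBlocks, 0.0f);
    for (size_t b = 0; b < numBlocks; ++b)       // exclusive scan per block
    {
        float sum = 0.0f;
        size_t end = std::min(a.size(), (b + 1) * blockSize);
        for (size_t i = b * blockSize; i < end; ++i)
        {
            float v = a[i];
            a[i] = sum;
            sum += v;
        }
        blockSums[b] = sum;                      // total of this block
    }
    if (numBlocks > 1)
    {
        scanRecursiveRef(blockSums, blockSize);  // recurse on block totals
        for (size_t b = 0; b < numBlocks; ++b)   // uniform add per block
        {
            size_t end = std::min(a.size(), (b + 1) * blockSize);
            for (size_t i = b * blockSize; i < end; ++i)
                a[i] += blockSums[b];
        }
    }
}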

// global

#ifdef __cplusplus
extern "C"
{
#endif

/** @brief Allocate intermediate arrays used by scan.
 *
 * Scans of large arrays must be split (possibly recursively) into a hierarchy
 * of block scans, where each block is scanned by a single CUDA thread block.
 * At each recursive level of the scan, we need an array in which to store the
 * total sums of all blocks in that level. This function computes the amount
 * of storage needed and allocates it.
 *
 * @param plan Pointer to CUDPPScanPlan object containing options and number
 *             of elements, which is used to compute storage requirements, and
 *             within which intermediate storage is allocated.
 */
void allocScanStorage(CUDPPScanPlan *plan)
{
    //assert(config->_numEltsAllocated == 0); // shouldn't be called

    plan->m_numEltsAllocated = plan->m_numElements;

    size_t numElts = plan->m_numElements;

    size_t level = 0;

    do
    {
        size_t numBlocks =
            max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
        if (numBlocks > 1)
        {
            level++;
        }
        numElts = numBlocks;
    } while (numElts > 1);

    size_t elementSize = 0;

    switch(plan->m_config.datatype)
    {
    case CUDPP_INT:
        plan->m_blockSums = (void**) malloc(level * sizeof(int*));
        elementSize = sizeof(int);
        break;
    case CUDPP_UINT:
        plan->m_blockSums = (void**) malloc(level * sizeof(unsigned int*));
        elementSize = sizeof(unsigned int);
        break;
    case CUDPP_FLOAT:
        plan->m_blockSums = (void**) malloc(level * sizeof(float*));
        elementSize = sizeof(float);
        break;
    default:
        break;
    }

    plan->m_numLevelsAllocated = level;
    numElts = plan->m_numElements;
    size_t numRows = plan->m_numRows;
    plan->m_numRowsAllocated = numRows;
    plan->m_rowPitches = 0;

    if (numRows > 1)
    {
        plan->m_rowPitches = (size_t*) malloc((level + 1) * sizeof(size_t));
        plan->m_rowPitches[0] = plan->m_rowPitch;
    }

    level = 0;

    do
    {
        size_t numBlocks =
            max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
        if (numBlocks > 1)
        {
            // Use cudaMallocPitch for multi-row block sums to ensure alignment
            if (numRows > 1)
            {
                size_t dpitch;
                CUDA_SAFE_CALL( cudaMallocPitch((void**) &(plan->m_blockSums[level]),
                                                &dpitch,
                                                numBlocks * elementSize,
                                                numRows));
                plan->m_rowPitches[level+1] = dpitch / elementSize;
                level++;
            }
            else
            {
                CUDA_SAFE_CALL(cudaMalloc((void**) &(plan->m_blockSums[level++]),
                                          numBlocks * elementSize));
            }
        }
        numElts = numBlocks;
    } while (numElts > 1);

    CUT_CHECK_ERROR("allocScanStorage");
}
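
// The level count above follows a simple recurrence (an illustrative helper
// with a hypothetical name): each level reduces the problem to one block-sum
// per block, so the number of levels is the depth of repeated ceiling
// division by the per-block element count (SCAN_ELTS_PER_THREAD * CTA_SIZE).
static size_t numScanLevels(size_t numElts, size_t eltsPerBlock)
{
    size_t level = 0;
    while (numElts > eltsPerBlock)  // more than one block => one more level
    {
        numElts = (numElts + eltsPerBlock - 1) / eltsPerBlock;
        ++level;
    }
    return level;
}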

/** @brief Deallocate intermediate block sums arrays in a CUDPPScanPlan object.
 *
 * These arrays must have been allocated by allocScanStorage(), which is called
 * by the constructor of cudppScanPlan().
 *
 * @param plan Pointer to CUDPPScanPlan object initialized by allocScanStorage().
 */
void freeScanStorage(CUDPPScanPlan *plan)
{
    for (unsigned int i = 0; i < plan->m_numLevelsAllocated; i++)
    {
        cudaFree(plan->m_blockSums[i]);
    }

    CUT_CHECK_ERROR("freeScanStorage");

    free((void**)plan->m_blockSums);
    if (plan->m_numRows > 1)
        free((void*)plan->m_rowPitches);

    plan->m_blockSums = 0;
    plan->m_numEltsAllocated = 0;
    plan->m_numLevelsAllocated = 0;
}


/** @brief Dispatch function to perform a scan (prefix sum) on an
 * array with the specified configuration.
 *
 * This is the dispatch routine which calls scanArrayRecursive() with
 * appropriate template parameters and arguments to achieve the scan as
 * specified in \a plan.
 *
 * @param[out] d_out The output array of scan results
 * @param[in] d_in The input array
 * @param[in] numElements The number of elements to scan
 * @param[in] numRows The number of rows to scan in parallel
 * @param[in] plan Pointer to CUDPPScanPlan object containing scan options
 *            and intermediate storage
 */
void cudppScanDispatch(void *d_out,
                       const void *d_in,
                       size_t numElements,
                       size_t numRows,
                       const CUDPPScanPlan *plan)
{
    if (CUDPP_OPTION_EXCLUSIVE & plan->m_config.options)
    {
        if (CUDPP_OPTION_BACKWARD & plan->m_config.options)
        {
            switch (plan->m_config.datatype)
            {
            case CUDPP_INT:
                switch(plan->m_config.op)
                {
                case CUDPP_ADD:
                    scanArrayRecursive<int, true, true, CUDPP_ADD>
                        ((int*)d_out, (const int*)d_in,
                         (int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MULTIPLY:
                    scanArrayRecursive<int, true, true, CUDPP_MULTIPLY>
                        ((int*)d_out, (const int*)d_in,
                         (int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MAX:
                    scanArrayRecursive<int, true, true, CUDPP_MAX>
                        ((int*)d_out, (const int*)d_in,
                         (int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MIN:
                    scanArrayRecursive<int, true, true, CUDPP_MIN>
                        ((int*)d_out, (const int*)d_in,
                         (int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                default:
                    break;
                }

                break;

            case CUDPP_UINT:
                switch(plan->m_config.op)
                {
                case CUDPP_ADD:
                    scanArrayRecursive<unsigned int, true, true, CUDPP_ADD>
                        ((unsigned int*)d_out, (const unsigned int*)d_in,
                         (unsigned int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MULTIPLY:
                    scanArrayRecursive<unsigned int, true, true, CUDPP_MULTIPLY>
                        ((unsigned int*)d_out, (const unsigned int*)d_in,
                         (unsigned int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MAX:
                    scanArrayRecursive<unsigned int, true, true, CUDPP_MAX>
                        ((unsigned int*)d_out, (const unsigned int*)d_in,
                         (unsigned int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MIN:
                    scanArrayRecursive<unsigned int, true, true, CUDPP_MIN>
                        ((unsigned int*)d_out, (const unsigned int*)d_in,
                         (unsigned int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                default:
                    break;
                }

                break;

            case CUDPP_FLOAT:
                switch(plan->m_config.op)
                {
                case CUDPP_ADD:
                    scanArrayRecursive<float, true, true, CUDPP_ADD>
                        ((float*)d_out, (const float*)d_in,
                         (float**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MULTIPLY:
                    scanArrayRecursive<float, true, true, CUDPP_MULTIPLY>
                        ((float*)d_out, (const float*)d_in,
                         (float**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MAX:
                    scanArrayRecursive<float, true, true, CUDPP_MAX>
                        ((float*)d_out, (const float*)d_in,
                         (float**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MIN:
                    scanArrayRecursive<float, true, true, CUDPP_MIN>
                        ((float*)d_out, (const float*)d_in,
                         (float**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                default:
                    break;
                }
                break;

            default:
                break;
            }
        }
        else
        {
            switch (plan->m_config.datatype)
            {
            case CUDPP_INT:
                switch(plan->m_config.op)
                {
                case CUDPP_ADD:
                    scanArrayRecursive<int, false, true, CUDPP_ADD>
                        ((int*)d_out, (const int*)d_in,
                         (int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MULTIPLY:
                    scanArrayRecursive<int, false, true, CUDPP_MULTIPLY>
                        ((int*)d_out, (const int*)d_in,
                         (int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MAX:
                    scanArrayRecursive<int, false, true, CUDPP_MAX>
                        ((int*)d_out, (const int*)d_in,
                         (int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                case CUDPP_MIN:
                    scanArrayRecursive<int, false, true, CUDPP_MIN>
                        ((int*)d_out, (const int*)d_in,
                         (int**)plan->m_blockSums,
                         numElements, numRows, plan->m_rowPitches, 0);
                    break;
                default:
                    break;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case CUDPP_UINT:
|
||||
switch(plan->m_config.op)
|
||||
{
|
||||
case CUDPP_ADD:
|
||||
scanArrayRecursive<unsigned int, false, true, CUDPP_ADD>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MULTIPLY:
|
||||
scanArrayRecursive<unsigned int, false, true, CUDPP_MULTIPLY>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MAX:
|
||||
scanArrayRecursive<unsigned int, false, true, CUDPP_MAX>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MIN:
|
||||
scanArrayRecursive<unsigned int, false, true, CUDPP_MIN>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case CUDPP_FLOAT:
|
||||
switch(plan->m_config.op)
|
||||
{
|
||||
case CUDPP_ADD:
|
||||
scanArrayRecursive<float, false, true, CUDPP_ADD>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MULTIPLY:
|
||||
scanArrayRecursive<float, false, true, CUDPP_MULTIPLY>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MAX:
|
||||
scanArrayRecursive<float, false, true, CUDPP_MAX>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MIN:
|
||||
scanArrayRecursive<float, false, true, CUDPP_MIN>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (CUDPP_OPTION_BACKWARD & plan->m_config.options)
|
||||
{
|
||||
switch (plan->m_config.datatype)
|
||||
{
|
||||
case CUDPP_INT:
|
||||
|
||||
switch(plan->m_config.op)
|
||||
{
|
||||
case CUDPP_ADD:
|
||||
scanArrayRecursive<int, true, false, CUDPP_ADD>
|
||||
((int*)d_out, (const int*)d_in,
|
||||
(int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MULTIPLY:
|
||||
scanArrayRecursive<int, true, false, CUDPP_MULTIPLY>
|
||||
((int*)d_out, (const int*)d_in,
|
||||
(int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MAX:
|
||||
scanArrayRecursive<int, true, false, CUDPP_MAX>
|
||||
((int*)d_out, (const int*)d_in,
|
||||
(int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MIN:
|
||||
scanArrayRecursive<int, true, false, CUDPP_MIN>
|
||||
((int*)d_out, (const int*)d_in,
|
||||
(int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case CUDPP_UINT:
|
||||
switch(plan->m_config.op)
|
||||
{
|
||||
case CUDPP_ADD:
|
||||
scanArrayRecursive<unsigned int, true, false, CUDPP_ADD>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MULTIPLY:
|
||||
scanArrayRecursive<unsigned int, true, false, CUDPP_MULTIPLY>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MAX:
|
||||
scanArrayRecursive<unsigned int, true, false, CUDPP_MAX>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MIN:
|
||||
scanArrayRecursive<unsigned int, true, false, CUDPP_MIN>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case CUDPP_FLOAT:
|
||||
switch(plan->m_config.op)
|
||||
{
|
||||
case CUDPP_ADD:
|
||||
scanArrayRecursive<float, true, false, CUDPP_ADD>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MULTIPLY:
|
||||
scanArrayRecursive<float, true, false, CUDPP_MULTIPLY>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MAX:
|
||||
scanArrayRecursive<float, true, false, CUDPP_MAX>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MIN:
|
||||
scanArrayRecursive<float, true, false, CUDPP_MIN>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
switch (plan->m_config.datatype)
|
||||
{
|
||||
case CUDPP_INT:
|
||||
|
||||
switch(plan->m_config.op)
|
||||
{
|
||||
case CUDPP_ADD:
|
||||
scanArrayRecursive<int, false, false, CUDPP_ADD>
|
||||
((int*)d_out, (const int*)d_in,
|
||||
(int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MULTIPLY:
|
||||
scanArrayRecursive<int, false, false, CUDPP_MULTIPLY>
|
||||
((int*)d_out, (const int*)d_in,
|
||||
(int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MAX:
|
||||
scanArrayRecursive<int, false, false, CUDPP_MAX>
|
||||
((int*)d_out, (const int*)d_in,
|
||||
(int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MIN:
|
||||
scanArrayRecursive<int, false, false, CUDPP_MIN>
|
||||
((int*)d_out, (const int*)d_in,
|
||||
(int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case CUDPP_UINT:
|
||||
switch(plan->m_config.op)
|
||||
{
|
||||
case CUDPP_ADD:
|
||||
scanArrayRecursive<unsigned int, false, false, CUDPP_ADD>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MULTIPLY:
|
||||
scanArrayRecursive<unsigned int, false, false, CUDPP_MULTIPLY>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MAX:
|
||||
scanArrayRecursive<unsigned int, false, false, CUDPP_MAX>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MIN:
|
||||
scanArrayRecursive<unsigned int, false, false, CUDPP_MIN>
|
||||
((unsigned int*)d_out, (const unsigned int*)d_in,
|
||||
(unsigned int**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case CUDPP_FLOAT:
|
||||
switch(plan->m_config.op)
|
||||
{
|
||||
case CUDPP_ADD:
|
||||
scanArrayRecursive<float, false, false, CUDPP_ADD>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MULTIPLY:
|
||||
scanArrayRecursive<float, false, false, CUDPP_MULTIPLY>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MAX:
|
||||
scanArrayRecursive<float, false, false, CUDPP_MAX>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
case CUDPP_MIN:
|
||||
scanArrayRecursive<float, false, false, CUDPP_MIN>
|
||||
((float*)d_out, (const float*)d_in,
|
||||
(float**)plan->m_blockSums,
|
||||
numElements, numRows, plan->m_rowPitches, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
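
// Every leaf of the nested switches above is a single template instantiation
// of scanArrayRecursive, with the flags ordered <T, backward, exclusive, op>.
// The backward exclusive float sum branch, written out as a standalone helper:
static void exampleFloatSumScan(float *d_out, const float *d_in,
                                size_t numElements, size_t numRows,
                                const CUDPPScanPlan *plan)
{
    scanArrayRecursive<float, true, true, CUDPP_ADD>
        (d_out, d_in, (float**)plan->m_blockSums,
         numElements, numRows, plan->m_rowPitches, 0);
}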

#ifdef __cplusplus
}
#endif

/** @} */ // end scan functions
/** @} */ // end cudpp_app
@ -0,0 +1,166 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * sharedmem.h
 *
 * @brief Shared memory declaration struct for templatized types.
 *
 * Because dynamically sized shared memory arrays are declared "extern" in
 * CUDA, we can't templatize their types directly. To get around this, we
 * declare a simple wrapper struct that will declare the extern array with
 * a different name depending on the type. This avoids linker errors about
 * multiple definitions.
 *
 * To use dynamically allocated shared memory in a templatized __global__ or
 * __device__ function, just replace code like this:
 *
 * <pre>
 * template<class T>
 * __global__ void
 * foo( T* d_out, T* d_in)
 * {
 *     // Shared mem size is determined by the host app at run time
 *     extern __shared__ T sdata[];
 *     ...
 *     doStuff(sdata);
 *     ...
 * }
 * </pre>
 *
 * with this:
 * <pre>
 * template<class T>
 * __global__ void
 * foo( T* d_out, T* d_in)
 * {
 *     // Shared mem size is determined by the host app at run time
 *     SharedMemory<T> smem;
 *     T* sdata = smem.getPointer();
 *     ...
 *     doStuff(sdata);
 *     ...
 * }
 * </pre>
 */

#ifndef _SHAREDMEM_H_
#define _SHAREDMEM_H_


/** @brief Wrapper class for templatized dynamic shared memory arrays.
 *
 * This struct uses template specialization on the type \a T to declare
 * a differently named dynamic shared memory array for each type
 * (\code extern __shared__ T s_type[] \endcode).
 *
 * Currently there are specializations for the following types:
 * \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long,
 * \c unsigned long, \c bool, \c float, \c double, and \c uchar4. One can
 * also specialize it for user-defined types.
 */
template <typename T>
struct SharedMemory
{
    /** Return a pointer to the runtime-sized shared memory array. **/
    __device__ T* getPointer()
    {
        extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
        Error_UnsupportedType();
        return (T*)0;
    }
    // TODO: Use operator overloading to make this class look like a regular array
};

// Specializations for the following types:
// int, uint, char, uchar, short, ushort, long, ulong, bool, float,
// double, and uchar4. One could also specialize it for user-defined types.

template <>
struct SharedMemory <int>
{
    __device__ int* getPointer() { extern __shared__ int s_int[]; return s_int; }
};

template <>
struct SharedMemory <unsigned int>
{
    __device__ unsigned int* getPointer() { extern __shared__ unsigned int s_uint[]; return s_uint; }
};

template <>
struct SharedMemory <char>
{
    __device__ char* getPointer() { extern __shared__ char s_char[]; return s_char; }
};

template <>
struct SharedMemory <unsigned char>
{
    __device__ unsigned char* getPointer() { extern __shared__ unsigned char s_uchar[]; return s_uchar; }
};

template <>
struct SharedMemory <short>
{
    __device__ short* getPointer() { extern __shared__ short s_short[]; return s_short; }
};

template <>
struct SharedMemory <unsigned short>
{
    __device__ unsigned short* getPointer() { extern __shared__ unsigned short s_ushort[]; return s_ushort; }
};

template <>
struct SharedMemory <long>
{
    __device__ long* getPointer() { extern __shared__ long s_long[]; return s_long; }
};

template <>
struct SharedMemory <unsigned long>
{
    __device__ unsigned long* getPointer() { extern __shared__ unsigned long s_ulong[]; return s_ulong; }
};

template <>
struct SharedMemory <bool>
{
    __device__ bool* getPointer() { extern __shared__ bool s_bool[]; return s_bool; }
};

template <>
struct SharedMemory <float>
{
    __device__ float* getPointer() { extern __shared__ float s_float[]; return s_float; }
};

template <>
struct SharedMemory <double>
{
    __device__ double* getPointer() { extern __shared__ double s_double[]; return s_double; }
};

template <>
struct SharedMemory <uchar4>
{
    __device__ uchar4* getPointer() { extern __shared__ uchar4 s_uchar4[]; return s_uchar4; }
};
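
// A user-defined type gets shared-memory support the same way; a hypothetical
// specialization for CUDA's built-in float4 (not part of this header) would
// follow the identical pattern:
template <>
struct SharedMemory <float4>
{
    __device__ float4* getPointer() { extern __shared__ float4 s_float4[]; return s_float4; }
};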

#endif //_SHAREDMEM_H_

// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
@ -0,0 +1,449 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <math.h>
|
||||
|
||||
#include "gb_gpu_memory.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF;
|
||||
#define GBMT GB_GPU_Memory<numtyp,acctyp>
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
|
||||
const int inum, const int form_low, const int form_high) {
|
||||
int stride=gbm.nbor->nbor_pitch();
|
||||
int anall=gbm.atom->nall();
|
||||
if (gbm.shared_types) {
|
||||
GBMF.k_gb_nbor_fast.set_size(GX,BX);
|
||||
GBMF.k_gb_nbor_fast.run(&gbm.atom->dev_x.begin(),
|
||||
&gbm.cut_form.begin(), &gbm.nbor->dev_nbor.begin(), &stride,
|
||||
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
|
||||
&form_high, &anall);
|
||||
} else {
|
||||
GBMF.k_gb_nbor.set_size(GX,BX);
|
||||
GBMF.k_gb_nbor.run(&gbm.atom->dev_x.begin(), &gbm.cut_form.begin(),
|
||||
&gbm._lj_types, &gbm.nbor->dev_nbor.begin(), &stride,
|
||||
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
|
||||
&form_high, &anall);
|
||||
}
|
||||
}
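
// Illustrative wrapper (not in the original file) showing the launch-sizing
// convention used by callers of gb_gpu_pack_nbors() (see _gb_gpu_gayberne
// below): one thread per atom, grid size rounded up so GX*BX >= inum.
template<class numtyp, class acctyp>
inline void gb_gpu_pack_all_nbors(GBMT &gbm, const int inum,
                                  const int form_low, const int form_high) {
  const int BX=gbm.block_size();
  const int GX=static_cast<int>(ceil(static_cast<double>(inum)/
                                     static_cast<double>(BX)));
  gb_gpu_pack_nbors(gbm,GX,BX,0,inum,form_low,form_high);
}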
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
bool gb_gpu_init(const int ntypes, const double gamma,
|
||||
const double upsilon, const double mu, double **shape,
|
||||
double **well, double **cutsq, double **sigma,
|
||||
double **epsilon, double *host_lshape, int **form,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **offset, double *special_lj,
|
||||
const int inum, const int nall, const int max_nbors,
|
||||
const double cell_size, int &gpu_mode, FILE *screen) {
|
||||
GBMF.clear();
|
||||
gpu_mode=GBMF.device->gpu_mode();
|
||||
double gpu_split=GBMF.device->particle_split();
|
||||
int first_gpu=GBMF.device->first_device();
|
||||
int last_gpu=GBMF.device->last_device();
|
||||
int world_me=GBMF.device->world_me();
|
||||
int gpu_rank=GBMF.device->gpu_rank();
|
||||
int procs_per_gpu=GBMF.device->procs_per_gpu();
|
||||
|
||||
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
|
||||
|
||||
bool message=false;
|
||||
if (world_me==0 && screen)
|
||||
message=true;
|
||||
|
||||
if (message) {
|
||||
fprintf(screen,"Initializing GPU and compiling on process 0...");
|
||||
fflush(screen);
|
||||
}
|
||||
|
||||
if (world_me==0) {
|
||||
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
|
||||
sigma, epsilon, host_lshape, form, host_lj1,
|
||||
host_lj2, host_lj3, host_lj4, offset, special_lj,
|
||||
inum, nall, max_nbors, cell_size, gpu_split, screen);
|
||||
if (!init_ok)
|
||||
return false;
|
||||
}
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
|
||||
for (int i=0; i<procs_per_gpu; i++) {
|
||||
if (message) {
|
||||
if (last_gpu-first_gpu==0)
|
||||
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
|
||||
else
|
||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||
last_gpu,i);
|
||||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0) {
|
||||
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
|
||||
sigma, epsilon, host_lshape, form, host_lj1,
|
||||
host_lj2, host_lj3, host_lj4, offset, special_lj,
|
||||
inum, nall, max_nbors, cell_size, gpu_split,
|
||||
screen);
|
||||
if (!init_ok)
|
||||
return false;
|
||||
}
|
||||
MPI_Barrier(GBMF.device->gpu_comm);
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
fprintf(screen,"\n");
|
||||
return true;
|
||||
}
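
// Outline (not part of the original file) of the staged startup above;
// init_on_this_rank() is a hypothetical stand-in for the GBMF.init(...) call.
static void staged_gpu_init(int world_me, int gpu_rank, int procs_per_gpu,
                            MPI_Comm gpu_comm, void (*init_on_this_rank)()) {
  if (world_me==0)
    init_on_this_rank();            // process 0 compiles and initializes first
  MPI_Barrier(MPI_COMM_WORLD);      // everyone waits for the one-time compile
  for (int i=0; i<procs_per_gpu; i++) {
    if (gpu_rank==i && world_me!=0)
      init_on_this_rank();          // ranks sharing a GPU initialize in turn
    MPI_Barrier(gpu_comm);          // serialize within each GPU's communicator
  }
}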
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Clear memory on host and device
|
||||
// ---------------------------------------------------------------------------
|
||||
void gb_gpu_clear() {
|
||||
GBMF.clear();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Build neighbor list on device
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class gbmtyp>
|
||||
inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
|
||||
const int host_inum, const int nall,
|
||||
double **host_x, double **host_quat,
|
||||
int *host_type, double *boxlo,
|
||||
double *boxhi, bool &success) {
|
||||
gbm.nbor_time_avail=true;
|
||||
|
||||
success=true;
|
||||
gbm.resize_atom(inum,nall,success);
|
||||
gbm.resize_local(inum,host_inum,gbm.nbor->max_nbors(),0,success);
|
||||
if (!success)
|
||||
return;
|
||||
|
||||
gbm.atom->cast_copy_x(host_x,host_type);
|
||||
int mn;
|
||||
gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
|
||||
boxlo, boxhi, NULL, NULL, NULL, success, mn);
|
||||
gbm.nbor->copy_unpacked(inum,mn);
|
||||
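// With GPU-built lists there is no host-side reordering by particle shape,
// so every local particle is treated as an ellipse below.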
gbm.last_ellipse=inum;
|
||||
gbm.max_last_ellipse=inum;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy neighbor list from host and (if spheres) reorder so ellipses first
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class gbmtyp>
|
||||
void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,
|
||||
const int inum, const int osize,
|
||||
int *ilist, int *numj,
|
||||
int *type, int **firstneigh,
|
||||
bool &success) {
|
||||
success=true;
|
||||
|
||||
gbm.nbor_time_avail=true;
|
||||
|
||||
int mn=gbm.nbor->max_nbor_loop(inum,numj);
|
||||
gbm.resize_atom(inum,nall,success);
|
||||
gbm.resize_local(inum,0,mn,osize,success);
|
||||
if (!success)
|
||||
return;
|
||||
|
||||
if (gbm.multiple_forms) {
|
||||
int p=0;
|
||||
for (int i=0; i<osize; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
p++;
|
||||
}
|
||||
}
|
||||
gbm.max_last_ellipse=p;
|
||||
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
|
||||
for (int i=0; i<osize; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
p++;
|
||||
}
|
||||
}
|
||||
gbm.nbor->get_host(inum,gbm.host_olist.begin(),numj,firstneigh,
|
||||
gbm.block_size());
|
||||
gbm.nbor->copy_unpacked(inum,mn);
|
||||
return;
|
||||
}
|
||||
gbm.last_ellipse=inum;
|
||||
gbm.max_last_ellipse=inum;
|
||||
gbm.nbor->get_host(inum,ilist,numj,firstneigh,gbm.block_size());
|
||||
gbm.nbor->copy_unpacked(inum,mn);
|
||||
}
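
// Equivalent reordering (an illustration using C++11, not the original code):
// the two loops above compute a stable partition of ilist into host_olist,
// ellipsoidal particles first, relative order preserved within each group.
#include <algorithm>
template <class gbmtyp>
inline void reorder_ellipses_first(gbmtyp &gbm, const int osize,
                                   int *ilist, int *type) {
  std::copy(ilist,ilist+osize,gbm.host_olist.begin());
  int *mid=std::stable_partition(gbm.host_olist.begin(),
                                 gbm.host_olist.begin()+osize,
                                 [&](int i) {
                                   int t=type[i];
                                   return gbm.host_form[t][t]==ELLIPSE_ELLIPSE;
                                 });
  gbm.max_last_ellipse=static_cast<int>(mid-gbm.host_olist.begin());
}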
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=gbm.block_size();
|
||||
int eflag, vflag;
|
||||
if (_eflag)
|
||||
eflag=1;
|
||||
else
|
||||
eflag=0;
|
||||
|
||||
if (_vflag)
|
||||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
|
||||
int stride=gbm.nbor->nbor_pitch();
|
||||
int ainum=gbm.atom->inum();
|
||||
int anall=gbm.atom->nall();
|
||||
|
||||
if (gbm.multiple_forms) {
|
||||
gbm.time_kernel.start();
|
||||
if (gbm.last_ellipse>0) {
|
||||
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
|
||||
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
|
||||
static_cast<double>(BX)));
|
||||
gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
|
||||
ELLIPSE_ELLIPSE);
|
||||
gbm.time_kernel.stop();
|
||||
|
||||
gbm.time_gayberne.start();
|
||||
GBMF.k_gayberne.set_size(GX,BX);
|
||||
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(),
|
||||
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
|
||||
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
|
||||
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
|
||||
&stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
|
||||
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
|
||||
gbm.time_gayberne.stop();
|
||||
|
||||
if (gbm.last_ellipse==gbm.atom->inum()) {
|
||||
gbm.time_kernel2.start();
|
||||
gbm.time_kernel2.stop();
|
||||
gbm.time_gayberne2.start();
|
||||
gbm.time_gayberne2.stop();
|
||||
gbm.time_pair.start();
|
||||
gbm.time_pair.stop();
|
||||
return;
|
||||
}
|
||||
|
||||
// ------------ SPHERE_ELLIPSE ---------------
|
||||
|
||||
gbm.time_kernel2.start();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
|
||||
gbm.last_ellipse)/BX));
|
||||
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
|
||||
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
|
||||
gbm.time_kernel2.stop();
|
||||
|
||||
gbm.time_gayberne2.start();
|
||||
GBMF.k_sphere_gb.set_size(GX,BX);
|
||||
GBMF.k_sphere_gb.run(&gbm.atom->dev_x.begin(),&gbm.atom->dev_quat.begin(),
|
||||
&gbm.shape.begin(), &gbm.well.begin(),
|
||||
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
|
||||
&gbm._lj_types, &gbm.lshape.begin(),
|
||||
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
|
||||
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
|
||||
&vflag, &gbm.last_ellipse, &ainum, &anall);
|
||||
gbm.time_gayberne2.stop();
|
||||
} else {
|
||||
gbm.atom->dev_ans.zero();
|
||||
gbm.atom->dev_engv.zero();
|
||||
gbm.time_kernel.stop();
|
||||
gbm.time_gayberne.start();
|
||||
gbm.time_gayberne.stop();
|
||||
gbm.time_kernel2.start();
|
||||
gbm.time_kernel2.stop();
|
||||
gbm.time_gayberne2.start();
|
||||
gbm.time_gayberne2.stop();
|
||||
}
|
||||
|
||||
// ------------ LJ ---------------
|
||||
gbm.time_pair.start();
|
||||
if (gbm.last_ellipse<gbm.atom->inum()) {
|
||||
if (gbm.shared_types) {
|
||||
GBMF.k_lj_fast.set_size(GX,BX);
|
||||
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
|
||||
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
|
||||
&stride, &gbm.nbor->dev_packed.begin(),
|
||||
&gbm.atom->dev_ans.begin(),
|
||||
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
|
||||
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
|
||||
} else {
|
||||
GBMF.k_lj.set_size(GX,BX);
|
||||
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
|
||||
&gbm.lj3.begin(), &gbm._lj_types,
|
||||
&gbm.gamma_upsilon_mu.begin(), &stride,
|
||||
&gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
|
||||
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
|
||||
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
|
||||
}
|
||||
}
|
||||
gbm.time_pair.stop();
|
||||
} else {
|
||||
gbm.time_kernel.start();
|
||||
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
|
||||
ELLIPSE_ELLIPSE);
|
||||
gbm.time_kernel.stop();
|
||||
gbm.time_gayberne.start();
|
||||
GBMF.k_gayberne.set_size(GX,BX);
|
||||
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(), &gbm.atom->dev_quat.begin(),
|
||||
&gbm.shape.begin(), &gbm.well.begin(),
|
||||
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
|
||||
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
|
||||
&stride, &gbm.atom->dev_ans.begin(), &ainum,
|
||||
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
|
||||
&eflag, &vflag, &ainum, &anall);
|
||||
gbm.time_gayberne.stop();
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary and then compute forces, torques, energies
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class gbmtyp>
|
||||
inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
|
||||
const int inum_full, const int nall,
|
||||
double **host_x, int *host_type,
|
||||
double *boxlo, double *boxhi, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_quat) {
|
||||
gbm.acc_timers();
|
||||
if (inum_full==0) {
|
||||
gbm.zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
|
||||
int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full);
|
||||
gbm.atom->inum(inum);
|
||||
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
|
||||
host_start=inum;
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
_gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
|
||||
host_quat, host_type, boxlo, boxhi, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
gbm.atom->cast_quat_data(host_quat[0]);
|
||||
gbm.hd_balancer.start_timer();
|
||||
} else {
|
||||
gbm.atom->cast_x_data(host_x,host_type);
|
||||
gbm.atom->cast_quat_data(host_quat[0]);
|
||||
gbm.hd_balancer.start_timer();
|
||||
gbm.atom->add_x_data(host_x,host_type);
|
||||
}
|
||||
|
||||
gbm.atom->add_other_data();
|
||||
|
||||
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
|
||||
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
|
||||
gbm.hd_balancer.stop_timer();
|
||||
return gbm.device->nbor.host_nbor.begin();
|
||||
}
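
// Illustrative arithmetic only (the real logic lives in hd_balancer): with a
// fixed split fraction (particle_split above), the device's share of
// inum_full atoms would be computed along these lines.
static int fixed_split_gpu_count(const double split, const int inum_full) {
  return static_cast<int>(floor(split*inum_full));
}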
|
||||
|
||||
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *boxlo, double *boxhi, const bool eflag,
|
||||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success,
|
||||
double **host_quat) {
|
||||
return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x,
|
||||
host_type, boxlo, boxhi, eflag, vflag, eatom, vatom,
|
||||
host_start, cpu_time, success, host_quat);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy neighbor list from host if necessary, then compute forces, torques, and energies
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class gbmtyp>
|
||||
inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
|
||||
const int inum_full,const int nall,double **host_x,
|
||||
int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_quat) {
|
||||
gbm.acc_timers();
|
||||
if (inum_full==0) {
|
||||
gbm.zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int ago=gbm.hd_balancer.ago_first(f_ago);
|
||||
int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time,
|
||||
gbm.nbor->gpu_nbor());
|
||||
gbm.atom->inum(inum);
|
||||
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
|
||||
host_start=inum;
|
||||
|
||||
if (ago==0) {
|
||||
_gb_gpu_reset_nbors(gbm, nall, inum, inum_full, ilist, numj, host_type,
|
||||
firstneigh, success);
|
||||
if (!success)
|
||||
return NULL;
|
||||
}
|
||||
int *list;
|
||||
if (gbm.multiple_forms)
|
||||
list=gbm.host_olist.begin();
|
||||
else
|
||||
list=ilist;
|
||||
|
||||
gbm.atom->cast_x_data(host_x,host_type);
|
||||
gbm.atom->cast_quat_data(host_quat[0]);
|
||||
gbm.hd_balancer.start_timer();
|
||||
gbm.atom->add_x_data(host_x,host_type);
|
||||
gbm.atom->add_other_data();
|
||||
|
||||
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
|
||||
gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
|
||||
gbm.hd_balancer.stop_timer();
|
||||
return list;
|
||||
}
|
||||
|
||||
int * gb_gpu_compute(const int timestep, const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start, const double cpu_time,
|
||||
bool &success, double **host_quat) {
|
||||
return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x,
|
||||
host_type, ilist, numj, firstneigh, eflag, vflag,
|
||||
eatom, vatom, host_start, cpu_time, success,
|
||||
host_quat);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Return memory usage
|
||||
// ---------------------------------------------------------------------------
|
||||
double gb_gpu_bytes() {
|
||||
return GBMF.host_memory_usage();
|
||||
}
|
|
@ -1,595 +0,0 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
|
||||
Peng Wang (Nvidia), penwang@nvidia.com
|
||||
Paul Crozier (SNL), pscrozi@sandia.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include "nvc_macros.h"
|
||||
#include "nvc_timer.h"
|
||||
#include "nvc_device.h"
|
||||
#include "gb_gpu_memory.cu"
|
||||
#include "gb_gpu_kernel.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
|
||||
#define GBMT GB_GPU_Memory<numtyp,acctyp>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
|
||||
// -- Only pack neighbors matching the specified inclusive range of forms
|
||||
// -- Only pack neighbors within cutoff
|
||||
// ---------------------------------------------------------------------------
|
||||
template<class numtyp>
|
||||
__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
|
||||
const int start, const int inum,
|
||||
const int *dev_ij, const int form_low,
|
||||
const int form_high, const int nall) {
|
||||
|
||||
// ii indexes the two interacting particles in gi
|
||||
int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;
|
||||
|
||||
if (ii<inum) {
|
||||
int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
const int *list=dev_ij+*nbor;
|
||||
const int *list_end=list+numj;
|
||||
nbor+=nbor_pitch;
|
||||
int *nbor_newj=nbor;
|
||||
nbor+=nbor_pitch;
|
||||
|
||||
vec4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
int newj=0;
|
||||
for ( ; list<list_end; list++) {
|
||||
int j=*list;
|
||||
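// Indices >= nall carry special-bond encoding in multiples of nall; the
// modulo below recovers the real atom index (inferred from usage).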
if (j>=nall)
|
||||
j%=nall;
|
||||
vec4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
|
||||
// Compute r12;
|
||||
numtyp rsq=jx.x-ix.x;
|
||||
rsq*=rsq;
|
||||
numtyp t=jx.y-ix.y;
|
||||
rsq+=t*t;
|
||||
t=jx.z-ix.z;
|
||||
rsq+=t*t;
|
||||
|
||||
if (rsq< _cutsq_<numtyp>(itype,jtype)) {
|
||||
*nbor=j;
|
||||
nbor+=nbor_pitch;
|
||||
newj++;
|
||||
}
|
||||
}
|
||||
}
|
||||
*nbor_newj=newj;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
|
||||
// -- Only pack neighbors matching the specified inclusive range of forms
|
||||
// -- Only pack neighbors within cutoff
|
||||
// -- Fast version of routine that uses shared memory for LJ constants
|
||||
// ---------------------------------------------------------------------------
|
||||
template<class numtyp>
|
||||
__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
|
||||
const int start, const int inum,
|
||||
const int *dev_ij, const int form_low,
|
||||
const int form_high, const int nall) {
|
||||
|
||||
int ii=threadIdx.x;
|
||||
__shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
int itype=ii/MAX_SHARED_TYPES;
|
||||
int jtype=ii%MAX_SHARED_TYPES;
|
||||
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
|
||||
form[ii]=_form_(itype,jtype);
|
||||
}
|
||||
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
const int *list=dev_ij+*nbor;
|
||||
const int *list_end=list+numj;
|
||||
nbor+=nbor_pitch;
|
||||
int *nbor_newj=nbor;
|
||||
nbor+=nbor_pitch;
|
||||
|
||||
vec4 ix=x_[i];
|
||||
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
|
||||
|
||||
int newj=0;
|
||||
for ( ; list<list_end; list++) {
|
||||
int j=*list;
|
||||
if (j>=nall)
|
||||
j%=nall;
|
||||
vec4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
int mtype=itype+jtype;
|
||||
|
||||
if (form[mtype]>=form_low && form[mtype]<=form_high) {
|
||||
// Compute r12;
|
||||
numtyp rsq=jx.x-ix.x;
|
||||
rsq*=rsq;
|
||||
numtyp t=jx.y-ix.y;
|
||||
rsq+=t*t;
|
||||
t=jx.z-ix.z;
|
||||
rsq+=t*t;
|
||||
|
||||
if (rsq<cutsq[mtype]) {
|
||||
*nbor=j;
|
||||
nbor+=nbor_pitch;
|
||||
newj++;
|
||||
}
|
||||
}
|
||||
}
|
||||
*nbor_newj=newj;
|
||||
}
|
||||
}
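
// Illustrative standalone kernel (not part of this file) showing the
// shared-memory preload idiom used above: the first TYPES*TYPES threads each
// stage one per-type-pair constant, then __syncthreads() publishes the table
// to the whole block before the per-atom work begins. Here the constants come
// from a plain global array rather than the texture fetches used above.
#define EX_TYPES 8
__global__ void example_preload_constants(const float *g_cutsq, float *g_out) {
  __shared__ float cutsq[EX_TYPES*EX_TYPES];
  int ii=threadIdx.x;
  if (ii<EX_TYPES*EX_TYPES)
    cutsq[ii]=g_cutsq[ii];              // one constant per thread
  __syncthreads();                      // table now visible block-wide
  int tid=threadIdx.x+blockIdx.x*blockDim.x;
  g_out[tid]=cutsq[tid%(EX_TYPES*EX_TYPES)];  // stand-in for the real pair loop
}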
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
|
||||
const int inum, const int form_low, const int form_high) {
|
||||
if (gbm.shared_types) {
|
||||
kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
|
||||
gbm.atom.inum(), start, inum,
|
||||
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
|
||||
} else
|
||||
kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
|
||||
gbm.atom.inum(), start, inum,
|
||||
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Convert something to a string
|
||||
// ---------------------------------------------------------------------------
|
||||
#include <sstream>
|
||||
template <class t>
|
||||
inline string gb_gpu_toa(const t& in) {
|
||||
ostringstream o;
|
||||
o.precision(2);
|
||||
o << in;
|
||||
return o.str();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Return string with GPU info
|
||||
// ---------------------------------------------------------------------------
|
||||
EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
|
||||
string sname=GBMF[0].gpu.name(id)+", "+
|
||||
gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
|
||||
gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
|
||||
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
|
||||
strcpy(name,sname.c_str());
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
|
||||
const double upsilon, const double mu, double **shape,
|
||||
double **well, double **cutsq, double **sigma,
|
||||
double **epsilon, double *host_lshape, int **form,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **offset, double *special_lj,
|
||||
const int nlocal, const int nall,
|
||||
const int max_nbors, const int thread, const int gpu_id) {
|
||||
assert(thread<MAX_GPU_THREADS);
|
||||
|
||||
GBMF[thread].gpu.init();
|
||||
|
||||
if (GBMF[thread].gpu.num_devices()==0)
|
||||
return false;
|
||||
|
||||
ij_size=IJ_SIZE;
|
||||
return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
|
||||
well, cutsq, sigma, epsilon, host_lshape, form,
|
||||
host_lj1, host_lj2, host_lj3, host_lj4, offset,
|
||||
special_lj, nlocal, nall, max_nbors, false,
|
||||
gpu_id);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Clear memory on host and device
|
||||
// ---------------------------------------------------------------------------
|
||||
EXTERN void gb_gpu_clear(const int thread) {
|
||||
GBMF[thread].clear();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// copy atom positions, quaternions, and optionally types to device
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
|
||||
double **host_quat, const int *host_type,
|
||||
const bool rebuild, cudaStream_t &stream) {
|
||||
atom.time_atom.start();
|
||||
atom.reset_write_buffer();
|
||||
|
||||
// Rows 1-3 of dev_x are position; rows 4-7 are quaternion
|
||||
atom.add_x_data(host_x,host_type);
|
||||
atom.add_q_data(host_quat[0]);
|
||||
|
||||
atom.copy_x_data(stream);
|
||||
atom.copy_q_data(stream);
|
||||
atom.time_atom.stop();
|
||||
}
|
||||
|
||||
EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
|
||||
const int *host_type, const bool rebuild,
|
||||
const int thread) {
|
||||
_gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
|
||||
GBMF[thread].pair_stream);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Signal that we need to transfer a new neighbor list
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class gbmtyp>
|
||||
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
|
||||
const int inum, int *ilist, const int *numj,
|
||||
const int *type, bool &success) {
|
||||
success=true;
|
||||
|
||||
gbm.nbor.time_nbor.start();
|
||||
|
||||
int mn=0;
|
||||
for (int i=0; i<inum; i++)
|
||||
mn=std::max(mn,numj[i]);
|
||||
|
||||
if (nall>gbm.max_atoms)
|
||||
gbm.resize_atom(nall,success);
|
||||
if (nlocal>gbm.max_local || mn>gbm._max_nbors)
|
||||
gbm.resize_local(nlocal,mn,success);
|
||||
if (!success)
|
||||
return false;
|
||||
|
||||
gbm.atom.nall(nall);
|
||||
gbm.atom.inum(inum);
|
||||
|
||||
if (gbm.multiple_forms) {
|
||||
int ij_size=gbm.nbor.host_ij.numel();
|
||||
if (inum*2<ij_size) {
|
||||
int p=0, acc=0;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[p]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[p+inum]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
}
|
||||
}
|
||||
gbm.last_ellipse=p;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[p]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[p+inum]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
}
|
||||
}
|
||||
gbm.nbor.ij_total=0;
|
||||
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
|
||||
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum,
|
||||
2*inum,gbm.pair_stream);
|
||||
} else {
|
||||
int p=0, acc=0;
|
||||
int offset=0;
|
||||
int half=ij_size/2;
|
||||
int hi=0;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[hi]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[hi+half]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
hi++;
|
||||
if (hi==half) {
|
||||
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
|
||||
half,gbm.pair_stream);
|
||||
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
|
||||
inum*2+offset,
|
||||
half,gbm.pair_stream);
|
||||
hi=0;
|
||||
offset+=half;
|
||||
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
|
||||
}
|
||||
}
|
||||
}
|
||||
gbm.last_ellipse=p;
|
||||
for (int i=0; i<inum; i++) {
|
||||
int itype=type[ilist[i]];
|
||||
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
|
||||
gbm.host_olist[p]=ilist[i];
|
||||
gbm.nbor.host_ij[hi]=numj[ilist[i]];
|
||||
gbm.nbor.host_ij[hi+half]=acc;
|
||||
acc+=numj[ilist[i]];
|
||||
p++;
|
||||
hi++;
|
||||
if (hi==half) {
|
||||
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
|
||||
half,gbm.pair_stream);
|
||||
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
|
||||
inum*2+offset,
|
||||
half,gbm.pair_stream);
|
||||
hi=0;
|
||||
offset+=half;
|
||||
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
|
||||
}
|
||||
}
|
||||
}
|
||||
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
|
||||
if (hi>0) {
|
||||
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
|
||||
hi,gbm.pair_stream);
|
||||
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
|
||||
inum*2+offset,
|
||||
hi,gbm.pair_stream);
|
||||
}
|
||||
gbm.nbor.ij_total=0;
|
||||
}
|
||||
} else {
|
||||
gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
|
||||
gbm.last_ellipse=inum;
|
||||
}
|
||||
|
||||
gbm.nbor.time_nbor.stop();
|
||||
|
||||
if (gbm.multiple_forms)
|
||||
return gbm.host_olist.begin();
|
||||
return ilist;
|
||||
}
|
||||
|
||||
EXTERN int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
|
||||
int *ilist, const int *numj, const int *type,
|
||||
const int thread, bool &success) {
|
||||
return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
|
||||
success);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy a set of ij_size ij interactions to device and compute energies,
|
||||
// forces, and torques for those interactions
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class gbmtyp>
|
||||
void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
|
||||
const bool eflag) {
|
||||
gbm.nbor.time_nbor.add_to_total();
|
||||
// CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
|
||||
|
||||
memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
|
||||
gbm.nbor.time_nbor.start();
|
||||
gbm.nbor.add(num_ij,gbm.pair_stream);
|
||||
gbm.nbor.time_nbor.stop();
|
||||
}
|
||||
|
||||
EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
|
||||
const int thread) {
|
||||
_gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
|
||||
}
|
||||
|
||||
|
||||
template<class numtyp, class acctyp>
|
||||
void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
|
||||
gbm.atom.time_answer.start();
|
||||
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
|
||||
gbm.atom.time_answer.stop();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques for all ij interactions
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
|
||||
const bool rebuild) {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=BLOCK_1D;
|
||||
int ans_pitch=6;
|
||||
if (eflag)
|
||||
ans_pitch++;
|
||||
if (vflag)
|
||||
ans_pitch+=6;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
|
||||
|
||||
if (gbm.multiple_forms) {
|
||||
gbm.time_kernel.start();
|
||||
if (gbm.last_ellipse>0) {
|
||||
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
|
||||
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
|
||||
static_cast<double>(BX)));
|
||||
pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE);
|
||||
gbm.time_kernel.stop();
|
||||
|
||||
gbm.time_gayberne.start();
|
||||
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
|
||||
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
|
||||
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
|
||||
eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
|
||||
gbm.time_gayberne.stop();
|
||||
|
||||
if (gbm.last_ellipse==gbm.atom.inum()) {
|
||||
gbm.time_kernel2.start();
|
||||
gbm.time_kernel2.stop();
|
||||
gbm.time_gayberne2.start();
|
||||
gbm.time_gayberne2.stop();
|
||||
gbm.time_pair.start();
|
||||
gbm.time_pair.stop();
|
||||
return;
|
||||
}
|
||||
|
||||
// ------------ SPHERE_ELLIPSE ---------------
|
||||
|
||||
gbm.time_kernel2.start();
|
||||
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
|
||||
gbm.last_ellipse)/BX));
|
||||
pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
|
||||
ELLIPSE_SPHERE);
|
||||
gbm.time_kernel2.stop();
|
||||
|
||||
gbm.time_gayberne2.start();
|
||||
kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
|
||||
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
|
||||
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
|
||||
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
|
||||
gbm.time_gayberne2.stop();
|
||||
} else {
|
||||
gbm.atom.ans.zero();
|
||||
gbm.time_kernel.stop();
|
||||
gbm.time_gayberne.start();
|
||||
gbm.time_gayberne.stop();
|
||||
gbm.time_kernel2.start();
|
||||
gbm.time_kernel2.stop();
|
||||
gbm.time_gayberne2.start();
|
||||
gbm.time_gayberne2.stop();
|
||||
}
|
||||
|
||||
// ------------ LJ ---------------
|
||||
gbm.time_pair.start();
|
||||
if (gbm.last_ellipse<gbm.atom.inum()) {
|
||||
if (gbm.shared_types)
|
||||
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
|
||||
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
|
||||
vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
|
||||
else
|
||||
kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
|
||||
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
|
||||
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
|
||||
}
|
||||
gbm.time_pair.stop();
|
||||
} else {
|
||||
gbm.time_kernel.start();
|
||||
pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
|
||||
gbm.time_kernel.stop();
|
||||
|
||||
gbm.time_gayberne.start();
|
||||
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
|
||||
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
|
||||
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
|
||||
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
|
||||
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(),
|
||||
eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
|
||||
gbm.time_gayberne.stop();
|
||||
}
|
||||
}
|
||||
|
||||
EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
|
||||
const int thread) {
|
||||
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
|
||||
_gb_gpu_enqueue<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Get energies, forces, and torques to host
|
||||
// ---------------------------------------------------------------------------
|
||||
template<class numtyp, class acctyp>
|
||||
double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
|
||||
const bool eflag, const bool vflag, const bool eflag_atom,
|
||||
const bool vflag_atom, double *eatom, double **vatom,
|
||||
double *virial) {
|
||||
double evdw;
|
||||
|
||||
gbm.atom.time_atom.add_to_total();
|
||||
gbm.nbor.time_nbor.add_to_total();
|
||||
gbm.time_kernel.add_to_total();
|
||||
gbm.time_gayberne.add_to_total();
|
||||
if (gbm.multiple_forms) {
|
||||
gbm.time_kernel2.add_to_total();
|
||||
gbm.time_gayberne2.add_to_total();
|
||||
gbm.time_pair.add_to_total();
|
||||
}
  CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
  if (gbm.last_ellipse>gbm.atom.inum()) {
    if (eflag || vflag)
      evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
                                  f,tor,gbm.atom.inum());
    else
      gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
  } else {
    if (eflag || vflag)
      evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
                                  f,tor,gbm.last_ellipse);
    else
      gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
  }
  gbm.atom.time_answer.add_to_total();
  return evdw;
}

EXTERN double gb_gpu_forces(double **f, double **tor, const int *ilist,
                            const bool eflag, const bool vflag, const bool eflag_atom,
                            const bool vflag_atom, double *eatom, double **vatom,
                            double *virial, const int thread) {
  return _gb_gpu_forces<PRECISION,ACC_PRECISION>
    (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
     vflag_atom,eatom,vatom,virial);
}

EXTERN void gb_gpu_time(const int i) {
  cout.precision(4);
  cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds()
       << " s.\n"
       << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds()
       << " s.\n"
       << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
          GBMF[i].time_kernel2.total_seconds() << " s.\n"
       << "Force calc: " << GBMF[i].time_gayberne.total_seconds()+
          GBMF[i].time_gayberne2.total_seconds() << " s.\n";
  if (GBMF[i].multiple_forms)
    cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n";
  cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds()
       << " s.\n";
}

EXTERN int gb_gpu_num_devices() {
  return GBMF[0].gpu.num_devices();
}

EXTERN double gb_gpu_bytes() {
  return GBMF[0].host_memory_usage();
}
@@ -12,44 +12,60 @@
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H

#include "math.h"
#include "stdio.h"
#include "string.h"
#define MAX_SHARED_TYPES 8
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
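// the form of each type pair routes it to the matching kernel: the plain
// LJ path for SPHERE_SPHERE, the Gay-Berne paths otherwise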

/* ----------------------------------------------------------------------
   Atomic update of global memory
------------------------------------------------------------------------- */
/*
template <class numtyp> __device__
inline void atomicAdd(numtyp *address, numtyp val);

template <>
__device__ inline void atomicAdd<float>(float *address, float val)
{
  int i_val = __float_as_int(val);
  int tmp0 = 0;
  int tmp1;

  while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) {
    tmp0 = tmp1;
    i_val = __float_as_int(val + __int_as_float(tmp1));
  }
}*/

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif

#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif

#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
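// numtyp is the working precision and acctyp the accumulator precision;
// building with -D_DOUBLE_DOUBLE or -D_SINGLE_DOUBLE overrides the
// all-single default above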

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"

#else

#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline

#endif
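// the same kernel source builds as CUDA (NV_KERNEL, via Geryon's
// ucl_nv_kernel.h) or as OpenCL, where the macros above supply the
// CUDA-style names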

/* ----------------------------------------------------------------------
   dot product of 2 vectors
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
  return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}

@@ -58,9 +74,7 @@ static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
   cross product of 2 vectors
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_cross3(const numtyp *v1,
                                             const numtyp *v2, numtyp *ans)
__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
{
  ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
  ans[1] = v1[2]*v2[0]-v1[0]*v2[2];

@@ -71,8 +85,7 @@ static __inline__ __device__ void gpu_cross3(const numtyp *v1,
   determinant of a matrix
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
__inline numtyp gpu_det3(const numtyp m[9])
{
  numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
               m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +

@@ -84,47 +97,25 @@ static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
   diagonal matrix times a full matrix
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9],
                                                  numtyp ans[9])
__inline void gpu_times3(const numtyp4 shape, const numtyp m[9],
                         numtyp ans[9])
{
  ans[0] = _well_<numtyp>(i,0)*m[0];
  ans[1] = _well_<numtyp>(i,0)*m[1];
  ans[2] = _well_<numtyp>(i,0)*m[2];
  ans[3] = _well_<numtyp>(i,1)*m[3];
  ans[4] = _well_<numtyp>(i,1)*m[4];
  ans[5] = _well_<numtyp>(i,1)*m[5];
  ans[6] = _well_<numtyp>(i,2)*m[6];
  ans[7] = _well_<numtyp>(i,2)*m[7];
  ans[8] = _well_<numtyp>(i,2)*m[8];
}

/* ----------------------------------------------------------------------
   diagonal matrix times a full matrix
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9],
                                                   numtyp ans[9])
{
  ans[0] = _shape_<numtyp>(i,0)*m[0];
  ans[1] = _shape_<numtyp>(i,0)*m[1];
  ans[2] = _shape_<numtyp>(i,0)*m[2];
  ans[3] = _shape_<numtyp>(i,1)*m[3];
  ans[4] = _shape_<numtyp>(i,1)*m[4];
  ans[5] = _shape_<numtyp>(i,1)*m[5];
  ans[6] = _shape_<numtyp>(i,2)*m[6];
  ans[7] = _shape_<numtyp>(i,2)*m[7];
  ans[8] = _shape_<numtyp>(i,2)*m[8];
  ans[0] = shape.x*m[0];
  ans[1] = shape.x*m[1];
  ans[2] = shape.x*m[2];
  ans[3] = shape.y*m[3];
  ans[4] = shape.y*m[4];
  ans[5] = shape.y*m[5];
  ans[6] = shape.z*m[6];
  ans[7] = shape.z*m[7];
  ans[8] = shape.z*m[8];
}

/* ----------------------------------------------------------------------
   add two matrices
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_plus3(const numtyp m[9],
                                            const numtyp m2[9], numtyp ans[9])
__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
{
  ans[0] = m[0]+m2[0];
  ans[1] = m[1]+m2[1];

@@ -141,10 +132,8 @@ static __inline__ __device__ void gpu_plus3(const numtyp m[9],
   multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9],
                                                       const numtyp m2[9],
                                                       numtyp ans[9])
__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
                                   numtyp ans[9])
{
  ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
  ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];

@@ -161,9 +150,7 @@ static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9],
   row vector times matrix
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_row_times3(const numtyp *v,
                                                 const numtyp m[9], numtyp *ans)
__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
{
  ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
  ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];

@@ -176,10 +163,8 @@ static __inline__ __device__ void gpu_row_times3(const numtyp *v,
   error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
                                                const numtyp *v, numtyp *ans,
                                                int *error_flag)
__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
                            __global int *error_flag)
{
  // create augmented matrix for pivoting

@@ -297,12 +282,10 @@ static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
   quat = [w i j k]
------------------------------------------------------------------------- */

template <class numtyp>
static __inline__ __device__ void gpu_quat_to_mat_trans(const vec4 *qif,
                                                        const int qi,
                                                        numtyp mat[9])
__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
                                    numtyp mat[9])
{
  vec4 q=qif[qi];
  numtyp4 q=qif[qi];

  numtyp w2 = q.x*q.x;
  numtyp i2 = q.y*q.y;
@@ -0,0 +1,383 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL

#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif

__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
                                 numtyp ans[9])
{
  numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
               m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
               m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
  den = (numtyp)1.0/den;

  ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
                    m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
                    m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
                    m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
                    m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;

  ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
                    (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
                    (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
                    m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
                    m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;

  ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
                    m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
                    m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
                    (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
                    m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;

  ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
                    m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
                    m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
                    m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
                    m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;

  ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
                    (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
                    (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
                    m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
                    m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;

  ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
                    m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
                    (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
                    m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
                    (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;

  ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
                    (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
                    m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
                    m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
                    m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;

  ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
                     (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
                     (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
                     m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
                     m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;

  ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
                    m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
                    m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
                    (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
                    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
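// the caller crosses each row of the rotation matrix with these terms to
// turn the shape-weighted derivative of eta into a torque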

__kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
                              __global numtyp4* shape, __global numtyp4* well,
                              __global numtyp *gum, __global numtyp2* sig_eps,
                              const int ntypes, __global numtyp *lshape,
                              __global int *dev_nbor, const int stride,
                              __global acctyp4 *ans, const int astride,
                              __global acctyp *engv, __global int *err_flag,
                              const int eflag, const int vflag, const int inum,
                              const int nall) {
  __local numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
  if (ii<4)
    sp_lj[ii]=gum[ii+3];
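  // gum packs {gamma, upsilon, mu} in gum[0..2] and the four special_lj
  // factors in gum[3..6]; only the special factors are cached locally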
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp4 tor;
    tor.x=(numtyp)0;
    tor.y=(numtyp)0;
    tor.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=stride;
    int numj=*nbor;
    nbor+=stride;
    __global int *nbor_end=nbor+mul24(stride,numj);

    numtyp4 ix=x_[i];
    int itype=ix.w;
    numtyp a1[9], b1[9], g1[9];
    numtyp4 ishape=shape[itype];
    {
      numtyp t[9];
      gpu_quat_to_mat_trans(q,i,a1);
      gpu_times3(ishape,a1,t);
      gpu_transpose_times3(a1,t,g1);
      gpu_times3(well[itype],a1,t);
      gpu_transpose_times3(a1,t,b1);
    }

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=stride) {

      int j=*nbor;
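      // indices >= nall flag special-bond neighbors: j/nall selects the
      // 1-2/1-3/1-4 scale factor and j%nall recovers the real index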
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      numtyp4 jx=x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp r12[3];
      r12[0] = jx.x-ix.x;
      r12[1] = jx.y-ix.y;
      r12[2] = jx.z-ix.z;
      numtyp ir = gpu_dot3(r12,r12);

      ir = rsqrt(ir);
      numtyp r = (numtyp)1.0/ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(q,j,a2);

      numtyp u_r, dUr[3], tUr[3], eta, teta[3];
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          numtyp g2[9];
          {
            gpu_times3(shape[jtype],a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            gpu_plus3(g1,g2,g12);
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- replace r12 with r12 hat
            r12[0]*=ir;
            r12[1]*=ir;
            r12[2]*=ir;

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              int mtype=mul24(ntypes,itype)+jtype;
              numtyp sigma = sig_eps[mtype].x;
              numtyp epsilon = sig_eps[mtype].y;
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
              dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
              dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
            }

            // torque for particle 1
            {
              numtyp tempv[3], tempv2[3];
              tempv[0] = -uslj_rsq*kappa[0];
              tempv[1] = -uslj_rsq*kappa[1];
              tempv[2] = -uslj_rsq*kappa[2];
              gpu_row_times3(kappa,g1,tempv2);
              gpu_cross3(tempv,tempv2,tUr);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
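          // i.e. eta = (2*lshape_i*lshape_j / det(g12))^upsilon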
        }

        // Compute teta
        numtyp temp[9], tempv[3], tempv2[3];
        compute_eta_torque(g12,a1,ishape,temp);
        numtyp temp1 = -eta*gum[1];

        tempv[0] = temp1*temp[0];
        tempv[1] = temp1*temp[1];
        tempv[2] = temp1*temp[2];
        gpu_cross3(a1,tempv,tempv2);
        teta[0] = tempv2[0];
        teta[1] = tempv2[1];
        teta[2] = tempv2[2];

        tempv[0] = temp1*temp[3];
        tempv[1] = temp1*temp[4];
        tempv[2] = temp1*temp[5];
        gpu_cross3(a1+3,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];

        tempv[0] = temp1*temp[6];
        tempv[1] = temp1*temp[7];
        tempv[2] = temp1*temp[8];
        gpu_cross3(a1+6,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];
      }

      numtyp chi, dchi[3], tchi[3];
      { // Compute chi and dchi

        // Compute b12
        numtyp b2[9], b12[9];
        {
          gpu_times3(well[jtype],a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          gpu_plus3(b1,b2,b12);
        }

        // compute chi_12
        r12[0]*=r;
        r12[1]*=r;
        r12[2]*=r;
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        r12[0]*=ir;
        r12[1]*=ir;
        r12[2]*=ir;
        chi = gpu_dot3(r12,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);
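        // i.e. chi = (2 * r12hat.iota)^mu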

        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
                                                     gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12[2]);

        // compute t_chi
        numtyp tempv[3];
        gpu_row_times3(iota,b1,tempv);
        gpu_cross3(tempv,iota,tchi);
        temp1 = (numtyp)-4.0*ir*ir;
        tchi[0] *= temp1;
        tchi[1] *= temp1;
        tchi[2] *= temp1;
      }

      numtyp temp2 = factor_lj*eta*chi;
      if (eflag>0)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag>0) {
        r12[0]*=-r;
        r12[1]*=-r;
        r12[2]*=-r;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        f.x+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        f.y+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        f.z+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        f.x+=temp1*dchi[0]-temp2*dUr[0];
        f.y+=temp1*dchi[1]-temp2*dUr[1];
        f.z+=temp1*dchi[2]-temp2*dUr[2];
      }

      // Torque on 1
      temp1 = -u_r*eta*factor_lj;
      temp2 = -u_r*chi*factor_lj;
      numtyp temp3 = -chi*eta*factor_lj;
      tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
      tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
      tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=astride;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=astride;
      }
    }
    ans[ii]=f;
    ans[ii+astride]=tor;
  } // if ii
}

#endif
@@ -1,863 +0,0 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL

#include "gb_gpu_extra.h"

template <class numtyp>
static __inline__ __device__ void compute_eta_torque(numtyp m[9],
                                                     numtyp m2[9],
                                                     const int i,
                                                     numtyp ans[9])
{
  numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
               m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
               m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
  den = (numtyp)1.0/den;

  numtyp shapex=_shape_<numtyp>(i,0);
  numtyp shapey=_shape_<numtyp>(i,1);
  numtyp shapez=_shape_<numtyp>(i,2);

  ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
                   m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
                   m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
                   m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
                   m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;

  ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
                   (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
                   (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
                   m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
                   m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;

  ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
                   m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
                   m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
                   (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
                   m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;

  ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
                   m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
                   m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
                   m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
                   m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;

  ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
                   (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
                   (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
                   m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
                   m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;

  ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
                   m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
                   (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
                   m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
                   (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;

  ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
                   (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
                   m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
                   m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
                   m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;

  ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
                    (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
                    (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
                    m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
                    m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;

  ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
                   m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
                   m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
                   (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
                   m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}

#include "gb_gpu_kernel.h"

template<class numtyp, class acctyp>
__global__ void kernel_gayberne(const vec4* x_, const vec4 *q,
                                const numtyp *gum, const numtyp *special_lj,
                                const int *dev_nbor, const size_t nbor_pitch,
                                acctyp *ans, size_t ans_pitch, int *err_flag,
                                const bool eflag, const bool vflag,
                                const int inum, const int nall) {

  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  ii+=INT_MUL(blockIdx.x,blockDim.x);
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp torx=(numtyp)0;
    acctyp tory=(numtyp)0;
    acctyp torz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *nbor_end=nbor+nbor_pitch*numj;

    vec4 ix=x_[i];
    int itype=ix.w;
    numtyp a1[9], b1[9], g1[9];
    {
      numtyp t[9];
      gpu_quat_to_mat_trans(q,i,a1);
      gpu_shape_times3(itype,a1,t);
      gpu_transpose_times3(a1,t,g1);
      gpu_well_times3(itype,a1,t);
      gpu_transpose_times3(a1,t,b1);
    }

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      vec4 jx=x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp r12[3];
      r12[0] = jx.x-ix.x;
      r12[1] = jx.y-ix.y;
      r12[2] = jx.z-ix.z;
      numtyp ir = gpu_dot3(r12,r12);

      ir = rsqrt(ir);
      numtyp r = (numtyp)1.0/ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(q,j,a2);

      numtyp u_r, dUr[3], tUr[3], eta, teta[3];
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          numtyp g2[9];
          {
            gpu_shape_times3(jtype,a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            gpu_plus3(g1,g2,g12);
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- replace r12 with r12 hat
            r12[0]*=ir;
            r12[1]*=ir;
            r12[2]*=ir;

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              numtyp sigma = _sigma_<numtyp>(itype,jtype);
              numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
              dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
              dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
            }

            // torque for particle 1
            {
              numtyp tempv[3], tempv2[3];
              tempv[0] = -uslj_rsq*kappa[0];
              tempv[1] = -uslj_rsq*kappa[1];
              tempv[2] = -uslj_rsq*kappa[2];
              gpu_row_times3(kappa,g1,tempv2);
              gpu_cross3(tempv,tempv2,tUr);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
        }

        // Compute teta
        numtyp temp[9], tempv[3], tempv2[3];
        compute_eta_torque(g12,a1,itype,temp);
        numtyp temp1 = -eta*gum[1];

        tempv[0] = temp1*temp[0];
        tempv[1] = temp1*temp[1];
        tempv[2] = temp1*temp[2];
        gpu_cross3(a1,tempv,tempv2);
        teta[0] = tempv2[0];
        teta[1] = tempv2[1];
        teta[2] = tempv2[2];

        tempv[0] = temp1*temp[3];
        tempv[1] = temp1*temp[4];
        tempv[2] = temp1*temp[5];
        gpu_cross3(a1+3,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];

        tempv[0] = temp1*temp[6];
        tempv[1] = temp1*temp[7];
        tempv[2] = temp1*temp[8];
        gpu_cross3(a1+6,tempv,tempv2);
        teta[0] += tempv2[0];
        teta[1] += tempv2[1];
        teta[2] += tempv2[2];
      }

      numtyp chi, dchi[3], tchi[3];
      { // Compute chi and dchi

        // Compute b12
        numtyp b2[9], b12[9];
        {
          gpu_well_times3(jtype,a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          gpu_plus3(b1,b2,b12);
        }

        // compute chi_12
        r12[0]*=r;
        r12[1]*=r;
        r12[2]*=r;
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        r12[0]*=ir;
        r12[1]*=ir;
        r12[2]*=ir;
        chi = gpu_dot3(r12,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);

        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12[2]);

        // compute t_chi
        numtyp tempv[3];
        gpu_row_times3(iota,b1,tempv);
        gpu_cross3(tempv,iota,tchi);
        temp1 = (numtyp)-4.0*ir*ir;
        tchi[0] *= temp1;
        tchi[1] *= temp1;
        tchi[2] *= temp1;
      }

      numtyp temp2 = factor_lj*eta*chi;
      if (eflag)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag) {
        r12[0]*=-r;
        r12[1]*=-r;
        r12[2]*=-r;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        fx+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        fy+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        fz+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        fx+=temp1*dchi[0]-temp2*dUr[0];
        fy+=temp1*dchi[1]-temp2*dUr[1];
        fz+=temp1*dchi[2]-temp2*dUr[2];
      }

      // Torque on 1
      temp1 = -u_r*eta*factor_lj;
      temp2 = -u_r*chi*factor_lj;
      numtyp temp3 = -chi*eta*factor_lj;
      torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
      tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
      torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];

    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii*ans_pitch;
    if (eflag) {
      *ap1=energy;
      ap1++;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1++;
      }
    }
    *ap1=fx;
    ap1++;
    *ap1=fy;
    ap1++;
    *ap1=fz;
    ap1++;
    *ap1=torx;
    ap1++;
    *ap1=tory;
    ap1++;
    *ap1=torz;

  } // if ii

}

template<class numtyp, class acctyp>
__global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q,
                                 const numtyp *gum, const numtyp *special_lj,
                                 const int *dev_nbor, const size_t nbor_pitch,
                                 acctyp *ans, size_t ans_pitch, int *err_flag,
                                 const bool eflag, const bool vflag,
                                 const int start, const int inum,
                                 const int nall) {
  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *nbor_end=nbor+nbor_pitch*numj;

    vec4 ix=x_[i];
    int itype=ix.w;

    numtyp oner=_shape_<numtyp>(itype,0);
    numtyp one_well=_well_<numtyp>(itype,0);

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      vec4 jx=x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp r12[3];
      r12[0] = jx.x-ix.x;
      r12[1] = jx.y-ix.y;
      r12[2] = jx.z-ix.z;
      numtyp ir = gpu_dot3(r12,r12);

      ir = rsqrt(ir);
      numtyp r = (numtyp)1.0/ir;

      numtyp r12hat[3];
      r12hat[0]=r12[0]*ir;
      r12hat[1]=r12[1]*ir;
      r12hat[2]=r12[2]*ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(q,j,a2);

      numtyp u_r, dUr[3], eta;
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          {
            numtyp g2[9];
            gpu_shape_times3(jtype,a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            g12[0]=g2[0]+oner;
            g12[4]=g2[4]+oner;
            g12[8]=g2[8]+oner;
            g12[1]=g2[1];
            g12[2]=g2[2];
            g12[3]=g2[3];
            g12[5]=g2[5];
            g12[6]=g2[6];
            g12[7]=g2[7];
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12hat,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              numtyp sigma = _sigma_<numtyp>(itype,jtype);
              numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12hat);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
              dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
              dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
        }
      }

      numtyp chi, dchi[3];
      { // Compute chi and dchi

        // Compute b12
        numtyp b12[9];
        {
          numtyp b2[9];
          gpu_well_times3(jtype,a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          b12[0]=b2[0]+one_well;
          b12[4]=b2[4]+one_well;
          b12[8]=b2[8]+one_well;
          b12[1]=b2[1];
          b12[2]=b2[2];
          b12[3]=b2[3];
          b12[5]=b2[5];
          b12[6]=b2[6];
          b12[7]=b2[7];
        }

        // compute chi_12
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        chi = gpu_dot3(r12hat,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);

        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12hat);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
      }

      numtyp temp2 = factor_lj*eta*chi;
      if (eflag)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag) {
        r12[0]*=-1;
        r12[1]*=-1;
        r12[2]*=-1;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        fx+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        fy+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        fz+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        fx+=temp1*dchi[0]-temp2*dUr[0];
        fy+=temp1*dchi[1]-temp2*dUr[1];
        fz+=temp1*dchi[2]-temp2*dUr[2];
      }
    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii*ans_pitch;
    if (eflag) {
      *ap1=energy;
      ap1++;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1++;
      }
    }
    *ap1=fx;
    ap1++;
    *ap1=fy;
    ap1++;
    *ap1=fz;
  } // if ii
}

template<class numtyp, class acctyp>
__global__ void kernel_lj(const vec4 *x_,
                          const numtyp *special_lj, const int *dev_nbor,
                          const size_t nbor_pitch, const int *dev_ij, acctyp *ans,
                          size_t ans_pitch, int *err_flag, const bool eflag,
                          const bool vflag, const int start, const int inum,
                          const int nall) {
  __shared__ numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    vec4 ix=x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      vec4 jx=x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
          _form_(itype,jtype)==SPHERE_SPHERE) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
                                    _lj1_<numtyp>(itype,jtype).y);
        force*=factor_lj;

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
                          _lj3_<numtyp>(itype,jtype).y);
          energy+=factor_lj*(e-_offset_<numtyp>(1,1));
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii*ans_pitch;
    if (eflag) {
      *ap1+=energy;
      ap1++;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1+=virial[i];
        ap1++;
      }
    }
    *ap1+=fx;
    ap1++;
    *ap1+=fy;
    ap1++;
    *ap1+=fz;

  } // if ii
}

template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const vec4 *x_,
                               const numtyp *special_lj, const int *dev_nbor,
                               const size_t nbor_pitch, const int *dev_ij,
                               acctyp *ans, size_t ans_pitch,int *err_flag,
                               const bool eflag, const bool vflag,
                               const int start, const int inum, const int nall){
  // ii indexes the two interacting particles in gi
  int ii=threadIdx.x;
  __shared__ numtyp sp_lj[4];
  __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<4)
    sp_lj[ii]=special_lj[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    int itype=ii/MAX_SHARED_TYPES;
    int jtype=ii%MAX_SHARED_TYPES;
    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
    form[ii]=_form_(itype,jtype);
    lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
    lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
    if (eflag) {
      lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
      lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
      offset[ii]=_offset_<numtyp>(itype,jtype);
    }
  }
  ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp fx=(numtyp)0;
    acctyp fy=(numtyp)0;
    acctyp fz=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    const int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    const int *list=dev_ij+*nbor;
    const int *list_end=list+numj;

    vec4 ix=x_[i];
    int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);

    numtyp factor_lj;
    for ( ; list<list_end; list++) {

      int j=*list;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      vec4 jx=x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);

        fx+=delx*force;
        fy+=dely*force;
        fz+=delz*force;

        if (eflag) {
          numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
          energy+=factor_lj*(e-offset[mtype]);
        }
        if (vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    acctyp *ap1=ans+ii*ans_pitch;
    if (eflag) {
      *ap1+=energy;
      ap1++;
    }
    if (vflag) {
      for (int i=0; i<6; i++) {
        *ap1+=virial[i];
        ap1++;
      }
    }
    *ap1+=fx;
    ap1++;
    *ap1+=fy;
    ap1++;
    *ap1+=fz;

  } // if ii
}

#endif
@@ -0,0 +1,472 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef GB_GPU_KERNEL_LJ
#define GB_GPU_KERNEL_LJ

#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif

__kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
                               __global numtyp4* shape,__global numtyp4* well,
                               __global numtyp *gum, __global numtyp2* sig_eps,
                               const int ntypes, __global numtyp *lshape,
                               __global int *dev_nbor, const int stride,
                               __global acctyp4 *ans, __global acctyp *engv,
                               __global int *err_flag, const int eflag,
                               const int vflag,const int start, const int inum,
                               const int nall) {
  __local numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
  if (ii<4)
    sp_lj[ii]=gum[ii+3];
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=stride;
    int numj=*nbor;
    nbor+=stride;
    __global int *nbor_end=nbor+stride*numj;

    numtyp4 ix=x_[i];
    int itype=ix.w;

    numtyp oner=shape[itype].x;
    numtyp one_well=well[itype].x;
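    // a sphere is isotropic: one shape/well component stands in for the
    // whole diagonal that gets added to g2/b2 below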

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=stride) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      numtyp4 jx=x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp r12[3];
      r12[0] = jx.x-ix.x;
      r12[1] = jx.y-ix.y;
      r12[2] = jx.z-ix.z;
      numtyp ir = gpu_dot3(r12,r12);

      ir = rsqrt(ir);
      numtyp r = (numtyp)1.0/ir;

      numtyp r12hat[3];
      r12hat[0]=r12[0]*ir;
      r12hat[1]=r12[1]*ir;
      r12hat[2]=r12[2]*ir;

      numtyp a2[9];
      gpu_quat_to_mat_trans(q,j,a2);

      numtyp u_r, dUr[3], eta;
      { // Compute U_r, dUr, eta, and teta
        // Compute g12
        numtyp g12[9];
        {
          {
            numtyp g2[9];
            gpu_times3(shape[jtype],a2,g12);
            gpu_transpose_times3(a2,g12,g2);
            g12[0]=g2[0]+oner;
            g12[4]=g2[4]+oner;
            g12[8]=g2[8]+oner;
            g12[1]=g2[1];
            g12[2]=g2[2];
            g12[3]=g2[3];
            g12[5]=g2[5];
            g12[6]=g2[6];
            g12[7]=g2[7];
          }

          { // Compute U_r and dUr

            // Compute kappa
            numtyp kappa[3];
            gpu_mldivide3(g12,r12,kappa,err_flag);

            // -- kappa is now / r
            kappa[0]*=ir;
            kappa[1]*=ir;
            kappa[2]*=ir;

            // energy

            // compute u_r and dUr
            numtyp uslj_rsq;
            {
              // Compute distance of closest approach
              numtyp h12, sigma12;
              sigma12 = gpu_dot3(r12hat,kappa);
              sigma12 = rsqrt((numtyp)0.5*sigma12);
              h12 = r-sigma12;

              // -- kappa is now ok
              kappa[0]*=r;
              kappa[1]*=r;
              kappa[2]*=r;

              int mtype=mul24(ntypes,itype)+jtype;
              numtyp sigma = sig_eps[mtype].x;
              numtyp epsilon = sig_eps[mtype].y;
              numtyp varrho = sigma/(h12+gum[0]*sigma);
              numtyp varrho6 = varrho*varrho*varrho;
              varrho6*=varrho6;
              numtyp varrho12 = varrho6*varrho6;
              u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);

              numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
              temp1 = temp1*(numtyp)24.0*epsilon;
              uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
              numtyp temp2 = gpu_dot3(kappa,r12hat);
              uslj_rsq = uslj_rsq*ir*ir;

              dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
              dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
              dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
            }
          }
        }

        // Compute eta
        {
          eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
          numtyp det_g12 = gpu_det3(g12);
          eta = pow(eta/det_g12,gum[1]);
        }
      }

      numtyp chi, dchi[3];
      { // Compute chi and dchi

        // Compute b12
        numtyp b12[9];
        {
          numtyp b2[9];
          gpu_times3(well[jtype],a2,b12);
          gpu_transpose_times3(a2,b12,b2);
          b12[0]=b2[0]+one_well;
          b12[4]=b2[4]+one_well;
          b12[8]=b2[8]+one_well;
          b12[1]=b2[1];
          b12[2]=b2[2];
          b12[3]=b2[3];
          b12[5]=b2[5];
          b12[6]=b2[6];
          b12[7]=b2[7];
        }

        // compute chi_12
        numtyp iota[3];
        gpu_mldivide3(b12,r12,iota,err_flag);
        // -- iota is now iota/r
        iota[0]*=ir;
        iota[1]*=ir;
        iota[2]*=ir;
        chi = gpu_dot3(r12hat,iota);
        chi = pow(chi*(numtyp)2.0,gum[2]);

        // -- iota is now ok
        iota[0]*=r;
        iota[1]*=r;
        iota[2]*=r;

        numtyp temp1 = gpu_dot3(iota,r12hat);
        numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
        dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
        dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
        dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
      }

      numtyp temp2 = factor_lj*eta*chi;
      if (eflag>0)
        energy+=u_r*temp2;
      numtyp temp1 = -eta*u_r*factor_lj;
      if (vflag>0) {
        r12[0]*=-1;
        r12[1]*=-1;
        r12[2]*=-1;
        numtyp ft=temp1*dchi[0]-temp2*dUr[0];
        f.x+=ft;
        virial[0]+=r12[0]*ft;
        ft=temp1*dchi[1]-temp2*dUr[1];
        f.y+=ft;
        virial[1]+=r12[1]*ft;
        virial[3]+=r12[0]*ft;
        ft=temp1*dchi[2]-temp2*dUr[2];
        f.z+=ft;
        virial[2]+=r12[2]*ft;
        virial[4]+=r12[0]*ft;
        virial[5]+=r12[1]*ft;
      } else {
        f.x+=temp1*dchi[0]-temp2*dUr[0];
        f.y+=temp1*dchi[1]-temp2*dUr[1];
        f.z+=temp1*dchi[2]-temp2*dUr[2];
      }
    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}

__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
                        __global numtyp4* lj3, const int lj_types,
                        __global numtyp *gum,
                        const int stride, __global int *dev_ij,
                        __global acctyp4 *ans, __global acctyp *engv,
                        __global int *err_flag, const int eflag,
                        const int vflag, const int start, const int inum,
                        const int nall) {
  __local numtyp sp_lj[4];

  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
  if (ii<4)
    sp_lj[ii]=gum[ii+3];
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_ij+ii;
    int i=*nbor;
    nbor+=stride;
    int numj=*nbor;
    nbor+=stride;
    __global int *list_end=nbor+mul24(stride,numj);

    numtyp4 ix=x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=stride) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      numtyp4 jx=x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
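      // lj1 packs {lj1,lj2,cutsq,form} and lj3 packs {lj3,lj4,offset} per
      // type pair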
      if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        force*=factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
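    // the += stores below accumulate the LJ terms onto the energy, virial,
    // and force already written for this particle by the ellipsoid kernel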
    if (eflag>0) {
      *ap1+=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1+=virial[i];
        ap1+=inum;
      }
    }
    acctyp4 old=ans[ii];
    old.x+=f.x;
    old.y+=f.y;
    old.z+=f.z;
    ans[ii]=old;
  } // if ii
}

__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                             __global numtyp4* lj3_in, __global numtyp *gum,
                             const int stride,
                             __global int *dev_ij, __global acctyp4 *ans,
                             __global acctyp *engv, __global int *err_flag,
                             const int eflag,const int vflag, const int start,
                             const int inum, const int nall) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
  __local numtyp sp_lj[4];
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<4)
    sp_lj[ii]=gum[ii+3];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
    if (eflag>0)
      lj3[ii]=lj3_in[ii];
  }
|
||||
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
|
||||
acctyp energy=(numtyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(numtyp)0;
|
||||
f.y=(numtyp)0;
|
||||
f.z=(numtyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(numtyp)0;
|
||||
|
||||
__global int *nbor=dev_ij+ii;
|
||||
int i=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *list_end=nbor+mul24(stride,numj);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=stride) {
|
||||
|
||||
int j=*nbor;
|
||||
if (j < nall)
|
||||
factor_lj = (numtyp)1.0;
|
||||
else {
|
||||
factor_lj = sp_lj[j/nall];
|
||||
j %= nall;
|
||||
}
|
||||
numtyp4 jx=x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
|
||||
r2inv=(numtyp)1.0/r2inv;
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
|
||||
energy+=factor_lj*(e-lj3[mtype].z);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
|
||||
// Store answers
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1+=energy;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
acctyp4 old=ans[ii];
|
||||
old.x+=f.x;
|
||||
old.y+=f.y;
|
||||
old.z+=f.z;
|
||||
ans[ii]=old;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
#endif
|
|
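
Both LJ kernels above recover special-bond information directly from the packed
neighbor index: an index below nall is an ordinary neighbor with weight 1.0,
while an index of j + k*nall selects special_lj factor k (sp_lj[] is filled
from gum[3..6]) and is reduced back to j with a modulus. A minimal host-side
sketch of the matching encoder, with hypothetical names and assuming the
LAMMPS convention that k = 1, 2, 3 are the 1-2, 1-3, and 1-4 factors:

  // Hypothetical encoder matching the decode in kernel_lj/kernel_lj_fast:
  //   if (j < nall) factor_lj = 1.0;
  //   else { factor_lj = sp_lj[j/nall]; j %= nall; }
  inline int encode_special_neighbor(int j, int k, int nall) {
    // k == 0 leaves the index untouched (full weight);
    // k == 1, 2, 3 select the 1-2, 1-3, and 1-4 special_lj factors.
    return j + k*nall;
  }
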
@ -0,0 +1,170 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef PAIR_GPU_KERNEL_H
#define PAIR_GPU_KERNEL_H

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#else
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#endif

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"

#else

#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)

#endif

// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
__kernel void kernel_gb_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
                             const int ntypes, __global int *dev_nbor,
                             const int nbor_pitch,
                             const int start, const int inum,
                             __global int *dev_ij, const int form_low,
                             const int form_high, const int nall) {

  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X+start;

  if (ii<inum) {
    __global int *nbor=dev_ij+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);
    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;

    numtyp4 ix=x_[i];
    int iw=ix.w;
    int itype=mul24(iw,ntypes);
    int newj=0;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
      int j=*nbor;
      if (j>=nall)
        j%=nall;
      numtyp4 jx=x_[j];
      int jtype=jx.w;
      int mtype=itype+jtype;
      numtyp2 cf=cut_form[mtype];
      if (cf.y>=form_low && cf.y<=form_high) {
        // Compute r12
        numtyp rsq=jx.x-ix.x;
        rsq*=rsq;
        numtyp t=jx.y-ix.y;
        rsq+=t*t;
        t=jx.z-ix.z;
        rsq+=t*t;

        if (rsq<cf.x) {
          *packed=j;
          packed+=nbor_pitch;
          newj++;
        }
      }
    }
    dev_nbor[ii+nbor_pitch]=newj;
  }
}

// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_gb_nbor_fast(__global numtyp4 *x_,
                                  __global numtyp2 *cut_form,
                                  __global int *dev_nbor,
                                  const int nbor_pitch,
                                  const int start, const int inum,
                                  __global int *dev_ij, const int form_low,
                                  const int form_high, const int nall) {

  int ii=THREAD_ID_X;
  __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    cutsq[ii]=cut_form[ii].x;
    form[ii]=cut_form[ii].y;
  }
  ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
  __syncthreads();

  if (ii<inum) {
    __global int *nbor=dev_ij+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);
    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;

    numtyp4 ix=x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    int newj=0;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {
      int j=*nbor;
      if (j>=nall)
        j%=nall;
      numtyp4 jx=x_[j];
      int jtype=jx.w;
      int mtype=itype+jtype;

      if (form[mtype]>=form_low && form[mtype]<=form_high) {
        // Compute r12
        numtyp rsq=jx.x-ix.x;
        rsq*=rsq;
        numtyp t=jx.y-ix.y;
        rsq+=t*t;
        t=jx.z-ix.z;
        rsq+=t*t;

        if (rsq<cutsq[mtype]) {
          *packed=j;
          packed+=nbor_pitch;
          newj++;
        }
      }
    }
    dev_nbor[ii+nbor_pitch]=newj;
  }
}

#endif
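
Both unpack kernels assume the same column-strided matrix layout: for particle
ii, dev_nbor[ii] holds the atom index, dev_nbor[ii+nbor_pitch] the neighbor
count, and the packed neighbors follow in the same column every nbor_pitch
entries, so consecutive work-items read consecutive words. A short illustrative
reader for this layout (names are hypothetical, operating on a host copy):

  // Walk one atom's packed neighbors in a host copy of dev_nbor.
  void walk_packed_nbors(const int *nbor_mat, int ii, int nbor_pitch) {
    const int *p=nbor_mat+ii;
    int i=*p;                // atom index
    p+=nbor_pitch;
    int numj=*p;             // number of packed neighbors
    p+=nbor_pitch;
    for (int n=0; n<numj; n++, p+=nbor_pitch) {
      int j=*p;              // neighbor index
      // ... consume the (i,j) pair ...
      (void)i; (void)j;
    }
  }
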
@ -0,0 +1,334 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifdef USE_OPENCL
#include "gb_gpu_cl.h"
#include "gb_gpu_nbor_cl.h"
#else
#include "gb_gpu_ptx.h"
#endif

#include "gb_gpu_memory.h"
#include <cassert>
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>

extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
                                  _max_bytes(0.0) {
  device=&pair_gpu_device;
}

template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
  clear();
}

template <class numtyp, class acctyp>
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
}

template <class numtyp, class acctyp>
bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
                          const double upsilon, const double mu,
                          double **host_shape, double **host_well,
                          double **host_cutsq, double **host_sigma,
                          double **host_epsilon, double *host_lshape,
                          int **h_form, double **host_lj1, double **host_lj2,
                          double **host_lj3, double **host_lj4,
                          double **host_offset, const double *host_special_lj,
                          const int nlocal, const int nall,
                          const int max_nbors, const double cell_size,
                          const double gpu_split, FILE *_screen) {
  nbor_time_avail=false;
  screen=_screen;

  bool gpu_nbor=false;
  if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
    gpu_nbor=true;

  int _gpu_host=0;
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
  if (host_nlocal>0)
    _gpu_host=1;

  if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host,
                    max_nbors,cell_size,true))
    return false;
  ucl_device=device->gpu;
  atom=&device->atom;
  nbor=&device->nbor;

  _block_size=BLOCK_1D;
  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
    _block_size=ucl_device->group_size();
  compile_kernels(*ucl_device);

  // Initialize host-device load balancer
  hd_balancer.init(device,gpu_split);

  // Initialize timers for the selected GPU
  time_pair.init(*ucl_device);
  time_pair.zero();

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for copying type data
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*ucl_device,
                               UCL_WRITE_OPTIMIZED);

  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

  sigma_epsilon.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
                         host_sigma,host_epsilon);

  cut_form.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,cut_form,host_write,
                         host_cutsq,h_form);

  lj1.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                         host_cutsq,h_form);

  lj3.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                         host_offset);

  dev_error.alloc(1,*ucl_device);
  dev_error.zero();

  _allocated=true;

  host_form=h_form;

  // Initialize timers for the selected GPU
  time_kernel.init(*ucl_device);
  time_gayberne.init(*ucl_device);
  time_kernel2.init(*ucl_device);
  time_gayberne2.init(*ucl_device);
  time_kernel.zero();
  time_gayberne.zero();
  time_kernel2.zero();
  time_gayberne2.zero();

  // Allocate, cast and asynchronous memcpy of constant data
  // Copy data for bonded interactions
  gamma_upsilon_mu.alloc(7,*ucl_device,UCL_READ_ONLY);
  host_write[0]=static_cast<numtyp>(gamma);
  host_write[1]=static_cast<numtyp>(upsilon);
  host_write[2]=static_cast<numtyp>(mu);
  host_write[3]=static_cast<numtyp>(host_special_lj[0]);
  host_write[4]=static_cast<numtyp>(host_special_lj[1]);
  host_write[5]=static_cast<numtyp>(host_special_lj[2]);
  host_write[6]=static_cast<numtyp>(host_special_lj[3]);
  ucl_copy(gamma_upsilon_mu,host_write,7,false);

  lshape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
  UCL_H_Vec<double> d_view;
  d_view.view(host_lshape,lshape.numel(),*ucl_device);
  ucl_copy(lshape,d_view,false);

  // Copy shape, well, sigma, epsilon, and cutsq onto GPU
  // - cast if necessary
  shape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
  for (int i=0; i<ntypes; i++) {
    host_write[i*4]=host_shape[i][0];
    host_write[i*4+1]=host_shape[i][1];
    host_write[i*4+2]=host_shape[i][2];
  }
  UCL_H_Vec<numtyp4> view4;
  view4.view((numtyp4*)host_write.begin(),shape.numel(),*ucl_device);
  ucl_copy(shape,view4,false);

  well.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
  for (int i=0; i<ntypes; i++) {
    host_write[i*4]=host_well[i][0];
    host_write[i*4+1]=host_well[i][1];
    host_write[i*4+2]=host_well[i][2];
  }
  view4.view((numtyp4*)host_write.begin(),well.numel(),*ucl_device);
  ucl_copy(well,view4,false);

  // See if we want fast GB-sphere or sphere-sphere calculations
  multiple_forms=false;
  for (int i=1; i<ntypes; i++)
    for (int j=i; j<ntypes; j++)
      if (host_form[i][j]!=ELLIPSE_ELLIPSE)
        multiple_forms=true;
  if (multiple_forms && host_nlocal>0) {
    std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n";
    exit(1);
  }

  if (multiple_forms)
    atom->dev_ans.zero();

  _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();

  // Memory for ilist ordered by particle type
  return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
}

template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
  if (!_allocated)
    return;

  UCL_H_Vec<int> err_flag(1,*ucl_device);
  ucl_copy(err_flag,dev_error,false);
  if (err_flag[0] == 2)
    std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
  err_flag.clear();

  _allocated=false;

  // Output any timing information
  acc_timers();
  double single[6], times[6];

  single[0]=atom->transfer_time();
  single[1]=nbor->time_nbor.total_seconds();
  single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
            nbor->time_kernel.total_seconds();
  single[3]=time_gayberne.total_seconds()+time_gayberne2.total_seconds();
  if (multiple_forms)
    single[4]=time_pair.total_seconds();
  else
    single[4]=0;
  single[5]=atom->cast_time();

  MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
  double avg_split=hd_balancer.all_avg_split();

  _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
              sigma_epsilon.row_bytes()+cut_form.row_bytes()+
              shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
              gamma_upsilon_mu.row_bytes();
  double mpi_max_bytes;
  MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
  double max_mb=mpi_max_bytes/(1024*1024);

  if (device->world_me()==0)
    if (screen && times[3]>0.0) {
      int world_size=device->world_size();

      fprintf(screen,"\n\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
      fprintf(screen," GPU Time Info (average): ");
      fprintf(screen,"\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");

      if (device->procs_per_gpu()==1) {
        fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/world_size);
        fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/world_size);
        fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/world_size);
        if (nbor->gpu_nbor())
          fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/world_size);
        else
          fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/world_size);
        fprintf(screen,"Force calc: %.4f s.\n",times[3]/world_size);
        fprintf(screen,"LJ calc: %.4f s.\n",times[4]/world_size);
      }
      fprintf(screen,"Average split: %.4f.\n",avg_split);
      fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
      fprintf(screen,"-------------------------------------");
      fprintf(screen,"--------------------------------\n\n");
    }
  _max_bytes=0.0;

  dev_error.clear();
  lj1.clear();
  lj3.clear();
  sigma_epsilon.clear();
  cut_form.clear();

  shape.clear();
  well.clear();
  lshape.clear();
  gamma_upsilon_mu.clear();
  host_olist.clear();

  time_kernel.clear();
  time_gayberne.clear();
  time_kernel2.clear();
  time_gayberne2.clear();
  time_pair.clear();
  hd_balancer.clear();

  if (_compiled) {
    k_gb_nbor_fast.clear();
    k_gb_nbor.clear();
    k_gayberne.clear();
    k_sphere_gb.clear();
    k_lj_fast.clear();
    k_lj.clear();
    delete pair_program;
    delete gb_program;
    delete gb_lj_program;
    _compiled=false;
  }

  device->clear();
}

template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() const {
  return device->atom.host_memory_usage()+
         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
         sizeof(GB_GPU_Memory<numtyp,acctyp>)+
         device->nbor.max_atoms()*sizeof(int);
}

template <class numtyp, class acctyp>
void GB_GPU_MemoryT::compile_kernels(UCL_Device &dev) {
  if (_compiled)
    return;

  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
                    std::string(OCL_PRECISION_COMPILE);

  pair_program=new UCL_Program(dev);
  pair_program->load_string(gb_gpu_kernel_nbor,flags.c_str());
  k_gb_nbor_fast.set_function(*pair_program,"kernel_gb_nbor_fast");
  k_gb_nbor.set_function(*pair_program,"kernel_gb_nbor");

  gb_program=new UCL_Program(dev);
  gb_program->load_string(gb_gpu_kernel,flags.c_str());
  k_gayberne.set_function(*gb_program,"kernel_gayberne");

  gb_lj_program=new UCL_Program(dev);
  gb_lj_program->load_string(gb_gpu_kernel_lj,flags.c_str());
  k_sphere_gb.set_function(*gb_lj_program,"kernel_sphere_gb");
  k_lj_fast.set_function(*gb_lj_program,"kernel_lj_fast");
  k_lj.set_function(*gb_lj_program,"kernel_lj");

  _compiled=true;
}

template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;
@ -1,156 +0,0 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#include "gb_gpu_memory.h"
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>

template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() {
  this->atom.atom_fields(8);
  this->atom.ans_fields(13);
  this->nbor.packing(true);
}

template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
  clear();
}

template <class numtyp, class acctyp>
bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
                          const double gamma, const double upsilon,
                          const double mu, double **host_shape,
                          double **host_well, double **host_cutsq,
                          double **host_sigma, double **host_epsilon,
                          double *host_lshape, int **h_form, double **host_lj1,
                          double **host_lj2, double **host_lj3,
                          double **host_lj4, double **host_offset,
                          double *host_special_lj, const int nlocal,
                          const int nall, const int max_nbors,
                          const bool force_d, const int me) {
  _max_nbors=max_nbors;
  if (this->allocated)
    clear();

  bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
                              host_lj1, host_lj2, host_lj3, host_lj4,
                              host_offset, host_special_lj, max_nbors, me,
                              nlocal, nall);
  if (!p)
    return false;

  host_form=h_form;

  // Initialize timers for the selected GPU
  time_kernel.init();
  time_gayberne.init();
  time_kernel2.init();
  time_gayberne2.init();

  // Use the write buffer from atom for data initialization
  NVC_HostT &host_write=this->atom.host_write;
  assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);

  // Allocate, cast and asynchronous memcpy of constant data
  gamma_upsilon_mu.safe_alloc(3);
  host_write[0]=static_cast<numtyp>(gamma);
  host_write[1]=static_cast<numtyp>(upsilon);
  host_write[2]=static_cast<numtyp>(mu);
  gamma_upsilon_mu.copy_from_host(host_write.begin());

  lshape.safe_alloc(ntypes,lshape_get_texture<numtyp>());
  lshape.cast_copy(host_lshape,host_write);
  lshape.copy_from_host(host_write.begin());

  // Copy shape, well, sigma, epsilon, and cutsq onto GPU
  shape.safe_alloc(ntypes,3,shape_get_texture<numtyp>());
  shape.cast_copy(host_shape[0],host_write);
  well.safe_alloc(ntypes,3,well_get_texture<numtyp>());
  well.cast_copy(host_well[0],host_write);

  // Copy LJ data onto GPU
  int lj_types=ntypes;
  if (lj_types<=MAX_SHARED_TYPES)
    lj_types=MAX_SHARED_TYPES;
  form.safe_alloc(lj_types,lj_types,form_get_texture());
  form.copy_2Dfrom_host(host_form[0],ntypes,ntypes);

  // See if we want fast GB-sphere or sphere-sphere calculations
  multiple_forms=false;
  for (int i=1; i<ntypes; i++)
    for (int j=i; j<ntypes; j++)
      if (host_form[i][j]!=ELLIPSE_ELLIPSE)
        multiple_forms=true;

  // Memory for ilist ordered by particle type
  return host_olist.alloc_rw(this->max_local);
}

template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_atom(const int nall, bool &success) {
  this->max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
  this->atom.resize(this->max_atoms, success);
}

template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_local(const int nlocal, const int max_nbors,
                                  bool &success) {
  if (nlocal>this->max_local) {
    this->max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
    host_olist.clear();
    success=success && host_olist.alloc_rw(this->max_local);
  }
  if (max_nbors>_max_nbors)
    _max_nbors=static_cast<int>(static_cast<double>(max_nbors)*1.10);
  this->nbor.resize(this->max_local,_max_nbors,success);
}

template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
  if (!this->allocated)
    return;

  int err_flag;
  this->dev_error.copy_to_host(&err_flag);
  if (err_flag == 1)
    std::cerr << "COLLISION BUFFER OVERFLOW OCCURRED. INCREASE COLLISION_N "
              << "and RECOMPILE.\n";
  else if (err_flag == 2)
    std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";

  LJ_GPU_MemoryT::clear();

  lshape.unbind();

  shape.clear();
  well.clear();
  form.clear();
  lshape.clear();
  gamma_upsilon_mu.clear();
  host_olist.clear();
}

template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() {
  return this->atom.host_memory_usage(this->max_atoms)+
         this->nbor.host_memory_usage()+4*sizeof(numtyp)+
         sizeof(GB_GPU_Memory<numtyp,acctyp>)+this->max_atoms*sizeof(int);
}

template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;
@ -12,61 +12,183 @@
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H

#define MAX_GPU_THREADS 4
#include "lj_gpu_memory.h"
#define BLOCK_1D 64

enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"

template <class numtyp, class acctyp>
class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
class GB_GPU_Memory {
 public:
  GB_GPU_Memory();
  ~GB_GPU_Memory();

  bool init(const int ij_size, const int ntypes, const double gamma,

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param gpu_nbor true if neighboring performed on device
    * \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    * \return false if there is not sufficient memory or device init prob **/
  bool init(const int ntypes, const double gamma,
            const double upsilon, const double mu, double **host_shape,
            double **host_well, double **host_cutsq, double **host_sigma,
            double **host_epsilon, double *host_lshape, int **h_form,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int max_nbors, const int nlocal, const int nall,
            const bool force_d, const int me);
            double **host_lj4, double **host_offset,
            const double *host_special_lj, const int nlocal, const int nall,
            const int max_nbors, const double cell_size,
            const double gpu_split, FILE *screen);

  void resize_atom(const int nall, bool &success);
  void resize_local(const int nlocal, const int max_nbors, bool &success);
  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    atom->resize(inum, nall, success);
    if (multiple_forms) atom->dev_ans.zero();
    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
    if (bytes>_max_bytes)
      _max_bytes=bytes;
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to be copied to host
    * \param max_nbors current maximum number of neighbors
    * \param olist_size size of list of particles from CPU neighboring
    * \note host_inum is 0 if the host is performing neighboring
    * \note if GPU is neighboring nlocal+host_inum=total number local particles
    * \note if CPU is neighboring olist_size=total number of local particles
    * \note if GPU is neighboring olist_size=0 **/
  inline void resize_local(const int nlocal, const int host_inum,
                           const int max_nbors, const int olist_size,
                           bool &success) {
    if (olist_size>static_cast<int>(host_olist.numel())) {
      host_olist.clear();
      int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
      success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
    }
    nbor->resize(nlocal,host_inum,max_nbors,success);
    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
    if (bytes>_max_bytes)
      _max_bytes=bytes;
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  double host_memory_usage();
  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  /// Accumulate timers
  inline void acc_timers() {
    if (nbor_time_avail) {
      nbor->time_nbor.add_to_total();
      nbor->time_kernel.add_to_total();
      nbor_time_avail=false;
    }
    time_kernel.add_to_total();
    time_gayberne.add_to_total();
    if (multiple_forms) {
      time_kernel2.add_to_total();
      time_gayberne2.add_to_total();
      time_pair.add_to_total();
    }
    atom->acc_timers();
  }

  // ---------------------------- DATA ----------------------------
  /// Zero timers
  inline void zero_timers() {
    nbor_time_avail=false;
    time_kernel.zero();
    time_gayberne.zero();
    if (multiple_forms) {
      time_kernel2.zero();
      time_gayberne2.zero();
      time_pair.zero();
    }
    atom->zero_timers();
  }

  // ilist with particles sorted by type
  NVC_HostI host_olist;

  // --------------- Const Data for Atoms
  NVC_ConstMatT shape, well;
  NVC_ConstMatI form;
  NVC_VecT lshape, gamma_upsilon_mu;
  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  PairGPUDevice<numtyp,acctyp> *device;
  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Error Flag - Set if a bad matrix inversion occurs
  UCL_D_Vec<int> dev_error;
  /// Device timers
  UCL_Timer time_kernel, time_gayberne, time_kernel2, time_gayberne2, time_pair;
  /// Host device load balancer
  PairGPUBalance<numtyp,acctyp> hd_balancer;
  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- TYPE DATA --------------------------

  // --------------- Timing Stuff
  NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2;
  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
  UCL_D_Vec<numtyp4> lj1;
  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
  UCL_D_Vec<numtyp4> lj3;
  /// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
  UCL_D_Vec<numtyp2> sigma_epsilon;
  /// cut_form.x = cutsq, cut_form.y = form
  UCL_D_Vec<numtyp2> cut_form;
  // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ...
  UCL_D_Vec<numtyp> gamma_upsilon_mu;

  // True if we want to use fast GB-sphere or sphere-sphere calculations
  bool multiple_forms;
  int **host_form;
  int last_ellipse;
  int _max_nbors;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;
  int _lj_types;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  PairGPUAtom<numtyp,acctyp> *atom;

  /// Aspherical Const Data for Atoms
  UCL_D_Vec<numtyp4> shape, well;
  /// Aspherical Const Data for Atoms
  UCL_D_Vec<numtyp> lshape;

  int last_ellipse, max_last_ellipse;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  PairGPUNbor *nbor;
  /// ilist with particles sorted by type
  UCL_H_Vec<int> host_olist;
  /// True if we should accumulate the neighbor timer
  bool nbor_time_avail;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program, *gb_program, *gb_lj_program;
  UCL_Kernel k_gb_nbor_fast, k_gb_nbor;
  UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
  inline int block_size() { return _block_size; }

 private:
  bool _allocated, _compiled;
  int _block_size;
  double _max_bytes;

  void compile_kernels(UCL_Device &dev);
};

#endif
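
The header above fixes the per-timestep lifecycle the host pair style is
expected to follow. A minimal sketch of that sequence (a hypothetical driver
written against the members declared above, not LAMMPS code):

  // Per-step driver sketch; all sizes come from the caller's neighboring.
  template <class numtyp, class acctyp>
  bool run_step(GB_GPU_Memory<numtyp,acctyp> &gbm, const int inum,
                const int nlocal, const int nall, const int host_inum,
                const int max_nbors, const int olist_size) {
    bool success=true;
    gbm.resize_atom(inum,nall,success);       // grow atom storage if needed
    gbm.resize_local(nlocal,host_inum,max_nbors,olist_size,success);
    if (!success) return false;               // insufficient device memory
    // ... pack positions, launch kernels, copy forces/energies back ...
    gbm.acc_timers();                         // fold per-step timers in
    return true;                              // gbm.clear() ends the run
  }
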
@ -0,0 +1,27 @@
Geryon

Copyright (2010) Sandia Corporation.  Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software.  This software is distributed under
the Simplified BSD License.

Geryon is intended to be a simple library for managing the CUDA Runtime,
CUDA Driver, and OpenCL APIs with a consistent interface:

 * Change from one API to another by simply changing the namespace
 * Use multiple APIs in the same code
 * Lightweight (only include files - no build required)
 * Manage device query and selection
 * Simple vector and matrix containers
 * Simple routines for data copy and type casting
 * Simple routines for data I/O
 * Simple classes for managing device timing
 * Simple classes for managing kernel compilation and execution

Geryon does not require building (although a Makefile is provided for testing
purposes). The library is a set of header files that can be included with
your code.

Documentation and examples are provided at

  http://users.nccs.gov/~wb8/geryon/index.htm
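
As a quick orientation, a minimal device-query program against the CUDA
runtime back end shipped in this commit might look like the following sketch
(error handling omitted; it uses only UCL_Device members defined in
nvc_device.h below):

  #include <iostream>
  #include "nvc_device.h"

  int main() {
    ucl_cudart::UCL_Device dev;     // collects properties for every GPU
    if (dev.num_devices()==0)
      return 1;                     // no CUDA-capable device found
    dev.set(0);                     // make device 0 active
    dev.print_all(std::cout);       // dump all device properties
    return 0;
  }
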
@ -0,0 +1 @@
Geryon Version 10.280
@ -0,0 +1,47 @@
#!/bin/sh

# convert ptx assembly output into
# a c-style string constant written
# in portable posix shell script.
# requires: sed, rm, mv
#
# Author: Axel Kohlmeyer, Temple University

num_args=$#

# we write to a scratch file, since
# we know the real file name only at
# the very end.
output=geryon.tmp.$$
: > $output

# remove temporary file in case we're interrupted.
cleanup () {
  rm -f geryon.tmp.$$
}
trap cleanup INT QUIT TERM

# loop over arguments and convert to
# string constants.
i=1
while [ $i -lt $num_args ]
do \
  src=$1
  krn=${src##*/}
  krn=${krn%.*}
  echo "Converting kernel $krn from $src to a c-style string"
  echo "const char * $krn = " >> $output
  sed -e 's/\\/\\\\/g' \
      -e 's/"/\\"/g' \
      -e 's/ *\/\/.*$//' \
      -e '/\.file/D' \
      -e '/^[ ]*$/D' \
      -e 's/^\(.*\)$/"\1\\n"/' $src >> $output
  echo ';' >> $output
  shift
  i=`expr $i + 1`
done

# $1 holds now the real output file name
mv $output $1
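
For illustration, given a hypothetical input gb_gpu_kernel.ptx, the file this
script writes contains one C string constant per input, with comments and
.file directives stripped and each PTX line quoted (the real PTX body is
elided here):

  const char * gb_gpu_kernel =
  ".version 1.4\n"
  ".target sm_13\n"
  /* ... one quoted "...\n" string per remaining PTX line ... */
  ;
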
@ -0,0 +1,311 @@
/***************************************************************************
                                nvc_device.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with cuda devices

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Wed Jan 28 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef NVC_DEVICE
#define NVC_DEVICE

#include <string>
#include <vector>
#include <iostream>
#include <cstdlib>
#include "nvc_macros.h"
#include "ucl_types.h"

namespace ucl_cudart {

// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef cudaStream_t command_queue;

inline void ucl_sync(cudaStream_t &stream) {
  CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
}

/// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect
  * behavior
  * \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
 public:
  /// Collect properties for every GPU on the node
  /** \note You must set the active GPU with set() before using the device **/
  UCL_Device();

  ~UCL_Device();

  /// Returns 1 (For compatibility with OpenCL)
  inline int num_platforms() { return 1; }

  /// Return a string with name and info of the current platform
  std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA"; }

  /// Return the number of devices that support CUDA
  inline int num_devices() { return _properties.size(); }

  /// Set the CUDA device to the specified device number
  void set(int num);

  /// Get the current device number
  inline int device_num() { return _device; }

  /// Returns the default stream for the current device
  inline command_queue & cq() { return cq(0); }

  /// Returns the stream indexed by i
  inline command_queue & cq(const int i) { return _cq[i]; }

  /// Block until all commands in the default stream have completed
  inline void sync() { sync(0); }

  /// Block until all commands in the specified stream have completed
  inline void sync(const int i) { ucl_sync(cq(i)); }

  /// Get the number of command queues currently available on device
  inline int num_queues()
    { if (_device==-1) return 0; else return _cq.size(); }

  /// Add a stream for device computations
  inline void push_command_queue() {
    _cq.push_back(cudaStream_t());
    CUDA_SAFE_CALL_NS(cudaStreamCreate(&_cq.back()));
  }

  /// Remove a stream for device computations
  /** \note You cannot delete the default stream **/
  inline void pop_command_queue() {
    if (_cq.size()<2) return;
    CUDA_SAFE_CALL_NS(cudaStreamDestroy(_cq.back()));
    _cq.pop_back();
  }

  /// Get the current CUDA device name
  inline std::string name() { return name(_device); }
  /// Get the CUDA device name
  inline std::string name(const int i)
    { return std::string(_properties[i].name); }

  /// Get a string telling the type of the current device
  inline std::string device_type_name() { return device_type_name(_device); }
  /// Get a string telling the type of the device
  inline std::string device_type_name(const int i) { return "GPU"; }

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i) { return UCL_GPU; }

  /// Returns true if double precision is supported for the current device
  bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is supported for the device
  bool double_precision(const int i) {return arch(i)>=1.3;}

  /// Get the number of cores in the current device
  inline unsigned cores() { return cores(_device); }
  /// Get the number of cores
  inline unsigned cores(const int i)
    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
      else return _properties[i].multiProcessorCount*32; }

  /// Get the gigabytes of global memory in the current device
  inline double gigabytes() { return gigabytes(_device); }
  /// Get the gigabytes of global memory
  inline double gigabytes(const int i)
    { return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }

  /// Get the bytes of global memory in the current device
  inline size_t bytes() { return bytes(_device); }
  /// Get the bytes of global memory
  inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }

  /// Return the GPGPU compute capability for current device
  inline double arch() { return arch(_device); }
  /// Return the GPGPU compute capability
  inline double arch(const int i)
    { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}

  /// Clock rate in GHz for current device
  inline double clock_rate() { return clock_rate(_device); }
  /// Clock rate in GHz
  inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;}

  /// Get the maximum number of threads per block
  inline size_t group_size() { return group_size(_device); }
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].maxThreadsPerBlock; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
  /// Return the maximum memory pitch in bytes
  inline size_t max_pitch(const int i) { return _properties[i].memPitch; }

  /// List all devices along with all properties
  void print_all(std::ostream &out);

 private:
  int _device, _num_devices;
  std::vector<cudaDeviceProp> _properties;
  std::vector<cudaStream_t> _cq;
};

// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
  CUDA_SAFE_CALL_NS(cudaGetDeviceCount(&_num_devices));
  for (int dev=0; dev<_num_devices; ++dev) {
    cudaDeviceProp deviceProp;
    CUDA_SAFE_CALL_NS(cudaGetDeviceProperties(&deviceProp, dev));
    if (deviceProp.major == 9999 && deviceProp.minor == 9999)
      break;
    _properties.push_back(deviceProp);
  }
  _device=-1;
  _cq.push_back(cudaStream_t());
  _cq.back()=0;
}

inline UCL_Device::~UCL_Device() {
  for (int i=1; i<num_queues(); i++) pop_command_queue();
}

// Set the CUDA device to the specified device number
inline void UCL_Device::set(int num) {
  if (_device==num)
    return;
  for (int i=1; i<num_queues(); i++) pop_command_queue();
  cudaThreadExit();
  CUDA_SAFE_CALL_NS(cudaSetDevice(num));
  _device=num;
}

// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
#if CUDART_VERSION >= 2020
  int driver_version, runtime_version;
  cudaDriverGetVersion(&driver_version);
  out << "CUDA Driver Version: "
      << driver_version/1000 << "." << driver_version%100
      << std::endl;
  cudaRuntimeGetVersion(&runtime_version);
  out << "CUDA Runtime Version: "
      << runtime_version/1000 << "." << runtime_version%100
      << std::endl;
#endif

  if (num_devices() == 0)
    out << "There is no device supporting CUDA\n";
  for (int i=0; i<num_devices(); ++i) {
    out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
    out << "  Type of device: "
        << device_type_name(i).c_str() << std::endl;
    out << "  Compute capability: "
        << arch(i) << std::endl;
    out << "  Double precision support: ";
    if (double_precision(i))
      out << "Yes\n";
    else
      out << "No\n";
    out << "  Total amount of global memory: "
        << gigabytes(i) << " GB\n";
#if CUDART_VERSION >= 2000
    out << "  Number of compute units/multiprocessors: "
        << _properties[i].multiProcessorCount << std::endl;
    out << "  Number of cores: "
        << cores(i) << std::endl;
#endif
    out << "  Total amount of constant memory: "
        << _properties[i].totalConstMem << " bytes\n";
    out << "  Total amount of local/shared memory per block: "
        << _properties[i].sharedMemPerBlock << " bytes\n";
    out << "  Total number of registers available per block: "
        << _properties[i].regsPerBlock << std::endl;
    out << "  Warp size: "
        << _properties[i].warpSize << std::endl;
    out << "  Maximum number of threads per block: "
        << _properties[i].maxThreadsPerBlock << std::endl;
    out << "  Maximum group size (# of threads per block) "
        << _properties[i].maxThreadsDim[0] << " x "
        << _properties[i].maxThreadsDim[1] << " x "
        << _properties[i].maxThreadsDim[2] << std::endl;
    out << "  Maximum item sizes (# threads for each dim) "
        << _properties[i].maxGridSize[0] << " x "
        << _properties[i].maxGridSize[1] << " x "
        << _properties[i].maxGridSize[2] << std::endl;
    out << "  Maximum memory pitch: "
        << max_pitch(i) << " bytes\n";
    out << "  Texture alignment: "
        << _properties[i].textureAlignment << " bytes\n";
    out << "  Clock rate: "
        << clock_rate(i) << " GHz\n";
#if CUDART_VERSION >= 2000
    out << "  Concurrent copy and execution: ";
    if (_properties[i].deviceOverlap)
      out << "Yes\n";
    else
      out << "No\n";
#endif
#if CUDART_VERSION >= 2020
    out << "  Run time limit on kernels: ";
    if (_properties[i].kernelExecTimeoutEnabled)
      out << "Yes\n";
    else
      out << "No\n";
    out << "  Integrated: ";
    if (_properties[i].integrated)
      out << "Yes\n";
    else
      out << "No\n";
    out << "  Support host page-locked memory mapping: ";
    if (_properties[i].canMapHostMemory)
      out << "Yes\n";
    else
      out << "No\n";
    out << "  Compute mode: ";
    if (_properties[i].computeMode == cudaComputeModeDefault)
      out << "Default\n"; // multiple threads can use device
    else if (_properties[i].computeMode == cudaComputeModeExclusive)
      out << "Exclusive\n"; // only one thread can use device
    else if (_properties[i].computeMode == cudaComputeModeProhibited)
      out << "Prohibited\n"; // no thread can use device
    else
      out << "Unknown\n";
#endif
#if CUDART_VERSION >= 3000
    out << "  Concurrent kernel execution: ";
    if (_properties[i].concurrentKernels)
      out << "Yes\n";
    else
      out << "No\n";
    out << "  Device has ECC support enabled: ";
    if (_properties[i].ECCEnabled)
      out << "Yes\n";
    else
      out << "No\n";
#endif
  }
}

}

#endif
@ -0,0 +1,57 @@
#ifndef NVC_MACROS_H
#define NVC_MACROS_H

#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__

#include <stdio.h>
#include <cassert>
#include <cuda_runtime.h>

#ifdef MPI_GERYON
#include "mpi.h"
#define NVC_GERYON_EXIT MPI_Abort(MPI_COMM_WORLD,-1)
#else
#define NVC_GERYON_EXIT assert(0==1)
#endif

#ifndef UCL_NO_API_CHECK

#define CUDA_SAFE_CALL_NS( call) do {                                     \
    cudaError err = call;                                                 \
    if( cudaSuccess != err) {                                             \
      fprintf(stderr, "Cuda error in call at file '%s' in line %i : %s.\n", \
              __FILE__, __LINE__, cudaGetErrorString( err) );             \
      NVC_GERYON_EXIT;                                                    \
    } } while (0)

#ifdef UCL_SYNC_DEBUG

#define CUDA_SAFE_CALL( call) do {                                        \
    CUDA_SAFE_CALL_NS( call);                                             \
    cudaError err=cudaThreadSynchronize();                                \
    if( cudaSuccess != err) {                                             \
      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",       \
              __FILE__, __LINE__, cudaGetErrorString( err) );             \
      NVC_GERYON_EXIT;                                                    \
    } } while (0)

#else

#define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NS( call)

#endif

#else // not DEBUG

// void macros for performance reasons
#define CUDA_SAFE_CALL( call) call
#define CUDA_SAFE_CALL_NS( call) call

#endif

#endif
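
A minimal usage sketch for these macros (not part of the commit): wrapping
CUDA runtime calls so that any failure prints the file and line and aborts
through NVC_GERYON_EXIT. The checks compile away when UCL_NO_API_CHECK is
defined.

  #include "nvc_macros.h"

  // Allocate and zero a device buffer, aborting on any CUDA error.
  void alloc_example(void **dptr, size_t bytes) {
    CUDA_SAFE_CALL(cudaMalloc(dptr, bytes));
    CUDA_SAFE_CALL(cudaMemset(*dptr, 0, bytes));
  }
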
@ -0,0 +1,69 @@
/***************************************************************************
                                nvc_texture.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with CUDA Runtime textures

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Fri Jul 2 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef NVC_TEXTURE
#define NVC_TEXTURE

#include "nvc_mat.h"

namespace ucl_cudart {

/// Class storing a texture reference
class UCL_Texture {
 public:
  UCL_Texture() {}
  ~UCL_Texture() {}
  /// Construct with a specified texture reference
  inline UCL_Texture(textureReference *t) { get_texture(t); }
  /// Set the texture reference for this object
  inline void get_texture(textureReference *t) { _tex_ptr=t; }

  /// Bind a float array where each fetch grabs a vector of length numel
  template<class mat_typ>
  inline void bind_float(mat_typ &vec, const unsigned numel) {
    #ifdef UCL_DEBUG
    assert(numel!=0 && numel<5);
    #endif
    int bits[4]={0,0,0,0};
    for (int i=0; i<numel; i++) bits[i]=32;
    _channel = cudaCreateChannelDesc(bits[0], bits[1], bits[2], bits[3],
                                     cudaChannelFormatKindFloat);
    (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
    (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
    (*_tex_ptr).filterMode = cudaFilterModePoint;
    (*_tex_ptr).normalized = false;
    CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,vec.cbegin(),&_channel));
  }

  /// Unbind the texture reference from the memory allocation
  inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }

 private:
  textureReference *_tex_ptr;
  cudaChannelFormatDesc _channel;
};

} // namespace

#endif
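
A hedged usage sketch for this class (assumes a global texture reference
pos_tex declared in the translation unit that owns the kernels, and a filled
device container x from the UCL_D_Vec family in nvc_mat.h; both names are
hypothetical):

  #include "nvc_texture.h"

  texture<float4,1> pos_tex;   // assumed global texture reference

  void bind_positions(UCL_D_Vec<float4> &x) {
    ucl_cudart::UCL_Texture t(&pos_tex);  // wrap the texture reference
    t.bind_float(x, 4);                   // 4 x 32-bit channels per fetch
    // ... launch kernels that read via tex1Dfetch(pos_tex, i) ...
    t.unbind();
  }
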
@ -0,0 +1,359 @@
/***************************************************************************
                                nvd_device.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with cuda devices

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jan 21 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef NVD_DEVICE
#define NVD_DEVICE

#include <string>
#include <vector>
#include <iostream>
#include "nvd_macros.h"
#include "ucl_types.h"

namespace ucl_cudadr {

// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef CUstream command_queue;

inline void ucl_sync(CUstream &stream) {
  CU_SAFE_CALL(cuStreamSynchronize(stream));
}

struct NVDProperties {
  std::string name;
  int major;
  int minor;
  CUDA_INT_TYPE totalGlobalMem;
  int multiProcessorCount;
  CUdevprop_st p;
  int kernelExecTimeoutEnabled;
  int integrated;
  int canMapHostMemory;
  int concurrentKernels;
  int ECCEnabled;
};

/// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect
  * behavior
  * \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
 public:
  /// Collect properties for every GPU on the node
  /** \note You must set the active GPU with set() before using the device **/
  UCL_Device();

  ~UCL_Device();

  /// Returns 1 (For compatibility with OpenCL)
  inline int num_platforms() { return 1; }

  /// Return a string with name and info of the current platform
  std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; }

  /// Return the number of devices that support CUDA
  inline int num_devices() { return _properties.size(); }

  /// Set the CUDA device to the specified device number
  /** A context and default command queue will be created for the device **/
  void set(int num);

  /// Get the current device number
  inline int device_num() { return _device; }

  /// Returns the default stream for the current device
  inline command_queue & cq() { return cq(0); }

  /// Returns the stream indexed by i
  inline command_queue & cq(const int i) { return _cq[i]; }

  /// Block until all commands in the default stream have completed
  inline void sync() { sync(0); }

  /// Block until all commands in the specified stream have completed
  inline void sync(const int i) { ucl_sync(cq(i)); }

  /// Get the number of command queues currently available on device
  inline int num_queues()
    { return _cq.size(); }

  /// Add a stream for device computations
  inline void push_command_queue() {
    _cq.push_back(CUstream());
    CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
  }

  /// Remove a stream for device computations
  /** \note You cannot delete the default stream **/
  inline void pop_command_queue() {
    if (_cq.size()<2) return;
    CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back()));
    _cq.pop_back();
  }

  /// Get the current CUDA device name
  inline std::string name() { return name(_device); }
  /// Get the CUDA device name
  inline std::string name(const int i)
    { return std::string(_properties[i].name); }

  /// Get a string telling the type of the current device
  inline std::string device_type_name() { return device_type_name(_device); }
  /// Get a string telling the type of the device
  inline std::string device_type_name(const int i) { return "GPU"; }

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i) { return UCL_GPU; }

  /// Returns true if double precision is supported for the current device
  bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is supported for the device
  bool double_precision(const int i) {return arch(i)>=1.3;}

  /// Get the number of cores in the current device
  inline unsigned cores() { return cores(_device); }
  /// Get the number of cores
  inline unsigned cores(const int i)
    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
      else return _properties[i].multiProcessorCount*32; }

  /// Get the gigabytes of global memory in the current device
  inline double gigabytes() { return gigabytes(_device); }
  /// Get the gigabytes of global memory
  inline double gigabytes(const int i)
    { return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }

  /// Get the bytes of global memory in the current device
  inline size_t bytes() { return bytes(_device); }
  /// Get the bytes of global memory
  inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
|
||||
|
||||
// Get the gigabytes of free memory in the current device
|
||||
inline double free_gigabytes() { return free_gigabytes(_device); }
|
||||
// Get the gigabytes of free memory
|
||||
inline double free_gigabytes(const int i)
|
||||
{ return static_cast<double>(free_bytes(i))/1073741824; }
|
||||
|
||||
// Get the bytes of free memory in the current device
|
||||
inline size_t free_bytes() { return free_bytes(_device); }
|
||||
// Get the bytes of free memory
|
||||
inline size_t free_bytes(const int i) {
|
||||
CUDA_INT_TYPE dfree, dtotal;
|
||||
CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal));
|
||||
return static_cast<size_t>(dfree);
|
||||
}
|
||||
|
||||
/// Return the GPGPU compute capability for current device
|
||||
inline double arch() { return arch(_device); }
|
||||
/// Return the GPGPU compute capability
|
||||
inline double arch(const int i)
|
||||
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
|
||||
|
||||
/// Clock rate in GHz for current device
|
||||
inline double clock_rate() { return clock_rate(_device); }
|
||||
/// Clock rate in GHz
|
||||
inline double clock_rate(const int i)
|
||||
{ return _properties[i].p.clockRate*1e-6;}
|
||||
|
||||
/// Get the maximum number of threads per block
|
||||
inline size_t group_size() { return group_size(_device); }
|
||||
/// Get the maximum number of threads per block
|
||||
inline size_t group_size(const int i)
|
||||
{ return _properties[i].p.maxThreadsPerBlock; }
|
||||
|
||||
/// Return the maximum memory pitch in bytes for current device
|
||||
inline size_t max_pitch() { return max_pitch(_device); }
|
||||
/// Return the maximum memory pitch in bytes
|
||||
inline size_t max_pitch(const int i) { return _properties[i].p.memPitch; }
|
||||
|
||||
/// List all devices along with all properties
|
||||
void print_all(std::ostream &out);
|
||||
|
||||
private:
|
||||
int _device, _num_devices;
|
||||
std::vector<NVDProperties> _properties;
|
||||
std::vector<CUstream> _cq;
|
||||
CUdevice _cu_device;
|
||||
CUcontext _context;
|
||||
};
|
||||
|
||||
// Grabs the properties for all devices
|
||||
inline UCL_Device::UCL_Device() {
|
||||
CU_SAFE_CALL_NS(cuInit(0));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
|
||||
for (int dev=0; dev<_num_devices; ++dev) {
|
||||
CUdevice m;
|
||||
CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
|
||||
_properties.push_back(NVDProperties());
|
||||
|
||||
char namecstr[1024];
|
||||
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
|
||||
_properties.back().name=namecstr;
|
||||
|
||||
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major,
|
||||
&_properties.back().minor,m));
|
||||
|
||||
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
|
||||
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
|
||||
m));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
|
||||
#if CUDA_VERSION >= 2020
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().kernelExecTimeoutEnabled,
|
||||
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().integrated,
|
||||
CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().canMapHostMemory,
|
||||
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
|
||||
#endif
|
||||
#if CUDA_VERSION >= 3000
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().concurrentKernels,
|
||||
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().ECCEnabled,
|
||||
CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
|
||||
#endif
|
||||
}
|
||||
_device=-1;
|
||||
_cq.push_back(CUstream());
|
||||
_cq.back()=0;
|
||||
}
|
||||
|
||||
inline UCL_Device::~UCL_Device() {
|
||||
if (_device>-1) {
|
||||
for (int i=1; i<num_queues(); i++) pop_command_queue();
|
||||
cuCtxDestroy(_context);
|
||||
}
|
||||
}
|
||||
|
||||
// Set the CUDA device to the specified device number
|
||||
inline void UCL_Device::set(int num) {
|
||||
if (_device==num)
|
||||
return;
|
||||
if (_device>-1) {
|
||||
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
|
||||
for (int i=1; i<num_queues(); i++) pop_command_queue();
|
||||
}
|
||||
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,num));
|
||||
CU_SAFE_CALL_NS(cuCtxCreate(&_context,0,_cu_device));
|
||||
_device=num;
|
||||
}
|
||||
|
||||
// List all devices along with all properties
|
||||
inline void UCL_Device::print_all(std::ostream &out) {
|
||||
#if CUDA_VERSION >= 2020
|
||||
int driver_version;
|
||||
cuDriverGetVersion(&driver_version);
|
||||
out << "CUDA Driver Version: "
|
||||
<< driver_version/1000 << "." << driver_version%100
|
||||
<< std::endl;
|
||||
#endif
|
||||
|
||||
if (num_devices() == 0)
|
||||
out << "There is no device supporting CUDA\n";
|
||||
for (int i=0; i<num_devices(); ++i) {
|
||||
out << "\nDevice " << i << ": \"" << name(i) << "\"\n";
|
||||
out << " Type of device: "
|
||||
<< device_type_name(i).c_str() << std::endl;
|
||||
out << " Compute capability: "
|
||||
<< arch(i) << std::endl;
|
||||
out << " Double precision support: ";
|
||||
if (double_precision(i))
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
out << " Total amount of global memory: "
|
||||
<< gigabytes(i) << " GB\n";
|
||||
#if CUDA_VERSION >= 2000
|
||||
out << " Number of compute units/multiprocessors: "
|
||||
<< _properties[i].multiProcessorCount << std::endl;
|
||||
out << " Number of cores: "
|
||||
<< cores(i) << std::endl;
|
||||
#endif
|
||||
out << " Total amount of constant memory: "
|
||||
<< _properties[i].p.totalConstantMemory << " bytes\n";
|
||||
out << " Total amount of local/shared memory per block: "
|
||||
<< _properties[i].p.sharedMemPerBlock << " bytes\n";
|
||||
out << " Total number of registers available per block: "
|
||||
<< _properties[i].p.regsPerBlock << std::endl;
|
||||
out << " Warp size: "
|
||||
<< _properties[i].p.SIMDWidth << std::endl;
|
||||
out << " Maximum number of threads per block: "
|
||||
<< _properties[i].p.maxThreadsPerBlock << std::endl;
|
||||
out << " Maximum group size (# of threads per block) "
|
||||
<< _properties[i].p.maxThreadsDim[0] << " x "
|
||||
<< _properties[i].p.maxThreadsDim[1] << " x "
|
||||
<< _properties[i].p.maxThreadsDim[2] << std::endl;
|
||||
out << " Maximum item sizes (# threads for each dim) "
|
||||
<< _properties[i].p.maxGridSize[0] << " x "
|
||||
<< _properties[i].p.maxGridSize[1] << " x "
|
||||
<< _properties[i].p.maxGridSize[2] << std::endl;
|
||||
out << " Maximum memory pitch: "
|
||||
<< max_pitch(i) << " bytes\n";
|
||||
out << " Texture alignment: "
|
||||
<< _properties[i].p.textureAlign << " bytes\n";
|
||||
out << " Clock rate: "
|
||||
<< clock_rate(i) << " GHz\n";
|
||||
#if CUDA_VERSION >= 2020
|
||||
out << " Run time limit on kernels: ";
|
||||
if (_properties[i].kernelExecTimeoutEnabled)
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
out << " Integrated: ";
|
||||
if (_properties[i].integrated)
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
out << " Support host page-locked memory mapping: ";
|
||||
if (_properties[i].canMapHostMemory)
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
#endif
|
||||
#if CUDA_VERSION >= 3000
|
||||
out << " Concurrent kernel execution: ";
|
||||
if (_properties[i].concurrentKernels)
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
out << " Device has ECC support enabled: ";
|
||||
if (_properties[i].ECCEnabled)
|
||||
out << "Yes\n";
|
||||
else
|
||||
out << "No\n";
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
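
// A minimal usage sketch (hypothetical, not from the original sources)
// showing how the UCL_Device class above can enumerate GPUs and select one;
// it assumes only the interface declared in this header.
#include "nvd_device.h"
#include <iostream>

int main() {
  ucl_cudadr::UCL_Device dev;      // calls cuInit(0) and gathers properties
  dev.print_all(std::cout);        // driver version plus per-device details
  if (dev.num_devices()>0) {
    dev.set(0);                    // creates a context and a default stream
    std::cout << dev.name() << ": " << dev.gigabytes() << " GB\n";
  }
  return 0;
}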
@ -0,0 +1,259 @@
/***************************************************************************
                                nvd_kernel.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with CUDA Driver kernels

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Tue Feb 9 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef NVD_KERNEL
#define NVD_KERNEL

#include "nvd_device.h"
#include <fstream>

namespace ucl_cudadr {

class UCL_Texture;

/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
 public:
  inline UCL_Program(UCL_Device &device) {}
  inline ~UCL_Program() {}

  /// Initialize the program with a device
  inline void init(UCL_Device &device) { }

  /// Clear any data associated with program
  /** \note Must call init() after each clear **/
  inline void clear() { }

  /// Load a program from a file and compile with flags
  inline int load(const char *filename, const char *flags="",
                  std::string *log=NULL) {
    std::ifstream in(filename);
    if (!in || in.is_open()==false) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not open kernel file: "
                << filename << std::endl;
      exit(1);
      #endif
      return UCL_FILE_NOT_FOUND;
    }

    std::string program((std::istreambuf_iterator<char>(in)),
                        std::istreambuf_iterator<char>());
    in.close();
    return load_string(program.c_str(),flags,log);
  }

  /// Load a program from a string and compile with flags
  inline int load_string(const char *program, const char *flags="",
                         std::string *log=NULL) {
    if (std::string(flags)=="BINARY")
      return load_binary(program);
    const unsigned int num_opts=2;
    CUjit_option options[num_opts];
    void *values[num_opts];

    // set up size of compilation log buffer
    options[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    values[0] = (void *)(int)10240;
    // set up pointer to the compilation log buffer
    options[1] = CU_JIT_INFO_LOG_BUFFER;
    char clog[10240];
    values[1] = clog;

    CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
                                    options,(void **)values);

    if (log!=NULL)
      *log=std::string(clog);

    if (err != CUDA_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << std::endl
         << "----------------------------------------------------------\n"
         << " UCL Error: Error compiling PTX Program...\n"
         << "----------------------------------------------------------\n";
      std::cerr << clog << std::endl;
      #endif
      return UCL_COMPILE_ERROR;
    }

    return UCL_SUCCESS;
  }

  /// Load a precompiled program from a file
  inline int load_binary(const char *filename) {
    // Load into the class member _module (a local CUmodule declaration here
    // would shadow the member and discard the loaded module).
    CUresult err = cuModuleLoad(&_module,filename);
    if (err==301) {  // 301 == CUDA_ERROR_FILE_NOT_FOUND
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not open binary kernel file: "
                << filename << std::endl;
      exit(1);
      #endif
      return UCL_FILE_NOT_FOUND;
    } else if (err!=CUDA_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Error loading binary kernel file: "
                << filename << std::endl;
      exit(1);
      #endif
      return UCL_FILE_NOT_FOUND;
    }
    //int ucl_error=UCL_SUCCESS;
    //if (err==301)
    //  return UCL_FILE_NOT_FOUND;
    //else if (err!=CUDA_SUCCESS)
    //  return UCL_ERROR;
    return UCL_SUCCESS;
  }

  friend class UCL_Kernel;
 private:
  CUmodule _module;
  friend class UCL_Texture;
};

/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
 public:
  UCL_Kernel() : _dimensions(1), _num_args(0), _param_size(0)
    { _num_blocks[0]=0; }

  UCL_Kernel(UCL_Program &program, const char *function) :
    _dimensions(1), _num_args(0), _param_size(0)
    { _num_blocks[0]=0; set_function(program,function); }

  ~UCL_Kernel() {}

  /// Clear any function associated with the kernel
  inline void clear() { }

  /// Get the kernel function from a program
  /** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
  inline int set_function(UCL_Program &program, const char *function) {
    CUresult err=cuModuleGetFunction(&_kernel,program._module,function);
    if (err!=CUDA_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not find function: " << function
                << " in program.\n";
      exit(1);
      #endif
      return UCL_FUNCTION_NOT_FOUND;
    }
    return UCL_SUCCESS;
  }

  /// Set the kernel argument.
  /** If not a device pointer, this must be repeated each time the argument
    * changes
    * \note To set kernel parameter i (i>0), parameter i-1 must be set **/
  template <class dtype>
  inline void set_arg(const unsigned index, dtype *arg) {
    if (index==_num_args)
      add_arg(arg);
    else if (index<_num_args)
      CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
    else
      assert(0==1); // Must add kernel parameters in sequential order
  }

  /// Add a kernel argument.
  inline void add_arg(const CUdeviceptr* const arg) {
    void* ptr = (void*)(size_t)(*arg);
    _param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
    CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
    _offsets.push_back(_param_size);
    _param_size+=sizeof(ptr);
    _num_args++;
  }

  /// Add a kernel argument.
  template <class dtype>
  inline void add_arg(const dtype* const arg) {
    _param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
    CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
    _offsets.push_back(_param_size);
    _param_size+=sizeof(dtype);
    _num_args++;
  }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note This should be called after all arguments have been added **/
  inline void set_size(const size_t num_blocks, const size_t block_size) {
    _dimensions=1;
    _num_blocks[0]=num_blocks;
    _num_blocks[1]=1;
    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
  }

  /// Set the number of thread blocks and the number of threads in each block
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
  }

  /// Set the number of thread blocks and the number of threads in each block
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x,
                       const size_t block_size_y, const size_t block_size_z) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
                                     block_size_z));
  }

  /// Run the kernel in the default command queue
  inline void run() {
    CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],0));
  }

  /// Run the kernel in the specified command queue
  inline void run(command_queue &cq) {
    CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],cq));
  }

  /// Clear any arguments associated with the kernel
  inline void clear_args() { _num_args=0; _offsets.clear(); _param_size=0; }

  #include "ucl_arg_kludge.h"

 private:
  CUfunction _kernel;
  unsigned _dimensions;
  unsigned _num_blocks[2];
  unsigned _num_args;
  std::vector<unsigned> _offsets;
  unsigned _param_size;
  friend class UCL_Texture;
};

} // namespace

#endif
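
// A minimal usage sketch (hypothetical, not from the original sources)
// showing the intended UCL_Program/UCL_Kernel call sequence. The PTX file
// name "scale.ptx", the kernel name "scale", and the sizes are made up for
// illustration.
#include "nvd_kernel.h"
#include <iostream>

inline void example_launch(ucl_cudadr::UCL_Device &dev) {
  using namespace ucl_cudadr;
  UCL_Program prog(dev);
  std::string log;
  if (prog.load("scale.ptx","",&log)!=UCL_SUCCESS) {  // JIT the PTX module
    std::cerr << log;
    return;
  }
  UCL_Kernel k(prog,"scale");          // look up the kernel function

  int n=1024;
  CUdeviceptr d_x;
  CU_SAFE_CALL(cuMemAlloc(&d_x,n*sizeof(float)));
  float alpha=2.0f;

  k.add_arg(&d_x);                     // arguments must be added in order
  k.add_arg(&alpha);
  k.add_arg(&n);
  k.set_size(n/128,128);               // 8 blocks of 128 threads
  k.run(dev.cq());                     // asynchronous launch on the stream
}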
@ -0,0 +1,57 @@
#ifndef NVD_MACROS_H
#define NVD_MACROS_H

#include <stdio.h>
#include <cassert>
#include <cuda.h>

#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
#else
#define CUDA_INT_TYPE unsigned
#endif

#ifdef MPI_GERYON
#include "mpi.h"
#define NVD_GERYON_EXIT MPI_Abort(MPI_COMM_WORLD,-1)
#else
#define NVD_GERYON_EXIT assert(0==1)
#endif

#ifndef UCL_NO_API_CHECK

#define CU_SAFE_CALL_NS( call ) do {                                         \
    CUresult err = call;                                                     \
    if( CUDA_SUCCESS != err) {                                               \
      fprintf(stderr, "Cuda driver error %d in call at file '%s' in line %i.\n", \
              err, __FILE__, __LINE__ );                                     \
      NVD_GERYON_EXIT;                                                       \
    } } while (0)

#ifdef UCL_SYNC_DEBUG

#define CU_SAFE_CALL( call ) do {                                            \
    CU_SAFE_CALL_NS( call );                                                 \
    CUresult err=cuCtxSynchronize();                                         \
    if( CUDA_SUCCESS != err) {                                               \
      fprintf(stderr, "Cuda driver error %d in file '%s' in line %i.\n",     \
              err, __FILE__, __LINE__ );                                     \
      NVD_GERYON_EXIT;                                                       \
    } } while (0)

#else

#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )

#endif

#else  // not DEBUG

// void macros for performance reasons
#define CU_SAFE_CALL_NS( call ) call
#define CU_SAFE_CALL( call) call

#endif

#endif
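
// A minimal sketch (hypothetical, not from the original sources) of how the
// macros above wrap CUresult-returning driver calls; defining
// UCL_NO_API_CHECK compiles the checks away.
#include "nvd_macros.h"

inline void example_alloc(CUdeviceptr &ptr, const size_t bytes) {
  // On failure this prints the error location and invokes NVD_GERYON_EXIT
  // (assert, or MPI_Abort when MPI_GERYON is defined).
  CU_SAFE_CALL(cuMemAlloc(&ptr,bytes));
}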
@ -0,0 +1,54 @@
/***************************************************************************
                                 nvd_mat.h
                             -------------------
                               W. Michael Brown

  CUDA Driver Specific Vector/Matrix Containers, Memory Management, and I/O

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jan 21 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

/*! \file */

#ifndef NVD_MAT_H
#define NVD_MAT_H

#include "nvd_memory.h"

/// Namespace for CUDA Driver routines
namespace ucl_cudadr {

#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW

#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW

#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW

} // namespace ucl_cudadr

#endif
@ -0,0 +1,610 @@
/***************************************************************************
                                nvd_memory.h
                             -------------------
                               W. Michael Brown

  CUDA Driver Specific Memory Management and Vector/Matrix Containers

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jan 21 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef NVD_MEMORY_H
#define NVD_MEMORY_H

#include <iostream>
#include <cassert>
#include <cstring>
#include "nvd_macros.h"
#include "ucl_types.h"

namespace ucl_cudadr {

// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
//typedef dim3 ucl_kernel_dim;

// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
typedef CUdeviceptr device_ptr;

// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
                       const enum UCL_MEMOPT kind) {
  CUresult err=CUDA_SUCCESS;
  if (kind==UCL_RW_OPTIMIZED)
    err=cuMemAllocHost((void **)mat.host_ptr(),n);
  else if (kind==UCL_WRITE_OPTIMIZED)
    err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
  else
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
  if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                       const enum UCL_MEMOPT kind) {
  CUresult err=CUDA_SUCCESS;
  if (kind==UCL_RW_OPTIMIZED)
    err=cuMemAllocHost((void **)mat.host_ptr(),n);
  else if (kind==UCL_WRITE_OPTIMIZED)
    err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
  else
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
  if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

template <class mat_type>
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
  if (kind!=UCL_NOT_PINNED)
    CU_SAFE_CALL(cuMemFreeHost(mat.begin()));
  else
    free(mat.begin());
}

// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
                         const enum UCL_MEMOPT kind) {
  CUresult err=cuMemAlloc(&mat.cbegin(),n);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                         const enum UCL_MEMOPT kind) {
  CUresult err=cuMemAlloc(&mat.cbegin(),n);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  CUresult err;
  CUDA_INT_TYPE upitch;
  err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                      cols*sizeof(typename mat_type::data_type),rows,16);
  pitch=static_cast<size_t>(upitch);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  CUresult err;
  CUDA_INT_TYPE upitch;  // matches the driver-API pitch type for this CUDA version
  err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                      cols*sizeof(typename mat_type::data_type),rows,16);
  pitch=static_cast<size_t>(upitch);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

template <class mat_type>
inline void _device_free(mat_type &mat) {
  CU_SAFE_CALL(cuMemFree(mat.cbegin()));
}

inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
  *ptr=in;
}

template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
  *ptr=0;
}

inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
                         const size_t offset, const size_t numsize) {
  *ptr=in+offset*numsize;
}

template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in,
                         const size_t offset, const size_t numsize) {
  *ptr=0;
}

// --------------------------------------------------------------------------
// - DEVICE IMAGE ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                                const size_t cols) {
  assert(0==1);
}

template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
                                const size_t cols) {
  assert(0==1);
}

template <class mat_type>
inline void _device_image_free(mat_type &mat) {
  assert(0==1);
}

// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
inline void _host_zero(void *ptr, const size_t n) {
  memset(ptr,0,n);
}

template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n) {
  if (n%32==0)
    CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4));
  else if (n%16==0)
    CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2));
  else
    CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n));
}

// --------------------------------------------------------------------------
// - HELPER FUNCTIONS FOR MEMCPY ROUTINES
// --------------------------------------------------------------------------

inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
                            const size_t spitch, const size_t cols,
                            const size_t rows) {
  ins.srcXInBytes=0;
  ins.srcY=0;
  ins.srcPitch=spitch;
  ins.dstXInBytes=0;
  ins.dstY=0;
  ins.dstPitch=dpitch;
  ins.WidthInBytes=cols;
  ins.Height=rows;
}

template <int mem> struct _nvd_set_2D_mem;
template <> struct _nvd_set_2D_mem<1>
  { static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
template <> struct _nvd_set_2D_mem<2>
  { static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
template <int mem> struct _nvd_set_2D_mem
  { static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };


// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------

template<int mem1, int mem2> struct _ucl_memcpy;

// Both are images
template<> struct _ucl_memcpy<2,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Destination is texture, source on device
template<> struct _ucl_memcpy<2,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Destination is texture, source on host
template<> struct _ucl_memcpy<2,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Source is texture, dest on device
template<> struct _ucl_memcpy<0,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Source is texture, dest on host
template<> struct _ucl_memcpy<1,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Neither are textures, destination on host
template <> struct _ucl_memcpy<1,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n) {
    CU_SAFE_CALL(cuMemcpyDtoH(dst.begin(),src.cbegin(),n));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq) {
    CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Neither are textures, source on host
template <> struct _ucl_memcpy<0,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n) {
    CU_SAFE_CALL(cuMemcpyHtoD(dst.cbegin(),src.begin(),n));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq) {
    CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Neither are textures, both on host
template <> struct _ucl_memcpy<1,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { memcpy(dst.begin(),src.begin(),n); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq)
    { memcpy(dst.begin(),src.begin(),n); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Neither are textures, both on device
template <int mem1, int mem2> struct _ucl_memcpy {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n) {
    CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq) {
    CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    if (p1::PADDED==0 || p2::PADDED==0) {
      size_t src_offset=0, dst_offset=0;
      for (size_t i=0; i<rows; i++) {
        CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
                                  src.cbegin()+src_offset,cols));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
    } else {
      CUDA_MEMCPY2D ins;
      _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
      ins.dstDevice=dst.cbegin();
      ins.srcDevice=src.cbegin();
      CU_SAFE_CALL(cuMemcpy2D(&ins));
    }
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    if (p1::PADDED==0 || p2::PADDED==0) {
      size_t src_offset=0, dst_offset=0;
      for (size_t i=0; i<rows; i++) {
        CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
                                  src.cbegin()+src_offset,cols));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
    } else {
      CUDA_MEMCPY2D ins;
      _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
      ins.dstDevice=dst.cbegin();
      ins.srcDevice=src.cbegin();
      CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
    }
  }
};

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n);
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
                       CUstream &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,cq);
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                 rows);
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows,CUstream &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                 rows,cq);
}

} // namespace ucl_cudadr

#endif
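
// A minimal sketch (hypothetical, not from the original sources) of how
// ucl_mv_cpy above dispatches on the MEM_TYPE constant of its arguments at
// compile time (0=device, 1=host, 2=texture/array). HostBuf and DevBuf are
// made-up stand-ins for the real UCL containers, pared down to the members
// the copy path touches.
#include "nvd_memory.h"

struct HostBuf {
  enum { MEM_TYPE=1, PADDED=0 };
  float *data;
  float * begin() const { return data; }
};

struct DevBuf {
  enum { MEM_TYPE=0, PADDED=0 };
  CUdeviceptr data;
  CUdeviceptr cbegin() const { return data; }
};

inline void example_upload(DevBuf &d, const HostBuf &h, const size_t n) {
  // Resolves to _ucl_memcpy<0,1>::mc, i.e. a blocking cuMemcpyHtoD.
  ucl_cudadr::ucl_mv_cpy(d,h,n*sizeof(float));
}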
@ -0,0 +1,71 @@
/***************************************************************************
                               nvd_texture.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with CUDA Driver textures

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Fri Jul 2 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef NVD_TEXTURE
#define NVD_TEXTURE

#include "nvd_kernel.h"
#include "nvd_mat.h"

namespace ucl_cudadr {

/// Class storing a texture reference
class UCL_Texture {
 public:
  UCL_Texture() {}
  ~UCL_Texture() {}
  /// Construct with a specified texture reference
  inline UCL_Texture(UCL_Program &prog, const char *texture_name)
    { get_texture(prog,texture_name); }
  /// Set the texture reference for this object
  inline void get_texture(UCL_Program &prog, const char *texture_name)
    { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }

  /// Bind a float array where each fetch grabs a vector of length numel
  template<class mat_typ>
  inline void bind_float(mat_typ &vec, const unsigned numel) {
    #ifdef UCL_DEBUG
    assert(numel!=0 && numel<5);
    #endif
    CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
                                    vec.numel()*vec.element_size()));
    CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
  }

  /// Unbind the texture reference from the memory allocation
  inline void unbind() { }

  /// Make a texture reference available to kernel
  inline void allow(UCL_Kernel &kernel) {
    CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
  }

 private:
  CUtexref _tex;
  friend class UCL_Kernel;
};

} // namespace

#endif
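
// A minimal sketch (hypothetical, not from the original sources) of the
// bind/allow pattern for UCL_Texture above; "pos_tex" is a made-up texture
// reference name that would have to exist in the loaded module, and the
// container type must provide cbegin(), numel(), and element_size().
#include "nvd_texture.h"

template<class mat_typ>
inline void example_texture(ucl_cudadr::UCL_Program &prog,
                            ucl_cudadr::UCL_Kernel &k, mat_typ &positions) {
  ucl_cudadr::UCL_Texture tex(prog,"pos_tex");  // texref from the module
  tex.bind_float(positions,4);   // each fetch returns a packed float4
  tex.allow(k);                  // kernel may now sample through the texref
}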
@ -0,0 +1,106 @@
/***************************************************************************
                                nvd_timer.h
                             -------------------
                               W. Michael Brown

  Class for timing CUDA Driver routines

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Fri Jan 22 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef NVD_TIMER_H
#define NVD_TIMER_H

#include "nvd_macros.h"

namespace ucl_cudadr {

/// Class for timing CUDA Driver events
class UCL_Timer {
 public:
  inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
  inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
    { init(dev); }

  inline ~UCL_Timer() { clear(); }

  /// Clear any data associated with timer
  /** \note init() must be called to reuse timer after a clear() **/
  inline void clear() {
    if (_initialized) {
      CU_SAFE_CALL(cuEventDestroy(start_event));
      CU_SAFE_CALL(cuEventDestroy(stop_event));
      _initialized=false;
      _total_time=0.0;
    }
  }

  /// Initialize default command queue for timing
  inline void init(UCL_Device &dev) { init(dev, dev.cq()); }

  /// Initialize command queue for timing
  inline void init(UCL_Device &dev, command_queue &cq) {
    clear();
    _cq=cq;
    _initialized=true;
    CU_SAFE_CALL( cuEventCreate(&start_event,0) );
    CU_SAFE_CALL( cuEventCreate(&stop_event,0) );
  }

  /// Start timing on command queue
  inline void start() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); }

  /// Stop timing on command queue
  inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }

  /// Set the time elapsed to zero (not the total_time)
  inline void zero() {
    CU_SAFE_CALL(cuEventRecord(start_event,_cq));
    CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
  }

  /// Add time from previous start and stop to total
  /** Forces synchronization **/
  inline double add_to_total()
    { double t=time(); _total_time+=t; return t/1000.0; }

  /// Return the time (ms) of last start to stop - Forces synchronization
  inline double time() {
    float timer;
    CU_SAFE_CALL(cuEventSynchronize(stop_event));
    CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) );
    return timer;
  }

  /// Return the time (s) of last start to stop - Forces synchronization
  inline double seconds() { return time()/1000.0; }

  /// Return the total time in ms
  inline double total_time() { return _total_time; }

  /// Return the total time in seconds
  inline double total_seconds() { return _total_time/1000.0; }

 private:
  CUevent start_event, stop_event;
  CUstream _cq;
  double _total_time;
  bool _initialized;
};

} // namespace

#endif
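
// A minimal sketch (hypothetical, not from the original sources) of the
// start/stop pattern for UCL_Timer above; the kernel argument stands in for
// whatever asynchronous work is being timed, and nvd_kernel.h is assumed to
// be available.
#include "nvd_timer.h"
#include "nvd_kernel.h"

inline double example_time(ucl_cudadr::UCL_Device &dev,
                           ucl_cudadr::UCL_Kernel &k) {
  ucl_cudadr::UCL_Timer t(dev);  // events recorded on the default stream
  t.start();
  k.run(dev.cq());               // asynchronous work being timed
  t.stop();
  return t.seconds();            // synchronizes on the stop event
}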
@ -0,0 +1,449 @@
/***************************************************************************
                                ocl_device.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with OpenCL devices

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Mon Dec 23 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef OCL_DEVICE
#define OCL_DEVICE

#include <string>
#include <vector>
#include <iostream>

#include "CL/cl.h"
#include "CL/cl_platform.h"
#include "ocl_macros.h"
#include "ucl_types.h"

namespace ucl_opencl {

// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef cl_command_queue command_queue;
typedef cl_context context_type;

inline void ucl_sync(cl_command_queue &cq) {
  CL_SAFE_CALL(clFinish(cq));
}

struct OCLProperties {
  std::string name;
  cl_device_type device_type;
  cl_ulong global_mem;
  cl_ulong shared_mem;
  cl_ulong const_mem;
  cl_uint compute_units;
  cl_uint clock;
  size_t work_group_size;
  size_t work_item_size[3];
  bool double_precision;
  int alignment;
  size_t timer_resolution;
};

/// Class for looking at data parallel device properties
/** \note Calls that change the device outside of this class result in
  * incorrect behavior
  * \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
 public:
  /// Collect properties for every device on the node
  /** \note You must set the active GPU with set() before using the device **/
  UCL_Device();

  ~UCL_Device();

  /// Return the number of platforms (0 if error or no platforms)
  inline int num_platforms() { return _num_platforms; }

  /// Return a string with name and info of the current platform
  std::string platform_name();

  /// Return the number of devices that support OpenCL
  inline int num_devices() { return _num_devices; }

  /// Set the OpenCL device to the specified device number
  /** A context and default command queue will be created for the device **/
  void set(int num);

  /// Get the current device number
  inline int device_num() { return _device; }

  /// Returns the context for the current device
  inline cl_context & context() { return _context; }

  /// Returns the default stream for the current device
  inline command_queue & cq() { return cq(0); }

  /// Returns the stream indexed by i
  inline command_queue & cq(const int i) { return _cq[i]; }

  /// Block until all commands in the default stream have completed
  inline void sync() { sync(0); }

  /// Block until all commands in the specified stream have completed
  inline void sync(const int i) { ucl_sync(cq(i)); }

  /// Get the number of command queues currently available on device
  inline int num_queues() { return _cq.size(); }

  /// Add a command queue for device computations (with profiling enabled)
  inline void push_command_queue() {
    cl_int errorv;
    _cq.push_back(cl_command_queue());
    _cq.back()=clCreateCommandQueue(_context,_cl_device,
                                    CL_QUEUE_PROFILING_ENABLE,&errorv);
    if (errorv!=CL_SUCCESS) {
      std::cerr << "Could not create command queue on device: " << name()
                << std::endl;
      exit(1);
    }
  }

  /// Remove a stream for device computations
  /** \note You cannot delete the default stream **/
  inline void pop_command_queue() {
    if (_cq.size()<2) return;
    CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
    _cq.pop_back();
  }

  /// Get the current OpenCL device name
  inline std::string name() { return name(_device); }
  /// Get the OpenCL device name
  inline std::string name(const int i)
    { return std::string(_properties[i].name); }

  /// Get a string telling the type of the current device
  inline std::string device_type_name() { return device_type_name(_device); }
  /// Get a string telling the type of the device
  inline std::string device_type_name(const int i);

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i);

  /// Returns true if double precision is supported on the current device
  bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is supported on the device
  bool double_precision(const int i) {return _properties[i].double_precision;}

  /// Get the number of cores in the current device
  inline unsigned cores() { return cores(_device); }
  /// Get the number of cores
  inline unsigned cores(const int i)
    { if (device_type(i)==UCL_CPU) return _properties[i].compute_units;
      else return _properties[i].compute_units*8; }

  /// Get the gigabytes of global memory in the current device
  inline double gigabytes() { return gigabytes(_device); }
  /// Get the gigabytes of global memory
  inline double gigabytes(const int i)
    { return static_cast<double>(_properties[i].global_mem)/1073741824; }

  /// Get the bytes of global memory in the current device
  inline size_t bytes() { return bytes(_device); }
  /// Get the bytes of global memory
  inline size_t bytes(const int i) { return _properties[i].global_mem; }

  /// Return the GPGPU revision number for current device
  //inline double revision() { return revision(_device); }
  /// Return the GPGPU revision number
  //inline double revision(const int i)
  //  { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}

  /// Clock rate in GHz for current device
  inline double clock_rate() { return clock_rate(_device); }
  /// Clock rate in GHz
  inline double clock_rate(const int i) { return _properties[i].clock*1e-3;}

  /// Return the address alignment in bytes
  inline int alignment() { return alignment(_device); }
  /// Return the address alignment in bytes
  inline int alignment(const int i) { return _properties[i].alignment; }

  /// Return the timer resolution
  inline size_t timer_resolution() { return timer_resolution(_device); }
  /// Return the timer resolution
  inline size_t timer_resolution(const int i)
    { return _properties[i].timer_resolution; }

  /// Get the maximum number of threads per block
  inline size_t group_size() { return group_size(_device); }
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].work_group_size; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
  /// Return the maximum memory pitch in bytes
  inline size_t max_pitch(const int i) { return 0; }

  /// List all devices along with all properties
  void print_all(std::ostream &out);

  /// Return the OpenCL type for the device
  inline cl_device_id & cl_device() { return _cl_device; }

 private:
  int _num_platforms;                // Number of platforms
  int _platform;                     // UCL_Device ID for current platform
  cl_platform_id _cl_platform;       // OpenCL ID for current platform
  cl_context _context;               // Context used for accessing the device
  std::vector<cl_command_queue> _cq; // Command queues (_cq[0] is the default)
  int _device;                       // UCL_Device ID for current device
  cl_device_id _cl_device;           // OpenCL ID for current device
  std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices
  int _num_devices;                  // Number of devices
  std::vector<OCLProperties> _properties; // Properties for each device

  void add_properties(cl_device_id);
  void create_context();
};

// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
  cl_int errorv;
  cl_uint nplatforms;

  _cl_device=0;
  _device=-1;
  _num_devices=0;
  _platform=0;

  // --- Get Number of Platforms
  errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms);

  if (errorv!=CL_SUCCESS) {
    _num_platforms=0;
    return;
  } else
    _num_platforms=static_cast<int>(nplatforms);

  // --- Get Number of Devices
  cl_uint n;
  errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n);
  _num_devices=n;
  if (errorv!=CL_SUCCESS || _num_devices==0) {
    _num_devices=0;
    return;
  }
  cl_device_id device_list[_num_devices];
  CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
                              &n));

  // --- Store properties for each device
  for (int i=0; i<_num_devices; i++) {
    _cl_devices.push_back(device_list[i]);
    add_properties(device_list[i]);
  }
}

inline UCL_Device::~UCL_Device() {
  if (_device>-1) {
    for (size_t i=0; i<_cq.size(); i++) {
      CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
      _cq.pop_back();
    }
    CL_SAFE_CALL(clReleaseContext(_context));
  }
}

inline void UCL_Device::create_context() {
  cl_int errorv;
  // Note: props is prepared here but not currently passed to clCreateContext;
  // the context is created with default properties.
  cl_context_properties props[3];
  props[0]=CL_CONTEXT_PLATFORM;
  props[1]=_platform;
  props[2]=0;
  _context=clCreateContext(0,1,&_cl_device,NULL,NULL,&errorv);
  if (errorv!=CL_SUCCESS) {
    std::cerr << "Could not create context on device: " << name() << std::endl;
    exit(1);
  }
  push_command_queue();
}

inline void UCL_Device::add_properties(cl_device_id device_list) {
  OCLProperties op;
  char buffer[1024];

  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
  op.name=buffer;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE,
                               sizeof(op.global_mem),&op.global_mem,NULL));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_LOCAL_MEM_SIZE,
                               sizeof(op.shared_mem),&op.shared_mem,NULL));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
                               sizeof(op.const_mem),&op.const_mem,NULL));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_TYPE,
                               sizeof(op.device_type),&op.device_type,NULL));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_COMPUTE_UNITS,
                               sizeof(op.compute_units),&op.compute_units,
                               NULL));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CLOCK_FREQUENCY,
                               sizeof(op.clock),&op.clock,NULL));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_GROUP_SIZE,
                               sizeof(op.work_group_size),&op.work_group_size,
                               NULL));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_ITEM_SIZES,
                               3*sizeof(op.work_item_size[0]),op.work_item_size,
                               NULL));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                               sizeof(cl_uint),&op.alignment,NULL));
  op.alignment/=8;

  // Determine if double precision is supported
  cl_uint double_width;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                               sizeof(double_width),&double_width,NULL));
  if (double_width==0)
    op.double_precision=false;
  else
    op.double_precision=true;

  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PROFILING_TIMER_RESOLUTION,
                               sizeof(size_t),&op.timer_resolution,NULL));

  _properties.push_back(op);
}

inline std::string UCL_Device::platform_name() {
  char info[1024];

  CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
                                 NULL));
  std::string ans=std::string(info)+' ';

  CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info,
                                 NULL));
  ans+=std::string(info)+' ';

  CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info,
                                 NULL));
  ans+=std::string(info);

  return ans;
}

// Get a string telling the type of the device
inline std::string UCL_Device::device_type_name(const int i) {
  if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
    return "CPU";
  else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
    return "GPU";
  else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR)
    return "ACCELERATOR";
  else
    return "DEFAULT";
}

// Get the UCL enum for the type of the device
inline int UCL_Device::device_type(const int i) {
  if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
    return UCL_CPU;
  else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
    return UCL_GPU;
  else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR)
    return UCL_ACCELERATOR;
  else
    return UCL_DEFAULT;
}

// Set the OpenCL device to the specified device number
inline void UCL_Device::set(int num) {
  if (_device==num)
    return;

  if (_device>-1) {
    for (size_t i=0; i<_cq.size(); i++) {
      CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
      _cq.pop_back();
    }
    CL_SAFE_CALL(clReleaseContext(_context));
  }

  cl_device_id device_list[_num_devices];
  cl_uint n;
  CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
                              device_list,&n));

  _device=num;
  _cl_device=device_list[_device];
  create_context();
}

// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
  if (num_devices() == 0)
    out << "There is no device supporting OpenCL\n";
  for (int i=0; i<num_devices(); ++i) {
    out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
    out << "  Type of device:                                "
        << device_type_name(i).c_str() << std::endl;
    out << "  Double precision support:                      ";
    if (double_precision(i))
      out << "Yes\n";
    else
      out << "No\n";
    out << "  Total amount of global memory:                 "
        << gigabytes(i) << " GB\n";
    out << "  Number of compute units/multiprocessors:       "
        << _properties[i].compute_units << std::endl;
    //out << "  Number of cores:                               "
    //    << cores(i) << std::endl;
    out << "  Total amount of constant memory:               "
        << _properties[i].const_mem << " bytes\n";
    out << "  Total amount of local/shared memory per block: "
        << _properties[i].shared_mem << " bytes\n";
    //out << "  Total number of registers available per block: "
    //    << _properties[i].regsPerBlock << std::endl;
    //out << "  Warp size:                                     "
    //    << _properties[i].warpSize << std::endl;
    out << "  Maximum group size (# of threads per block)    "
        << _properties[i].work_group_size << std::endl;
    out << "  Maximum item sizes (# threads for each dim)    "
        << _properties[i].work_item_size[0] << " x "
        << _properties[i].work_item_size[1] << " x "
        << _properties[i].work_item_size[2] << std::endl;
    //out << "  Maximum sizes of each dimension of a grid:     "
    //    << _properties[i].maxGridSize[0] << " x "
    //    << _properties[i].maxGridSize[1] << " x "
    //    << _properties[i].maxGridSize[2] << std::endl;
    //out << "  Maximum memory pitch:                          "
    //    << _properties[i].memPitch << " bytes\n";
    //out << "  Texture alignment:                             "
    //    << _properties[i].textureAlignment << " bytes\n";
    out << "  Clock rate:                                    "
        << clock_rate(i) << " GHz\n";
    //out << "  Concurrent copy and execution:                 ";
  }
}

} // namespace

#endif
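
A minimal sketch of device selection, using only the members defined in the
header above:

#include <iostream>
#include "ocl_device.h"

using namespace ucl_opencl;

int main() {
  UCL_Device dev;                  // collects properties for every device
  if (dev.num_devices()==0) {
    std::cerr << "No OpenCL devices found.\n";
    return 1;
  }
  dev.print_all(std::cout);        // list every device and its properties
  dev.set(0);                      // create context and default queue on device 0
  std::cout << "Using: " << dev.name() << " ("
            << dev.device_type_name() << ")\n";
  dev.sync();                      // block on the default command queue
  return 0;
}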
@ -0,0 +1,254 @@
/***************************************************************************
                                ocl_kernel.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with OpenCL kernels

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Sun Feb 7 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef OCL_KERNEL
#define OCL_KERNEL

#include "ocl_device.h"
#include <fstream>

namespace ucl_opencl {

/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
 public:
  inline UCL_Program() : _init_done(false) {}
  inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
  inline ~UCL_Program() { clear(); }

  /// Initialize the program with a device
  inline void init(UCL_Device &device) {
    clear();
    _device=device.cl_device();
    _context=device.context();
    _cq=device.cq();
    CL_SAFE_CALL(clRetainContext(_context));
    CL_SAFE_CALL(clRetainCommandQueue(_cq));
    _init_done=true;
  }

  /// Clear any data associated with program
  /** \note Must call init() after each clear **/
  inline void clear() {
    if (_init_done) {
      CL_SAFE_CALL(clReleaseProgram(_program));
      CL_SAFE_CALL(clReleaseContext(_context));
      CL_SAFE_CALL(clReleaseCommandQueue(_cq));
      _init_done=false;
    }
  }

  /// Load a program from a file and compile with flags
  inline int load(const char *filename, const char *flags="",
                  std::string *log=NULL) {
    std::ifstream in(filename);
    if (!in || in.is_open()==false) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not open kernel file: "
                << filename << std::endl;
      exit(1);
      #endif
      return UCL_FILE_NOT_FOUND;
    }

    std::string program((std::istreambuf_iterator<char>(in)),
                        std::istreambuf_iterator<char>());
    in.close();
    return load_string(program.c_str(),flags,log);
  }

  /// Load a program from a string and compile with flags
  inline int load_string(const char *program, const char *flags="",
                         std::string *log=NULL) {
    cl_int error_flag;
    const char *prog=program;
    _program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag);
    CL_CHECK_ERR(error_flag);
    error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL);
    cl_build_status build_status;
    CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,
                                       CL_PROGRAM_BUILD_STATUS,
                                       sizeof(cl_build_status),&build_status,
                                       NULL));

    if (build_status != CL_SUCCESS || log!=NULL) {
      size_t ms;
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         0,NULL,&ms));
      char build_log[ms];
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         ms,build_log,NULL));

      if (log!=NULL)
        *log=std::string(build_log);

      if (build_status != CL_SUCCESS) {
        #ifndef UCL_NO_EXIT
        std::cerr << std::endl
                  << "----------------------------------------------------------\n"
                  << "            UCL Error: Error compiling OpenCL Program...\n"
                  << "----------------------------------------------------------\n";
        std::cerr << build_log << std::endl;
        #endif
        return UCL_COMPILE_ERROR;
      }
    }

    return UCL_SUCCESS;
  }

  friend class UCL_Kernel;
 private:
  bool _init_done;
  cl_program _program;
  cl_device_id _device;
  cl_context _context;
  cl_command_queue _cq;
};

/// Class for dealing with OpenCL kernels
class UCL_Kernel {
 public:
  UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0)
    { _block_size[0]=0; _num_blocks[0]=0; }

  inline UCL_Kernel(UCL_Program &program, const char *function) :
    _dimensions(1), _function_set(false), _num_args(0)
    { _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); }

  inline ~UCL_Kernel() { clear(); }

  /// Clear any function associated with the kernel
  inline void clear() {
    if (_function_set) {
      clReleaseKernel(_kernel);
      clReleaseProgram(_program);
      clReleaseCommandQueue(_cq);
      _function_set=false;
    }
  }

  /// Get the kernel function from a program
  /** \return UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
  inline int set_function(UCL_Program &program, const char *function);

  /// Set the kernel argument.
  /** If not a device pointer, this must be repeated each time the argument
    * changes **/
  template <class dtype>
  inline void set_arg(const cl_uint index, dtype *arg) {
    CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
    if (index>_num_args) _num_args=index;
  }

  /// Add a kernel argument.
  template <class dtype>
  inline void add_arg(dtype *arg) {
    CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
    _num_args++;
  }

  /// Set the number of thread blocks and the number of threads in each block
  /** \note For OpenCL, the stored size is the global work size,
    * num_blocks*block_size **/
  inline void set_size(const size_t num_blocks, const size_t block_size) {
    _dimensions=1;
    _num_blocks[0]=num_blocks*block_size;
    _block_size[0]=block_size;
  }

  /// Set the number of thread blocks and the number of threads in each block
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x, const size_t block_size_y) {
    _dimensions=2;
    _num_blocks[0]=num_blocks_x*block_size_x;
    _block_size[0]=block_size_x;
    _num_blocks[1]=num_blocks_y*block_size_y;
    _block_size[1]=block_size_y;
  }

  /// Set the number of thread blocks and the number of threads in each block
  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                       const size_t block_size_x,
                       const size_t block_size_y, const size_t block_size_z) {
    _dimensions=3;
    const size_t num_blocks_z=1;  // only one block is used in z
    _num_blocks[0]=num_blocks_x*block_size_x;
    _block_size[0]=block_size_x;
    _num_blocks[1]=num_blocks_y*block_size_y;
    _block_size[1]=block_size_y;
    _num_blocks[2]=num_blocks_z*block_size_z;
    _block_size[2]=block_size_z;
  }

  /// Run the kernel in the default command queue
  inline void run() {
    run(_cq);
  }

  /// Run the kernel in the specified command queue
  inline void run(command_queue &cq) {
    CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,_kernel,_dimensions,NULL,
                                        _num_blocks,_block_size,0,NULL,NULL));
  }

  /// Clear any arguments associated with the kernel
  inline void clear_args() { _num_args=0; }

  // Adds the add_args()/run()/run_cq() overloads for up to 20 arguments
  #include "ucl_arg_kludge.h"

 private:
  cl_kernel _kernel;
  cl_program _program;
  cl_uint _dimensions;
  size_t _block_size[3];
  size_t _num_blocks[3];
  bool _function_set;

  cl_command_queue _cq;         // The default command queue for this kernel
  unsigned _num_args;
};

inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
  clear();
  _function_set=true;
  _cq=program._cq;
  CL_SAFE_CALL(clRetainCommandQueue(_cq));
  _program=program._program;
  CL_SAFE_CALL(clRetainProgram(_program));
  cl_int error_flag;
  _kernel=clCreateKernel(program._program,function,&error_flag);

  if (error_flag!=CL_SUCCESS) {
    #ifndef UCL_NO_EXIT
    std::cerr << "UCL Error: Could not find function: " << function
              << " in program.\n";
    exit(1);
    #endif
    return UCL_FUNCTION_NOT_FOUND;
  }
  return UCL_SUCCESS;
}

} // namespace

#endif
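
A minimal sketch of compiling and launching a kernel with the classes above;
the kernel source, the sizes, and the placeholder buffer are illustrative
only:

#include <iostream>
#include "ocl_kernel.h"

using namespace ucl_opencl;

const char *src=
  "__kernel void scale(__global float *x, const float f)"
  "  { x[get_global_id(0)]*=f; }";

int main() {
  UCL_Device dev;
  dev.set(0);
  UCL_Program prog(dev);
  std::string log;
  if (prog.load_string(src,"",&log)!=UCL_SUCCESS) {
    std::cerr << log << std::endl;
    return 1;
  }
  UCL_Kernel k(prog,"scale");
  cl_mem x=0;                      // placeholder: a real allocation (e.g. from
                                   // the containers in ocl_mat.h) goes here
  float f=2.0f;
  k.add_arg(&x);                   // forwards to clSetKernelArg
  k.add_arg(&f);
  k.set_size(128,64);              // 128 blocks of 64 threads (global size 8192)
  k.run();                         // enqueue on the default command queue
  dev.sync();
  return 0;
}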
@ -0,0 +1,56 @@
/***************************************************************************
                                  ocl_mat.h
                             -------------------
                               W. Michael Brown

  OpenCL Specific Vector/Matrix Containers, Memory Management, and I/O

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Wed Jan 13 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

/*! \file */

#ifndef OCL_MAT_H
#define OCL_MAT_H

#include "ocl_memory.h"

/// Namespace for OpenCL routines
namespace ucl_opencl {

#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#define _OCL_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _OCL_MAT
#undef _UCL_MAT_ALLOW

#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW

#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW

} // namespace ucl_opencl

#endif
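
The macros above stamp out the generic host and device containers for the
OpenCL backend. A sketch of the intended use; the alloc() and ucl_copy()
signatures below are assumptions based on the included headers and are not
verified against this revision:

//   UCL_H_Vec<float> host;        // host vector from ucl_h_vec.h
//   UCL_D_Vec<float> dev_v;       // device vector from ucl_d_vec.h
//   host.alloc(1024,dev);         // assumed signature: (elements, device)
//   dev_v.alloc(1024,dev);
//   ucl_copy(dev_v,host,false);   // assumed: blocking host-to-device copy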
@ -0,0 +1,59 @@
/***************************************************************************
                                ocl_texture.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with OpenCL textures

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Fri Jul 2 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef OCL_TEXTURE
#define OCL_TEXTURE

#include "ocl_kernel.h"
#include "ocl_mat.h"

namespace ucl_opencl {

/// Class storing a texture reference
class UCL_Texture {
 public:
  UCL_Texture() {}
  ~UCL_Texture() {}
  /// Construct with a specified texture reference
  inline UCL_Texture(UCL_Program &prog, const char *texture_name) { }
  /// Set the texture reference for this object
  inline void get_texture(UCL_Program &prog, const char *texture_name) { }

  /// Bind a float array where each fetch grabs a vector of length numel
  template<class mat_typ>
  inline void bind_float(mat_typ &vec, const unsigned numel) { }

  /// Unbind the texture reference from the memory allocation
  inline void unbind() { }

  /// Make a texture reference available to kernel
  inline void allow(UCL_Kernel &kernel) { }

 private:
  friend class UCL_Kernel;
};

} // namespace

#endif
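
Every member above has an empty body: this backend does not bind texture
references, so the class exists only so that code written against the CUDA
backend compiles unchanged. A sketch of that shared pattern:

#include "ocl_texture.h"

using namespace ucl_opencl;

template <class vec_type>
void bind_positions(UCL_Kernel &k, UCL_Program &prog, vec_type &pos) {
  UCL_Texture pos_tex(prog,"pos_tex"); // no-op here; binds a texture on CUDA
  pos_tex.bind_float(pos,4);           // no-op here; float4 fetches on CUDA
  pos_tex.allow(k);                    // no-op here
}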
@ -0,0 +1,111 @@
/***************************************************************************
                                 ocl_timer.h
                             -------------------
                               W. Michael Brown

  Class for timing OpenCL routines

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Fri Jan 22 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef OCL_TIMER_H
#define OCL_TIMER_H

#include "ocl_macros.h"

namespace ucl_opencl {

/// Class for timing OpenCL events
class UCL_Timer {
 public:
  inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
  inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
    { init(dev); }

  inline ~UCL_Timer() { clear(); }

  /// Clear any data associated with timer
  /** \note init() must be called to reuse timer after a clear() **/
  inline void clear() {
    if (_initialized) {
      CL_SAFE_CALL(clReleaseCommandQueue(_cq));
      clReleaseEvent(start_event);
      clReleaseEvent(stop_event);
      _initialized=false;
      _total_time=0.0;
    }
  }

  /// Initialize default command queue for timing
  inline void init(UCL_Device &dev) { init(dev,dev.cq()); }

  /// Initialize command queue for timing
  inline void init(UCL_Device &dev, command_queue &cq) {
    clear();
    t_factor=dev.timer_resolution()/1000000000.0;
    _cq=cq;
    clRetainCommandQueue(_cq);
    _initialized=true;
  }

  /// Start timing on default command queue
  inline void start() { clEnqueueMarker(_cq,&start_event); }

  /// Stop timing on default command queue
  inline void stop() { clEnqueueMarker(_cq,&stop_event); }

  /// Set the time elapsed to zero (not the total_time)
  inline void zero()
    { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }

  /// Add time from previous start and stop to total
  /** Forces synchronization **/
  inline double add_to_total()
    { double t=time(); _total_time+=t; return t/1000.0; }

  /// Return the time (ms) of last start to stop - Forces synchronization
  inline double time() {
    cl_ulong tstart,tend;
    CL_SAFE_CALL(clWaitForEvents(1,&stop_event));
    CL_SAFE_CALL(clGetEventProfilingInfo(stop_event,
                                         CL_PROFILING_COMMAND_START,
                                         sizeof(cl_ulong), &tend, NULL));
    CL_SAFE_CALL(clGetEventProfilingInfo(start_event,
                                         CL_PROFILING_COMMAND_END,
                                         sizeof(cl_ulong), &tstart, NULL));
    return (tend-tstart)*t_factor;
  }

  /// Return the time (s) of last start to stop - Forces synchronization
  inline double seconds() { return time()/1000.0; }

  /// Return the total time in ms
  inline double total_time() { return _total_time; }

  /// Return the total time in seconds
  inline double total_seconds() { return _total_time/1000.0; }

 private:
  cl_event start_event, stop_event;
  cl_command_queue _cq;
  double _total_time;
  bool _initialized;
  double t_factor;
};

} // namespace

#endif
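
A minimal sketch of the profiling-marker timer above; start() and stop()
enqueue markers, so the measured interval covers whatever was enqueued
between them:

#include <iostream>
#include "ocl_device.h"
#include "ocl_timer.h"

using namespace ucl_opencl;

int main() {
  UCL_Device dev;
  dev.set(0);                      // queues are created with profiling enabled
  UCL_Timer timer(dev);
  for (int iter=0; iter<10; iter++) {
    timer.start();
    // ... enqueue kernels/copies on dev.cq() ...
    timer.stop();
    timer.add_to_total();          // synchronizes and accumulates
  }
  std::cout << "total: " << timer.total_seconds() << " s\n";
  return 0;
}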
@ -0,0 +1,673 @@
/***************************************************************************
                               ucl_arg_kludge.h
                             -------------------
                               W. Michael Brown

  Allow multiple arguments to be added for a kernel call at a single time

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Sun Feb 7 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */
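
The body of this file is a long list of hand-written overloads for 1 to 20
arguments; the C++ standard targeted here has no variadic templates, so each
arity is spelled out. Included inside UCL_Kernel (see ocl_kernel.h), they
allow calls such as:

//   k.add_args(&an_int,&a_float,&a_mem_handle);  // set all arguments at once
//   k.run();                                     // default command queue
// or, equivalently, in one call:
//   k.run(&an_int,&a_float,&a_mem_handle);
// and on an explicit queue:
//   k.run_cq(some_queue,&an_int,&a_float,&a_mem_handle);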

template <class t1, class t2>
inline void add_args(t1 *a1, t2 *a2) {
  add_arg(a1); add_arg(a2);
}

template <class t1, class t2, class t3>
inline void add_args(t1 *a1, t2 *a2, t3 *a3) {
  add_arg(a1); add_arg(a2); add_arg(a3);
}

template <class t1, class t2, class t3, class t4>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
}

template <class t1, class t2, class t3, class t4, class t5>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
}


// ---------------------------------------------------------------------------

template <class t1>
inline void run(t1 *a1) {
  clear_args();
  add_arg(a1);
  run();
}

template <class t1, class t2>
inline void run(t1 *a1, t2 *a2) {
  clear_args();
  add_arg(a1); add_arg(a2);
  run();
}

template <class t1, class t2, class t3>
inline void run(t1 *a1, t2 *a2, t3 *a3) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3);
  run();
}

template <class t1, class t2, class t3, class t4>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
  run();
}

template <class t1, class t2, class t3, class t4, class t5>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
  run();
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19, class t20>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
  run();
}

// ---------------------------------------------------------------------------

template <class t1>
inline void run_cq(command_queue &cq, t1 *a1) {
  clear_args();
  add_arg(a1);
  run(cq);
}

template <class t1, class t2>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2) {
  clear_args();
  add_arg(a1); add_arg(a2);
  run(cq);
}

template <class t1, class t2, class t3>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3);
  run(cq);
}

template <class t1, class t2, class t3, class t4>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10,
          class t11, class t12, class t13, class t14, class t15,
          class t16, class t17, class t18, class t19>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                   t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
                   t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
                   t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
  clear_args();
  add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
  add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
  add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
  add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
  run(cq);
}

template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
run(cq);
|
||||
}
|
||||
|
|
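// A usage sketch for the run_cq() overloads above (the kernel object name
// k_pair and the pointer names are hypothetical; the enclosing kernel class
// is not shown in this hunk). Each overload resets the kernel argument
// list, appends the pointers in order, and enqueues the kernel:
//
//   command_queue cq;   // stream/queue the kernel should be enqueued on
//   k_pair.run_cq(cq, x, f, type, cutsq, lj1, lj2, eng, ans);
//
// Every overload forwards to the same clear_args()/add_arg()/run()
// sequence, so supporting one more kernel argument is purely mechanical.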
@ -0,0 +1,77 @@
/***************************************************************************
                               ucl_basemat.h
                             -------------------
                               W. Michael Brown

  Vector/Matrix Base Container

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jun 25 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW

#include "ucl_types.h"

#define UCL_H_VecT UCL_H_Vec<numtyp>
#define UCL_H_VecD UCL_H_Vec<double>
#define UCL_H_VecS UCL_H_Vec<float>
#define UCL_H_VecI UCL_H_Vec<int>

#define UCL_D_VecT UCL_D_Vec<numtyp>
#define UCL_D_VecD UCL_D_Vec<double>
#define UCL_D_VecS UCL_D_Vec<float>
#define UCL_D_VecI UCL_D_Vec<int>
#define UCL_D_VecI2 UCL_D_Vec<int2>
#define UCL_D_VecU2 UCL_D_Vec<uint2>

#define UCL_D_MatT UCL_D_Mat<numtyp>
#define UCL_D_MatD UCL_D_Mat<double>
#define UCL_D_MatS UCL_D_Mat<float>
#define UCL_D_MatI UCL_D_Mat<int>

#define UCL_ConstMatT UCL_ConstMat<numtyp>
#define UCL_ConstMatD UCL_ConstMat<double>
#define UCL_ConstMatS UCL_ConstMat<float>
#define UCL_ConstMatI UCL_ConstMat<int>
#define UCL_ConstMatD2 UCL_ConstMat<double2>

/// Base class for vector/matrix containers
/** All containers are associated with a default command queue.
  * For CUDA, this is the default stream.
  *
  * The default queue is used for asynchronous operations on the container
  * that do not specify a queue. For OpenCL, this queue is also used in
  * calls for reserving and copying memory. **/
class UCL_BaseMat {
 public:
  UCL_BaseMat() : _cq(0) { }
  virtual ~UCL_BaseMat() { }
  /// Return the default command queue/stream associated with this data
  inline command_queue & cq() { return _cq; }
  /// Block until command_queue associated with matrix is complete
  inline void sync() { ucl_sync(_cq); }

#ifdef UCL_DEBUG
  // Returns the type of host allocation
  virtual inline enum UCL_MEMOPT kind() const { return UCL_NOT_PINNED; }
#endif
 protected:
  command_queue _cq;
};

#endif
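// A usage sketch (container names are hypothetical): every UCL container
// derives from UCL_BaseMat, so the default-queue interface is uniform
// across all of the vector/matrix types:
//
//   UCL_D_Vec<float> x;       // any container derived from UCL_BaseMat
//   ucl_copy(x,host_x,true);  // asynchronous copy in x's default queue
//   x.sync();                 // equivalent to ucl_sync(x.cq())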
@ -0,0 +1,826 @@
/***************************************************************************
                                 ucl_copy.h
                             -------------------
                               W. Michael Brown

  Routines for copying matrix/vector data onto and off coprocessor device

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Mon Jan 4 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

/***************************************************************************
   The ucl_copy and ucl_cast_copy routines provide a general prototype for
   copying data between host and device memory (including texture memory)
   for the matrix and vector types in nvc_memory.

   For host/host and host/device transfers, typecasting is performed
   automatically as necessary.

   The routines are written so that all branches can be removed by the
   compiler during template instantiation.

   The routines currently assume row-major ordering for all types.

   The async argument selects the copy mode: pass boolean true for an
   asynchronous copy in the default command queue, pass a command queue
   for an asynchronous copy in that queue, and pass boolean false for a
   blocking copy.

   When performing frequent data copies that require casting, it is more
   efficient to allocate a casting buffer once and then pass that buffer
   to the copy routine. This can be accomplished with the ucl_cast_copy
   routines.

   Examples
     (x's represent alignment padding)
     (o's represent a larger matrix in memory)
     (vectors are represented as a single row)
     ----------------------------------------------------------------
        dst                 src             command
     ----------------------------------------------------------------
     0 1 2 3 4      <--  0 1 2 3 4       ucl_copy(dst,src,async)

     0 1 2 3        <--  0 1 2 3 4       ucl_copy(dst,src,4,async)

     0 1 2          <--  0 1 2 3 4 5     ucl_copy(dst,src,async)
     3 4 5

     0 1 2 3 4 5    <--  0 1 2           ucl_copy(dst,src,async)
                         3 4 5

     0 1 2          <--  0 1 2           ucl_copy(dst,src,async)
     3 4 5               3 4 5

     0 1 2          <--  0 1 2           ucl_copy(dst,src,6,async)
     3 4 5               3 4 5
                         6 7 8

     0 1 2          <--  0 1 2 3         ucl_copy(dst,src,2,3,async)
     4 5 6               4 5 6 7
                         8 9 10 11

     0 1 2 x x      <--  0 1 2           ucl_copy(dst,src,async)
     3 4 5 x x           3 4 5

     0 1 2          <--  0 1 2 x x       ucl_copy(dst,src,async)
     3 4 5               3 4 5 x x

     0 1 2 o o      <--  0 1 2           ucl_copy(dst,src,2,3,async)
     3 4 5 o o           3 4 5
     o o o o o

     0 1 2 o o      <--  0 1 2 3 4 5     ucl_copy(dst,src,2,3,async)
     3 4 5 o o
     o o o o o

     0 1 o o o      <--  0 1 2 3 4 5     ucl_copy(dst,src,2,2,async)
     2 3 o o o
     o o o o o

     0 1 2 o o      <--  0  1  2  3  4   ucl_copy(dst,src,2,3,async)
     5 6 7 o o           5  6  7  8  9
     o o o o o          10 11 12 13 14

     0 1 2 5 6 7    <--  0  1  2  3  4   ucl_copy(dst,src,2,3,async)
                         5  6  7  8  9
                        10 11 12 13 14

 ***************************************************************************/
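// An illustrative sketch of the three async conventions described above
// (dev_v, host_v, and queue2 are hypothetical; both containers hold the
// same data type):
//
//   ucl_copy(dev_v,host_v,false);   // blocking copy
//   ucl_copy(dev_v,host_v,true);    // asynchronous, default queue of dev_v
//   ucl_copy(dev_v,host_v,queue2);  // asynchronous in the queue queue2
//   dev_v.sync();                   // block until the default queue drains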
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_COPY_ALLOW

// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
// --------------------------------------------------------------------------

// Have to use specialization because some types don't have operator[]
template <int host_t1, int host_t2> struct _host_host_copy;

// Both on host
template <> struct _host_host_copy<1,1> {
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
    #ifdef UCL_DEBUG
    assert(mat1::PADDED==0 && mat2::PADDED==0);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
      memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
    else
      for (size_t i=0; i<numel; i++)
        dst[i]=static_cast<typename mat1::data_type>(src[i]);
  }
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
                         const size_t cols) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    size_t dst_row_size, src_row_size;
    if (mat1::VECTOR)
      dst_row_size=cols;
    else
      dst_row_size=dst.row_size();
    if (mat2::VECTOR)
      src_row_size=cols;
    else
      src_row_size=src.row_size();
    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
      for (size_t i=0; i<rows; i++)
        memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
               cols*sizeof(typename mat1::data_type));
    else
      for (size_t j=0; j<rows; j++) {
        int dst_i=j*dst_row_size;
        int d_end=dst_i+cols;
        int src_i=j*src_row_size;
        for (; dst_i<d_end; dst_i++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
          src_i++;
        }
      }
  }
};

// Should never be here
template <int host_t1, int host_t2> struct _host_host_copy {
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
    assert(0==1);
  }
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
                         const size_t cols) {
    assert(0==1);
  }
};
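// A sketch of the host-to-host behavior above (container names are
// hypothetical): when both containers hold the same scalar type the copy
// is a single memcpy(); otherwise the elementwise static_cast loop runs:
//
//   UCL_H_Vec<double> src_d;        // host vector, double precision
//   UCL_H_Vec<float>  dst_s;        // host vector, single precision
//   ucl_copy(dst_s,src_d,n,false);  // dispatches to the cast loop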
// --------------------------------------------------------------------------
// - TEMPLATE HELPER FUNCTIONS FOR SPECIALIZED CASTING
// --------------------------------------------------------------------------

// Helper functions for ucl_cast_copy
template <int host_type1, int host_type2> struct _ucl_cast_copy;

// Destination is on host
template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type));
    for (size_t i=0; i<numel; i++)
      dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type),cq);
    cast_buffer.sync();
    for (size_t i=0; i<numel; i++)
      dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    // Asynchronous currently pointless here
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat1::VECTOR) {
      ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                 src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
      for (size_t i=0; i<rows*cols; i++)
        dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
    } else {
      if (mat2::VECTOR)
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   cols*sizeof(typename mat2::data_type),
                   cols*sizeof(typename mat2::data_type),rows);
      else
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   src.row_bytes(),cols*sizeof(typename mat2::data_type),
                   rows);
      int dst_i=0;
      int buff_i=0;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=dst.cols()-cols;
      }
    }
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    // Asynchronous currently pointless here
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat1::VECTOR) {
      ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                 src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,
                 cq);
      cast_buffer.sync();
      for (size_t i=0; i<rows*cols; i++)
        dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
    } else {
      if (mat2::VECTOR)
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   cols*sizeof(typename mat2::data_type),
                   cols*sizeof(typename mat2::data_type),rows,cq);
      else
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   src.row_bytes(),cols*sizeof(typename mat2::data_type),
                   rows,cq);
      cast_buffer.sync();
      int dst_i=0;
      int buff_i=0;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=dst.cols()-cols;
      }
    }
  }
};
// Source is on host
template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    for (size_t i=0; i<numel; i++)
      cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
    ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type));
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    for (size_t i=0; i<numel; i++)
      cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
    ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type),cq);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat2::VECTOR) {
      for (size_t i=0; i<rows*cols; i++)
        cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
                 cols*sizeof(typename mat1::data_type),
                 cols*sizeof(typename mat1::data_type),rows);
    } else if (mat1::VECTOR) {
      int src_i=0;
      int buf_i=0;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=src.cols()-cols;
      }
      ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
    } else {
      int src_i=0;
      int buf_i=0;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=src.cols()-cols;
      }
      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
                 cols*sizeof(typename mat1::data_type),
                 cols*sizeof(typename mat1::data_type),rows);
    }
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat2::VECTOR) {
      for (size_t i=0; i<rows*cols; i++)
        cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
      ucl_mv_cpy(dst,dst.row_bytes(),
                 cast_buffer,cols*sizeof(typename mat1::data_type),
                 cols*sizeof(typename mat1::data_type),rows,cq);
    } else if (mat1::VECTOR) {
      int src_i=0;
      int buf_i=0;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=src.cols()-cols;
      }
      ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
    } else {
      int src_i=0;
      int buf_i=0;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=src.cols()-cols;
      }
      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
                 cols*sizeof(typename mat1::data_type),
                 cols*sizeof(typename mat1::data_type),rows,cq);
    }
  }
};
// Both on host: host-host copies (with or without casting) are handled by
// the _host_host_copy routines, so these members should never be called
template <> struct _ucl_cast_copy<1,1> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    assert(0==1);
  }
};

// Neither on host: casting is not supported for device-to-device copies,
// so these members should never be called
template <> struct _ucl_cast_copy<0,0> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    assert(0==1);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    assert(0==1);
  }
};
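// The structs above use integer traits as compile-time tags: the MEM_TYPE
// values of the two containers select a specialization, so the host/device
// branch vanishes during template instantiation. A minimal standalone
// sketch of the same pattern (hypothetical names, same technique):
//
//   template <int on_host> struct _side;
//   template <> struct _side<1>
//     { static inline const char *name() { return "host"; } };
//   template <> struct _side<0>
//     { static inline const char *name() { return "device"; } };
//
// _side<mat::MEM_TYPE>::name() then resolves at compile time, exactly as
// _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(...) does below.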
// --------------------------------------------------------------------------
// - 1D COPY - SPECIFIED NUMBER OF ELEMENTS
// --------------------------------------------------------------------------

/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
  * \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - Padding for 2D matrices is not considered in this routine
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
                          mat3 &cast_buffer, command_queue &cq) {
  #ifdef UCL_DEBUG
  assert(dst.numel()>=numel && src.numel()>=numel);
  assert(cast_buffer.numel()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  #endif
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,numel,cq);
  else
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer,cq);
}
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
  * \param async Perform non-blocking copy on default stream
  * \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - Padding for 2D matrices is not considered in this routine
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
                          mat3 &cast_buffer, const bool async) {
  #ifdef UCL_DEBUG
  assert(dst.numel()>=numel && src.numel()>=numel);
  assert(cast_buffer.numel()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  #endif
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,numel,async);
  else if (async)
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer,dst.cq());
  else
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer);
}
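// A sketch of the permanent cast-buffer pattern recommended in the file
// documentation (names hypothetical): allocate the staging buffer once and
// reuse it for every cast copy instead of paying for a temporary
// allocation on each transfer:
//
//   UCL_D_Vec<float>  dev_f;     // device data, single precision
//   UCL_H_Vec<double> host_d;    // host data, double precision
//   UCL_H_Vec<float>  cast_buf;  // reusable staging storage
//   cast_buf.alloc(n,dev_f,UCL_WRITE_OPTIMIZED);
//   for (int step=0; step<nsteps; step++)
//     ucl_cast_copy(dev_f,host_d,n,cast_buf,true);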
/// Asynchronous copy of matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - Padding for 2D matrices is not considered in this routine
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
                     command_queue &cq) {
  #ifdef UCL_DEBUG
  assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  #endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer,cq);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer,cq);
    }
  } else
    ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
}
/// Copy matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
  * \param async Perform non-blocking copy (ignored for host to host copy)
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - Padding for 2D matrices is not considered in this routine
  * - The default stream is used for asynchronous copy
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
                     const bool async) {
  #ifdef UCL_DEBUG
  assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  #endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
  else if (async)
    ucl_copy(dst,src,numel,dst.cq());
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer);
    }
  } else
    ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
}
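// An example of the automatic cast path above (names hypothetical): with
// mismatched data types the routine stages the transfer through a
// temporary host buffer, so the call site stays a one-liner at the cost of
// one allocation per call:
//
//   UCL_D_Vec<float>  dev_f;         // device storage, single precision
//   UCL_H_Vec<double> host_d;        // host storage, double precision
//   ucl_copy(dev_f,host_d,n,false);  // allocates, casts, copies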
// --------------------------------------------------------------------------
// - 2D COPY - SPECIFIED NUMBER OF ROWS/COLS
// --------------------------------------------------------------------------

/// Asynchronous copy of subset matrix rows/cols with cast (Device/Host)
/** \param async Perform non-blocking copy on default stream
  * \param cast_buffer Buffer on host with enough storage for casting
  * - If src is a vector, routine assumes a row-major rows by cols copy
  * - If src is a matrix, routine will copy the upper left tile of the matrix
  * - If dst is a vector, routine assumes a row-major rows by cols copy
  * - If dst is a matrix, routine will copy into the upper left tile
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - Padding for 2D matrices is not considered in this routine
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
                          const size_t cols, mat3 &cast_buffer,
                          const bool async) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,rows,cols,async);
  else if (async)
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                      cast_buffer,dst.cq());
  else
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                      cast_buffer);
}
/// Asynchronous copy of subset matrix rows,cols with cast (Device/Host)
/** \param cast_buffer Buffer on host with enough storage for casting
  * - If src is a vector, routine assumes a row-major rows by cols copy
  * - If src is a matrix, routine will copy the upper left tile of the matrix
  * - If dst is a vector, routine assumes a row-major rows by cols copy
  * - If dst is a matrix, routine will copy into the upper left tile
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - Padding for 2D matrices is not considered in this routine
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
                          const size_t cols, mat3 &cast_buffer,
                          command_queue &cq) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,rows,cols,cq);
  else
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                      cast_buffer,cq);
}
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
/** - If src is a vector, routine assumes a row-major rows by cols copy
  * - If src is a matrix, routine will copy the upper left tile of the matrix
  * - If dst is a vector, routine assumes a row-major rows by cols copy
  * - If dst is a matrix, routine will copy into the upper left tile
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - The copy should handle padding for 2D alignment correctly
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                     const size_t cols, command_queue &cq) {
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer,cq);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer,cq);
    }
  // If we are here, at least one of the matrices must have VECTOR=0
  } else if (mat1::VECTOR) {
    #ifdef UCL_DEBUG
    assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows,cq);
  } else if (mat2::VECTOR) {
    #ifdef UCL_DEBUG
    assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
               cols*sizeof(typename mat1::data_type),rows,cq);
  } else {
    #ifdef UCL_DEBUG
    assert(src.rows()>=rows && src.cols()>=cols);
    assert(dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows,cq);
  }
}
/// Copy subset of matrix rows,cols (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
  * - If src is a vector, routine assumes a row-major rows by cols copy
  * - If src is a matrix, routine will copy the upper left tile of the matrix
  * - If dst is a vector, routine assumes a row-major rows by cols copy
  * - If dst is a matrix, routine will copy into the upper left tile
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - The copy should handle padding for 2D alignment correctly
  * - Copy from vector to matrix and vice versa is allowed
  * - The default stream is used for asynchronous copy
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                     const size_t cols, const bool async) {
  if (async)
    ucl_copy(dst,src,rows,cols,dst.cq());
  else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    }
  // If we are here, at least one of the matrices must have VECTOR=0
  } else if (mat1::VECTOR) {
    #ifdef UCL_DEBUG
    assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(mat2::VECTOR==0);
    #endif
    ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows);
  } else if (mat2::VECTOR) {
    #ifdef UCL_DEBUG
    assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(mat1::VECTOR==0);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
               cols*sizeof(typename mat1::data_type),rows);
  } else {
    #ifdef UCL_DEBUG
    assert(src.rows()>=rows && src.cols()>=cols);
    assert(dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows);
  }
}
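// An example of a 2D tile copy (names hypothetical): the rows/cols
// overloads above use each side's row pitch, so the upper left tile of a
// row-padded device matrix can be gathered into a contiguous host vector
// in one call:
//
//   UCL_D_Mat<float> dev_m;                  // row-padded device matrix
//   UCL_H_Vec<float> host_v;                 // contiguous host storage
//   ucl_copy(host_v,dev_m,rows,cols,false);  // strided reads, packed writes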
// --------------------------------------------------------------------------
// - 1D/2D COPY
// --------------------------------------------------------------------------

/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param async Perform non-blocking copy on default stream
  * \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - The number of bytes copied is determined by the entire src data
  * - Padding for 2D matrices is not considered in this routine
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
                          mat3 &cast_buffer, const bool async) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,async);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,async);
  else if (mat1::PADDED==1)
    ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,async);
  else
    ucl_cast_copy(dst,src,src.numel(),cast_buffer,async);
}
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - The number of bytes copied is determined by the entire src data
  * - Padding for 2D matrices is not considered in this routine
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
                          mat3 &cast_buffer, command_queue &cq) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,cq);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,cq);
  else if (mat1::PADDED==1)
    ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,cq);
  else
    ucl_cast_copy(dst,src,src.numel(),cast_buffer,cq);
}
/// Asynchronous copy of matrix/vector (memory already allocated)
/** - The number of bytes copied is determined by the entire src data
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - The copy should handle padding for 2D alignment correctly
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, command_queue &cq) {
  if (dst.row_bytes()==src.row_bytes() &&
      src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
      (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,src.row_size()*src.rows(),cq);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_copy(dst,src,src.rows(),src.cols(),cq);
  else if (mat1::PADDED==1)
    ucl_copy(dst,src,dst.rows(),dst.cols(),cq);
  else
    ucl_copy(dst,src,src.numel(),cq);
}
/// Copy matrix/vector (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
  * - The number of bytes copied is determined by the entire src data
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - The copy should handle padding for 2D alignment correctly
  * - Copy from vector to matrix and vice versa is allowed
  * - The default stream is used for asynchronous copy
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
  if (async)
    ucl_copy(dst,src,dst.cq());
  else if (dst.row_bytes()==src.row_bytes() &&
           src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
           (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,src.row_size()*src.rows(),async);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_copy(dst,src,src.rows(),src.cols(),async);
  else if (mat1::PADDED==1)
    ucl_copy(dst,src,dst.rows(),dst.cols(),async);
  else
    ucl_copy(dst,src,src.numel(),async);
}

#endif
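// An example of the whole-container form (names hypothetical): the
// traits-based dispatch above selects the 1D or 2D path, so padding is
// handled without caller bookkeeping:
//
//   UCL_D_Mat<float> dev_m;        // PADDED trait routes to the 2D path
//   UCL_H_Vec<float> host_v;       // sized to hold rows()*cols() elements
//   ucl_copy(host_v,dev_m,true);   // async copy in host_v's default queue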
@ -0,0 +1,430 @@
/***************************************************************************
                                ucl_d_mat.h
                             -------------------
                               W. Michael Brown

  Matrix Container on Device

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jun 25 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW

/// 2D Matrix on device (can have extra column storage for correct alignment)
template <class numtyp>
class UCL_D_Mat : public UCL_BaseMat {
 public:
  // Traits for copying data
  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
  enum traits {
    DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
    MEM_TYPE = 0,
    PADDED = 1,
    ROW_MAJOR = 1,
    VECTOR = 0
  };
  typedef numtyp data_type;
  UCL_D_Mat() : _rows(0), _kind(UCL_VIEW) {}
  ~UCL_D_Mat() { if (_kind!=UCL_VIEW) _device_free(*this); }

  /// Construct with specified rows and cols
  /** \sa alloc() **/
  UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
    _rows(0), _kind(UCL_VIEW) { alloc(rows,cols,device,kind); }

  /// Row major matrix on device
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param cq Matrix/vector container from which the default command
    *        queue for operations is copied
    * \note - Coalesced access using adjacent cols on same row
    *         UCL_D_Mat(row,col) given by array[row*row_size()+col]
    * \return UCL_SUCCESS if the memory allocation is successful **/
  template <class mat_type>
  inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _kind=kind;
    _rows=rows;
    _cols=cols;
    int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
    _row_size=_pitch/sizeof(numtyp);
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_row_size*cols;
    #endif
    #ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      std::cerr << "UCL Error: Could not allocate "
                << rows*cols*sizeof(numtyp) << " bytes on device.\n";
      exit(1);
    }
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
    return err;
  }
  /// Row major matrix on device
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param device Used to get the default command queue for operations
    * \note - Coalesced access using adjacent cols on same row
    *         UCL_D_Mat(row,col) given by array[row*row_size()+col]
    * \return UCL_SUCCESS if the memory allocation is successful **/
  inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _kind=kind;
    _rows=rows;
    _cols=cols;
    int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
    _row_size=_pitch/sizeof(numtyp);
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_row_size*cols;
    #endif
    #ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      std::cerr << "UCL Error: Could not allocate "
                << rows*cols*sizeof(numtyp) << " bytes on device.\n";
      exit(1);
    }
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
    return err;
  }

  /// Return the type of memory allocation
  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
  inline enum UCL_MEMOPT kind() const { return _kind; }
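  // An allocation sketch (names hypothetical): rows are pitched for
  // coalescing, so element (i,j) lives at array[i*row_size()+j] with
  // row_size()>=cols; checking the return code matters when UCL_NO_EXIT
  // is defined:
  //
  //   UCL_D_Mat<float> dev_m;
  //   if (dev_m.alloc(rows,cols,device)!=UCL_SUCCESS)
  //     return;        // allocation failed
  //   dev_m.zero();    // clear the full pitched allocation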
  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,
                   const size_t stride) {
    clear();
    _kind=UCL_VIEW;
    _rows=rows;
    _cols=cols;
    _pitch=stride*sizeof(numtyp);
    _row_size=stride;
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    _offset=0;
    _array=input.cbegin();
    #else
    _device_view(&_array,input.begin());
    #endif

    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols)
    { view(input,rows,cols,input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t cols)
    { view(input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view(ucl_type &input)
    { view(input,input.rows(),input.cols()); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   const size_t stride, UCL_Device &dev) {
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _rows=rows;
    _pitch=stride*sizeof(numtyp);
    _row_size=stride;
    this->_cq=dev.cq();
    _array=input;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
  }
  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   UCL_Device &dev) { view(input,rows,cols,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols, const size_t stride) {
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _rows=rows;
    _pitch=stride*sizeof(numtyp);
    _row_size=stride;
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    _array=input.begin();
    _offset=offset;
    #else
    _device_view(&_array,input.begin(),offset,sizeof(numtyp));
    #endif

    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols)
    { view_offset(offset,input,rows,cols,input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
    { view_offset(offset,input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input) {
    if (input.rows()==1)
      view_offset(offset,input,1,input.cols()-offset);
    else
      view_offset(offset,input,input.rows()-offset/input.row_size(),
                  input.cols());
  }
  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols,const size_t stride,
                          UCL_Device &dev) {
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _rows=rows;
    _pitch=stride*sizeof(numtyp);
    _row_size=stride;
    this->_cq=dev.cq();

    #ifdef _OCL_MAT
    _array=input;
    _offset=offset;
    #else
    #ifdef _UCL_DEVICE_PTR_MAT
    _array=input+offset*sizeof(numtyp);
    #else
    _array=input+offset;
    #endif
    #endif

    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols, UCL_Device &dev)
    { view_offset(offset,input,rows,cols,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type input,
                          const size_t cols, UCL_Device &dev)
    { view_offset(offset,input,1,cols,dev); }
/// Free memory and set size to 0
|
||||
inline void clear()
|
||||
{ _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { _device_zero(*this,row_bytes()*_rows); }
|
||||
|
||||
/// Set first n elements to zero
|
||||
inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
|
||||
|
||||
#ifdef _UCL_DEVICE_PTR_MAT
|
||||
/// For OpenCL, returns a (void *) device pointer to memory allocation
|
||||
inline device_ptr & begin() { return _array; }
|
||||
/// For OpenCL, returns a (void *) device pointer to memory allocation
|
||||
inline const device_ptr & begin() const { return _array; }
|
||||
#else
|
||||
/// For CUDA-RT, get device pointer to first element
|
||||
inline numtyp * begin() { return _array; }
|
||||
/// For CUDA-RT, get device pointer to first element
|
||||
inline const numtyp * begin() const { return _array; }
|
||||
/// For CUDA-RT, get device pointer to one past last element
|
||||
inline numtyp * end() { return _end; }
|
||||
/// For CUDA-RT, get device pointer to one past last element
|
||||
inline const numtyp * end() const { return _end; }
|
||||
#endif
|
||||
|
||||
#ifdef _UCL_DEVICE_PTR_MAT
|
||||
/// Returns an API specific device pointer
|
||||
/** - For OpenCL, returns a &cl_mem object
|
||||
* - For CUDA Driver, returns a &CUdeviceptr
|
||||
* - For CUDA-RT, returns void** **/
|
||||
inline device_ptr & cbegin() { return _array; }
|
||||
/// Returns an API specific device pointer
|
||||
/** - For OpenCL, returns a &cl_mem object
|
||||
* - For CUDA Driver, returns a &CUdeviceptr
|
||||
* - For CUDA-RT, returns void** **/
|
||||
inline const device_ptr & cbegin() const { return _array; }
|
||||
#else
|
||||
/// Returns an API specific device pointer
|
||||
/** - For OpenCL, returns a &cl_mem object
|
||||
* - For CUDA Driver, returns a &CUdeviceptr
|
||||
* - For CUDA-RT, returns numtyp** **/
|
||||
inline numtyp ** cbegin() { return &_array; }
|
||||
/// Returns an API specific device pointer
|
||||
/** - For OpenCL, returns a &cl_mem object
|
||||
* - For CUDA Driver, returns a &CUdeviceptr
|
||||
* - For CUDA-RT, returns numtyp** **/
|
||||
inline const numtyp ** cbegin() const { return &_array; }
|
||||
#endif
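
  // Illustrative sketch: in a CUDA-RT build (no _UCL_DEVICE_PTR_MAT),
  // begin() is a raw numtyp*, so CUDA runtime calls can use it directly,
  // while cbegin() yields the address-of-pointer form used when setting
  // kernel arguments. `m` and `dev` are assumed names.
  //
  //   UCL_D_Mat<float> m(4,8,dev);
  //   cudaMemset(m.begin(),0,m.row_bytes()*m.rows());   // same as m.zero()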

  /// Get the number of elements
  inline size_t numel() const { return _cols*_rows; }
  /// Get the number of rows
  inline size_t rows() const { return _rows; }
  /// Get the number of columns
  inline size_t cols() const { return _cols; }
  /// Get the size of a row (including any padding) in elements
  inline size_t row_size() const { return _row_size; }
  /// Get the size of a row (including any padding) in bytes
  inline size_t row_bytes() const { return _pitch; }
  /// Get the size in bytes of 1 element
  inline int element_size() const { return sizeof(numtyp); }

#ifdef _OCL_MAT
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return _offset; }
#else
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return 0; }
#endif

  /// Return the offset (in bytes) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t byteoff() const { return offset()*sizeof(numtyp); }

 private:
  size_t _pitch, _row_size, _rows, _cols;
  enum UCL_MEMOPT _kind;

#ifdef _UCL_DEVICE_PTR_MAT
  device_ptr _array;
#else
  numtyp *_array,*_end;
#endif

#ifdef _OCL_MAT
  size_t _offset;
#endif
};

#endif

@@ -0,0 +1,442 @@

/***************************************************************************
                                 ucl_d_vec.h
                             -------------------
                               W. Michael Brown

  Vector Container on Device

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jun 25 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW

/// Row vector on device
template <class numtyp>
class UCL_D_Vec : public UCL_BaseMat {
 public:
  // Traits for copying data
  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
  enum traits {
    DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
    MEM_TYPE = 0,
    PADDED = 0,
    ROW_MAJOR = 1,
    VECTOR = 1
  };
  typedef numtyp data_type;

  UCL_D_Vec() : _cols(0), _kind(UCL_VIEW) {}
  ~UCL_D_Vec() { if (_kind!=UCL_VIEW) _device_free(*this); }

  /// Construct with n columns
  /** \sa alloc() **/
  UCL_D_Vec(const size_t n, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
    _cols(0), _kind(UCL_VIEW) { alloc(n,device,kind); }

  /// Set up device vector with 'cols' columns and reserve memory
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY - Specify that you will only read in kernels
    * \param cq Default command queue for operations copied from another mat
    * \return UCL_SUCCESS if the memory allocation is successful **/
  template <class mat_type>
  inline int alloc(const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _kind=kind;
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    int err=_device_alloc(*this,cq,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      exit(1);
    }
#endif
#ifdef _OCL_MAT
    _offset=0;
#endif
    return err;
  }

  /// Set up device vector with 'cols' columns and reserve memory
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY - Specify that you will only read in kernels
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
  inline int alloc(const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _kind=kind;
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    int err=_device_alloc(*this,device,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      exit(1);
    }
#endif
#ifdef _OCL_MAT
    _offset=0;
#endif
    return err;
  }
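
  // Illustrative sketch: allocate a 1000-element device vector that kernels
  // will only read; `dev` is an assumed UCL_Device. Checking the return code
  // only matters when UCL_NO_EXIT is defined (otherwise a failure exits).
  //
  //   UCL_D_Vec<double> v;
  //   int err=v.alloc(1000,dev,UCL_READ_ONLY);
  //   if (err!=UCL_SUCCESS) { /* recover or abort */ }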

  /// Return the type of memory allocation
  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
  inline enum UCL_MEMOPT kind() const { return _kind; }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
    assert(rows==1);
#endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
#ifdef _OCL_MAT
    _offset=0;
    _array=input.cbegin();
#else
    _device_view(&_array,input.begin());
#endif

#ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
#endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * \param stride Number of _elements_ between the start of each row
    *   (ignored; a vector view is always contiguous) **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,
                   const size_t stride) { view(input,rows,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t cols)
    { view(input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view(ucl_type &input)
    { view(input,input.rows()*input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   UCL_Device &dev) {
#ifdef UCL_DEBUG
    assert(rows==1);
#endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=dev.cq();
    _array=input;
#ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
#endif
#ifdef _OCL_MAT
    _offset=0;
#endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * \param stride Number of _elements_ between the start of each row
    *   (ignored; a vector view is always contiguous) **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   const size_t stride, UCL_Device &dev)
    { view(input,rows,cols,dev); }  // stride ignored for vector views

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }
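
  // Illustrative sketch (CUDA-RT build): wrap a raw device pointer obtained
  // from cudaMalloc without taking ownership; `dev` is an assumed UCL_Device
  // and d_buf must still be released separately with cudaFree.
  //
  //   float *d_buf;
  //   cudaMalloc((void **)&d_buf,256*sizeof(float));
  //   UCL_D_Vec<float> v;
  //   v.view(d_buf,256,dev);   // v's destructor frees nothing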

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols) {
#ifdef UCL_DEBUG
    assert(rows==1);
#endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
#ifdef _OCL_MAT
    _array=input.begin();
    _offset=offset;
#else
    _device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif

#ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
#endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * \param stride Number of _elements_ between the start of each row
    *   (ignored; a vector view is always contiguous) **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols, const size_t stride)
    { view_offset(offset,input,rows,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
    { view_offset(offset,input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input)
    { view_offset(offset,input,input.rows()*input.row_size()-offset); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols, UCL_Device &dev) {
#ifdef UCL_DEBUG
    assert(rows==1);
#endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=dev.cq();

#ifdef _OCL_MAT
    _array=input;
    _offset=offset;
#else
#ifdef _UCL_DEVICE_PTR_MAT
    _array=input+offset*sizeof(numtyp);
#else
    _array=input+offset;
#endif
#endif

#ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
#endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * \param stride Number of _elements_ between the start of each row
    *   (ignored; a vector view is always contiguous) **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols,const size_t stride,UCL_Device &dev)
    { view_offset(offset,input,rows,cols,dev); }  // stride ignored for vectors

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type input,
                          const size_t cols, UCL_Device &dev)
    { view_offset(offset,input,1,cols,dev); }

  /// Free memory and set size to 0
  inline void clear()
    { if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }

  /// Set each element to zero
  inline void zero() { _device_zero(*this,row_bytes()); }

  /// Set first n elements to zero
  inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }

#ifdef _UCL_DEVICE_PTR_MAT
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline device_ptr & begin() { return _array; }
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline const device_ptr & begin() const { return _array; }
#else
  /// For CUDA-RT, get device pointer to first element
  inline numtyp * begin() { return _array; }
  /// For CUDA-RT, get device pointer to first element
  inline const numtyp * begin() const { return _array; }
  /// For CUDA-RT, get device pointer to one past last element
  inline numtyp * end() { return _end; }
  /// For CUDA-RT, get device pointer to one past last element
  inline const numtyp * end() const { return _end; }
#endif

#ifdef _UCL_DEVICE_PTR_MAT
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline device_ptr & cbegin() { return _array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline const device_ptr & cbegin() const { return _array; }
#else
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp** **/
  inline numtyp ** cbegin() { return &_array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp** **/
  inline const numtyp ** cbegin() const { return &_array; }
  /// For CUDA-RT, allocate row vector and bind texture
  inline void safe_alloc(const size_t cols, UCL_Device &dev,
                         textureReference *t)
    { alloc(cols,dev); assign_texture(t); bind(); }
  /// For CUDA-RT, assign a texture to the vector
  inline void assign_texture(textureReference *t) { _tex_ptr=t; }
  /// For CUDA-RT, bind to texture
  inline void bind() {
    cuda_gb_get_channel<numtyp>(_channel);
    (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
    (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
    (*_tex_ptr).filterMode = cudaFilterModePoint;
    (*_tex_ptr).normalized = false;
    CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,_array,&_channel));
  }
  /// For CUDA-RT, unbind texture
  inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
#endif
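
  // Illustrative sketch (CUDA-RT build): bind an allocated vector to a
  // file-scope texture reference so kernels can fetch through the texture
  // cache; `pos_tex` is an assumed name, and texture<T> derives from
  // textureReference in the CUDA runtime API.
  //
  //   texture<float> pos_tex;
  //   UCL_D_Vec<float> pos;
  //   pos.safe_alloc(n,dev,&pos_tex);   // alloc + assign_texture + bind
  //   // ... launch kernels that read via tex1Dfetch(pos_tex,i) ...
  //   pos.unbind();                     // before freeing or rebinding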

  /// Get the number of elements
  inline size_t numel() const { return _cols; }
  /// Get the number of rows
  inline size_t rows() const { return 1; }
  /// Get the number of columns
  inline size_t cols() const { return _cols; }
  /// Get the size of a row (including any padding) in elements
  inline size_t row_size() const { return _cols; }
  /// Get the size of a row (including any padding) in bytes
  inline size_t row_bytes() const { return _row_bytes; }
  /// Get the size in bytes of 1 element
  inline int element_size() const { return sizeof(numtyp); }

#ifdef _OCL_MAT
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return _offset; }
#else
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return 0; }
#endif

  /// Return the offset (in bytes) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t byteoff() const { return offset()*sizeof(numtyp); }

 private:
  size_t _row_bytes, _row_size, _rows, _cols;
  enum UCL_MEMOPT _kind;

#ifdef _UCL_DEVICE_PTR_MAT
  device_ptr _array;
#else
  numtyp *_array,*_end;
  cudaChannelFormatDesc _channel;
  textureReference *_tex_ptr;
#endif

#ifdef _OCL_MAT
  size_t _offset;
#endif
};

#endif

@@ -0,0 +1,48 @@

/***************************************************************************
                              nvc_get_devices.h
                             -------------------
                               W. Michael Brown

  List properties of CUDA devices

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Wed Jan 28 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifdef UCL_OPENCL
#include "ocl_device.h"
using namespace ucl_opencl;
#endif

#ifdef UCL_CUDADR
#include "nvd_device.h"
using namespace ucl_cudadr;
#endif

#ifdef UCL_CUDART
#include "nvc_device.h"
using namespace ucl_cudart;
#endif

int main(int argc, char** argv) {
  UCL_Device cop;
  std::cout << "Found " << cop.num_platforms() << " platform(s).\n";
  if (cop.num_platforms()>0) {
    std::cout << "Using platform: " << cop.platform_name() << std::endl;
    cop.print_all(std::cout);
  }
  return 0;
}
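
// Illustrative build sketch (assumed flags and file layout, following the
// library's nvcc-based Makefiles): compile with exactly one backend macro,
// e.g. for the CUDA runtime backend:
//
//   nvcc -DUCL_CUDART -o nvc_get_devices nvc_get_devices.cu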

@@ -0,0 +1,378 @@

/***************************************************************************
                                 ucl_h_mat.h
                             -------------------
                               W. Michael Brown

  Matrix Container on Host

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jun 25 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW

/// Matrix on Host with options for pinning (page locked)
template <class numtyp>
class UCL_H_Mat : public UCL_BaseMat {
 public:
  // Traits for copying data
  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
  enum traits {
    DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
    MEM_TYPE = 1,
    PADDED = 0,
    ROW_MAJOR = 1,
    VECTOR = 0
  };
  typedef numtyp data_type;

  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { }
  ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }

  /// Construct with specified number of rows and columns
  /** \sa alloc() **/
  UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
    { _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }

  /// Set up host matrix with specified # of rows/cols and reserve memory
  /** The kind parameter controls memory pinning as follows:
    * - UCL_NOT_PINNED - Memory is not pinned
    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
    * - UCL_RW_OPTIMIZED - Memory can be pinned
    * \param cq Default command queue for operations copied from another mat
    * \return UCL_SUCCESS if the memory allocation is successful **/
  template <class mat_type>
  inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
    clear();
    _cols=cols;
    _rows=rows;
    _row_bytes=cols*sizeof(numtyp);
    _kind=kind;
    int err=_host_alloc(*this,cq,_row_bytes*_rows,kind);
#ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
                << " bytes on host.\n";
      exit(1);
    }
#endif
    _end=_array+rows*cols;
    return err;
  }

  /// Set up host matrix with specified # of rows/cols and reserve memory
  /** The kind parameter controls memory pinning as follows:
    * - UCL_NOT_PINNED - Memory is not pinned
    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
    * - UCL_RW_OPTIMIZED - Memory can be pinned
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
  inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
    clear();
    _cols=cols;
    _rows=rows;
    _row_bytes=cols*sizeof(numtyp);
    _kind=kind;
    int err=_host_alloc(*this,device,_row_bytes*_rows,kind);
    _end=_array+rows*cols;
#ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
                << " bytes on host.\n";
      exit(1);
    }
#endif
    return err;
  }
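
  // Illustrative sketch: a pinned staging matrix for host-to-device copies;
  // `dev` is an assumed UCL_Device. UCL_WRITE_OPTIMIZED requests
  // write-combined pinning, so the host should fill the matrix but avoid
  // reading it back (reads from write-combined memory are slow).
  //
  //   UCL_H_Mat<float> staging(128,128,dev,UCL_WRITE_OPTIMIZED);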

  /// Return the type of memory allocation
  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
  inline enum UCL_MEMOPT kind() const { return _kind; }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device container on the host is not supported
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,
                   const size_t stride) {
    assert(rows==1 || stride==cols);
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _rows=rows;
    _row_bytes=stride*sizeof(numtyp);
    this->_cq=input.cq();
    _array=input.begin();
    _end=_array+_cols;
#ifdef _OCL_MAT
    _carray=input.cbegin();
#endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols)
    { view(input,rows,cols,input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t cols)
    { view(input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view(ucl_type &input)
    { view(input,input.rows(),input.cols()); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view(ptr_type *input, const size_t rows, const size_t cols,
                   const size_t stride, UCL_Device &dev) {
    assert(rows==1 || stride==cols);
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _rows=rows;
    _row_bytes=stride*sizeof(numtyp);
    this->_cq=dev.cq();
    _array=input;
    _end=_array+_cols;

#ifdef _OCL_MAT
    _host_alloc(*this,dev,_row_bytes,UCL_VIEW);
#endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported **/
  template <class ptr_type>
  inline void view(ptr_type *input, const size_t rows, const size_t cols,
                   UCL_Device &dev) { view(input,rows,cols,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported **/
  template <class ptr_type>
  inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device container on the host is not supported
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols, const size_t stride) {
    assert(rows==1 || stride==cols);
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _rows=rows;
    _row_bytes=stride*sizeof(numtyp);
    this->_cq=input.cq();
    _array=input.begin()+offset;
    _end=_array+_cols;
#ifdef _OCL_MAT
    _host_alloc(*this,input,_row_bytes,UCL_VIEW);
#endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols)
    { view_offset(offset,input,rows,cols,input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
    { view_offset(offset,input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input) {
    if (input.rows()==1)
      view_offset(offset,input,1,input.cols()-offset);
    else
      view_offset(offset,input,input.rows()-offset/input.row_size(),
                  input.cols());
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
                          const size_t cols, UCL_Device &dev)
    { view(input+offset,rows,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
                          const size_t cols,const size_t stride,UCL_Device &dev)
    { view(input+offset,rows,cols,stride,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type *input,
                          const size_t cols, UCL_Device &dev)
    { view(input+offset,1,cols,dev); }

  /// Free memory and set size to 0
  inline void clear() {
    // Free with the original kind before marking the container as a view
    if (_kind!=UCL_VIEW) { _rows=0; _host_free(*this,_kind); _kind=UCL_VIEW; }
  }

  /// Set each element to zero
  inline void zero() { _host_zero(_array,_rows*row_bytes()); }
  /// Set first n elements to zero
  inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }

  /// Get host pointer to first element
  inline numtyp * begin() { return _array; }
  /// Get host pointer to first element
  inline const numtyp * begin() const { return _array; }
  /// Get host pointer to one past last element
  inline numtyp * end() { return _end; }
  /// Get host pointer to one past last element
  inline const numtyp * end() const { return _end; }

  /// Get the number of elements
  inline size_t numel() const { return _rows*_cols; }
  /// Get the number of rows
  inline size_t rows() const { return _rows; }
  /// Get the number of columns
  inline size_t cols() const { return _cols; }
  /// Get the size of a row (including any padding) in elements
  inline size_t row_size() const { return _cols; }
  /// Get the size of a row (including any padding) in bytes
  inline size_t row_bytes() const { return _row_bytes; }
  /// Get the size in bytes of 1 element
  inline int element_size() const { return sizeof(numtyp); }

  /// Get element at index i
  inline numtyp & operator[](const int i) { return _array[i]; }
  /// Get element at index i
  inline const numtyp & operator[](const int i) const { return _array[i]; }
  /// 2D access to element at (row,col)
  inline numtyp & operator()(const int row, const int col)
    { return _array[row*_cols+col]; }
  /// 2D access to element at (row,col)
  inline const numtyp & operator()(const int row, const int col) const
    { return _array[row*_cols+col]; }
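
  // Illustrative sketch: both accessors address the same row-major storage,
  // so m(r,c) and m[r*m.cols()+c] name the same element; `dev` is an
  // assumed UCL_Device.
  //
  //   UCL_H_Mat<double> m(2,3,dev);
  //   m(1,2)=4.5;
  //   double x=m[1*m.cols()+2];   // x==4.5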

  /// Returns pointer to memory pointer for allocation on host
  inline numtyp ** host_ptr() { return &_array; }

  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return 0; }
  /// Return the offset (in bytes) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t byteoff() const { return 0; }

#ifdef _OCL_MAT
  /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
  inline device_ptr & cbegin() { return _carray; }
  /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
  inline const device_ptr & cbegin() const { return _carray; }
#else
  /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
  inline void ** cbegin() { return (void **)&_array; }
  /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
  inline const void ** cbegin() const { return (const void **)&_array; }
#endif

 private:
  enum UCL_MEMOPT _kind;
  numtyp *_array, *_end;
  size_t _row_bytes, _rows, _cols;

#ifdef _OCL_MAT
  device_ptr _carray;
#endif
};

#endif

@@ -0,0 +1,370 @@

/***************************************************************************
                                 ucl_h_vec.h
                             -------------------
                               W. Michael Brown

  Vector Container on Host

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jun 25 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW

/// Row Vector on Host with options for pinning (page locked)
template <class numtyp>
class UCL_H_Vec : public UCL_BaseMat {
 public:
  // Traits for copying data
  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
  enum traits {
    DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
    MEM_TYPE = 1,
    PADDED = 0,
    ROW_MAJOR = 1,
    VECTOR = 1
  };
  typedef numtyp data_type;

  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { }
  ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }

  /// Construct with n columns
  /** \sa alloc() **/
  UCL_H_Vec(const size_t n, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
    { _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }

  /// Set up host vector with 'cols' columns and reserve memory
  /** The kind parameter controls memory pinning as follows:
    * - UCL_NOT_PINNED - Memory is not pinned
    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
    * - UCL_RW_OPTIMIZED - Memory can be pinned
    * \param cq Default command queue for operations copied from another mat
    * \return UCL_SUCCESS if the memory allocation is successful **/
  template <class mat_type>
  inline int alloc(const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
    clear();
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    _kind=kind;
    int err=_host_alloc(*this,cq,_row_bytes,kind);
    _end=_array+cols;
#ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on host.\n";
      exit(1);
    }
#endif
    return err;
  }

  /// Set up host vector with 'cols' columns and reserve memory
  /** The kind parameter controls memory pinning as follows:
    * - UCL_NOT_PINNED - Memory is not pinned
    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
    * - UCL_RW_OPTIMIZED - Memory can be pinned
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
  inline int alloc(const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
    clear();
    _cols=cols;
    _row_bytes=cols*sizeof(numtyp);
    _kind=kind;
    int err=_host_alloc(*this,device,_row_bytes,kind);
    _end=_array+cols;
#ifndef UCL_NO_EXIT
    if (err!=UCL_SUCCESS) {
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on host.\n";
      exit(1);
    }
#endif
    return err;
  }

  /// Return the type of memory allocation
  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
  inline enum UCL_MEMOPT kind() const { return _kind; }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
    assert(rows==1);
#endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    _array=input.begin();
    _end=_array+_cols;
#ifdef _OCL_MAT
    _carray=input.cbegin();
#endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device container on the host is not supported
    * \param stride Number of _elements_ between the start of each row
    *   (ignored; a vector view is always contiguous) **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,
                   const size_t stride) { view(input,rows,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t cols)
    { view(input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view(ucl_type &input)
    { view(input,input.rows()*input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported **/
  template <class ptr_type>
  inline void view(ptr_type *input, const size_t rows, const size_t cols,
                   UCL_Device &dev) {
#ifdef UCL_DEBUG
    assert(rows==1);
#endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=dev.cq();
    _array=input;
    _end=_array+_cols;

#ifdef _OCL_MAT
    _host_alloc(*this,dev,_row_bytes,UCL_VIEW);
#endif
  }
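
  // Illustrative sketch: borrow the contents of a std::vector without a
  // copy; the view must not outlive the vector (or any reallocation of it),
  // and `dev` is an assumed UCL_Device.
  //
  //   std::vector<float> host_data(128,1.0f);
  //   UCL_H_Vec<float> hv;
  //   hv.view(&host_data[0],128,dev);   // borrows, never frees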

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported
    * \param stride Number of _elements_ between the start of each row
    *   (ignored; a vector view is always contiguous) **/
  template <class ptr_type>
  inline void view(ptr_type *input, const size_t rows, const size_t cols,
                   const size_t stride, UCL_Device &dev)
    { view(input,rows,cols,dev); }  // stride ignored for vector views

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported **/
  template <class ptr_type>
  inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols) {
#ifdef UCL_DEBUG
    assert(rows==1);
#endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    _array=input.begin()+offset;
    _end=_array+_cols;
#ifdef _OCL_MAT
    _host_alloc(*this,input,_row_bytes,UCL_VIEW);
#endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device container on the host is not supported
    * \param stride Number of _elements_ between the start of each row
    *   (ignored; a vector view is always contiguous) **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols, const size_t stride)
    { view_offset(offset,input,rows,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
    { view_offset(offset,input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view
    * - Viewing a device container on the host is not supported **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input)
    { view_offset(offset,input,input.rows()*input.row_size()-offset); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
                          const size_t cols, UCL_Device &dev)
    { view(input+offset,rows,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
                          const size_t cols,const size_t stride,UCL_Device &dev)
    { view(input+offset,rows,cols,stride,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container
    * - Viewing a device pointer on the host is not supported **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type *input,
                          const size_t cols, UCL_Device &dev)
    { view(input+offset,1,cols,dev); }

  /// Free memory and set size to 0
  inline void clear() {
    // Free with the original kind before marking the container as a view
    if (_kind!=UCL_VIEW) { _cols=0; _host_free(*this,_kind); _kind=UCL_VIEW; }
  }

  /// Set each element to zero
  inline void zero() { _host_zero(_array,row_bytes()); }

  /// Set first n elements to zero
  inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }

  /// Get host pointer to first element
  inline numtyp * begin() { return _array; }
  /// Get host pointer to first element
  inline const numtyp * begin() const { return _array; }
  /// Get host pointer to one past last element
  inline numtyp * end() { return _end; }
  /// Get host pointer to one past last element
  inline const numtyp * end() const { return _end; }

  /// Get the number of elements
  inline size_t numel() const { return _cols; }
  /// Get the number of rows
  inline size_t rows() const { return 1; }
  /// Get the number of columns
  inline size_t cols() const { return _cols; }
  /// Get the size of a row (including any padding) in elements
  inline size_t row_size() const { return _cols; }
  /// Get the size of a row (including any padding) in bytes
  inline size_t row_bytes() const { return _row_bytes; }
  /// Get the size in bytes of 1 element
  inline int element_size() const { return sizeof(numtyp); }

  /// Get element at index i
  inline numtyp & operator[](const int i) { return _array[i]; }
  /// Get element at index i
  inline const numtyp & operator[](const int i) const { return _array[i]; }
  /// 2D access (row should always be 0)
  inline numtyp & operator()(const int row, const int col)
    { return _array[col]; }
  /// 2D access (row should always be 0)
  inline const numtyp & operator()(const int row, const int col) const
    { return _array[col]; }

  /// Returns pointer to memory pointer for allocation on host
  inline numtyp ** host_ptr() { return &_array; }

  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return 0; }
  /// Return the offset (in bytes) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t byteoff() const { return 0; }

#ifdef _OCL_MAT
  /// For OpenCL, returns a reference to the cl_mem object
  inline device_ptr & cbegin() { return _carray; }
  /// For OpenCL, returns a reference to the cl_mem object
  inline const device_ptr & cbegin() const { return _carray; }
#endif

 private:
  enum UCL_MEMOPT _kind;
  numtyp *_array, *_end;
  size_t _row_bytes, _cols;

#ifdef _OCL_MAT
  device_ptr _carray;
#endif
};

#endif
@ -0,0 +1,42 @@
/***************************************************************************
                               ucl_nv_kernel.h
                             -------------------
                               W. Michael Brown

  Preprocessor macros for OpenCL/CUDA compatibility

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Mon May 3 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifndef UCL_NV_KERNEL_H
#define UCL_NV_KERNEL_H

#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y)
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define mul24 __mul24
#define __global
#define __inline static __inline__ __device__

#endif
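
// Illustrative sketch (an assumed example, not part of the original header):
// with the macros above, a kernel body written in OpenCL style compiles as
// CUDA without change, e.g.
//
//   __kernel void scale(__global float *v, const float a, const int n) {
//     int i=GLOBAL_ID_X;
//     if (i<n)
//       v[i]*=a;
//   }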
@ -0,0 +1,273 @@
/***************************************************************************
                                 ucl_print.h
                             -------------------
                               W. Michael Brown

  Routines for printing debugging output for matrix/vector data

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Mon Jan 11 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_PRINT_ALLOW

template <int mem> struct _ucl_print;
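// The specialization for mem==1 (host containers) prints elements in place;
// the generic template below stages device data through a temporary host
// vector first.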
template <> struct _ucl_print<1> {
  template <class mat_type>
  static inline void p(mat_type &mat, const size_t n, std::ostream &out,
                       const std::string delim) {
    for (size_t i=0; i<n-1; i++)
      out << mat[i] << delim;
    out << mat[n-1];
  }
  template <class mat_type>
  static inline void p(const mat_type &mat, const size_t n, std::ostream &out,
                       const std::string delim, UCL_Device &dev) {
    p(mat,n,out,delim);
  }
  template <class mat_type>
  static inline void p(mat_type &mat, const size_t rows, const size_t cols,
                       std::ostream &out, const std::string delim,
                       const std::string row_delim) {
    int offset=0;
    int row_size=cols;
    if (mat_type::VECTOR==0)
      row_size=mat.row_size();
    for (size_t j=0; j<rows; j++) {
      size_t lend=offset+cols-1;
      for (size_t i=offset; i<lend; i++)
        out << mat[i] << delim;
      out << mat[lend];
      if (j!=rows-1)
        out << row_delim;
      offset+=row_size;
    }
  }
  template <class mat_type>
  static inline void p(const mat_type &mat,const size_t rows,const size_t cols,
                       std::ostream &out,const std::string delim,
                       const std::string row_delim, UCL_Device &dev) {
    p(mat,rows,cols,out,delim,row_delim);
  }
};

template <int mem> struct _ucl_print {
  template <class mat_type>
  static inline void p(mat_type &mat, const size_t n, std::ostream &out,
                       const std::string delim) {
    UCL_H_Vec<typename mat_type::data_type> temp;
    temp.alloc(n,mat);
    ucl_copy(temp,mat,n,false);
    _ucl_print<1>::p(temp,n,out,delim);
  }
  template <class mat_type>
  static inline void p(const mat_type &mat, const size_t n, std::ostream &out,
                       const std::string delim, UCL_Device &dev) {
    UCL_H_Vec<typename mat_type::data_type> temp;
    temp.alloc(n,dev);
    ucl_copy(temp,mat,n,false);
    _ucl_print<1>::p(temp,n,out,delim);
  }
  template <class mat_type>
  static inline void p(mat_type &mat, const size_t rows, const size_t cols,
                       std::ostream &out, const std::string delim,
                       const std::string row_delim) {
    UCL_H_Vec<typename mat_type::data_type> temp;
    temp.alloc(mat.rows()*mat.cols(),mat);
    if (mat_type::VECTOR==1)
      ucl_copy(temp,mat,rows*cols,false);
    else
      ucl_copy(temp,mat,rows,cols,false);
    _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
  }
  template <class mat_type>
  static inline void p(const mat_type &mat, const size_t rows,
                       const size_t cols,std::ostream &out,
                       const std::string delim,
                       const std::string row_delim, UCL_Device &dev) {
    UCL_H_Vec<typename mat_type::data_type> temp;
    temp.alloc(mat.rows()*mat.cols(),dev);
    if (mat_type::VECTOR==1)
      ucl_copy(temp,mat,rows*cols,false);
    else
      ucl_copy(temp,mat,rows,cols,false);
    _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
  }
};

// -------------------------------------------------------------------------
// - Non-const routines that do not require a device object
// -------------------------------------------------------------------------

/// Outputs n elements of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out,
                      const std::string delim) {
  if (n>mat.numel()) {
    std::cerr << "Attempted to ucl_print " << n << " elements of matrix "
              << "that only has " << mat.numel() << " elements.";
    exit(1);
  }
  _ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim);
}

/// Outputs n elements of mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) {
  ucl_print(mat,n,out," ");
}

/// Outputs n elements of mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n) {
  ucl_print(mat,n,std::cout," ");
}

/// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
                      std::ostream &out, const std::string delim,
                      const std::string row_delim) {
  if (rows*cols>mat.numel()) {
    std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
              << "that only has " << mat.numel() << " elements.";
    exit(1);
  }
  _ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim);
}

/// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
                      std::ostream &out) {
  ucl_print(mat,rows,cols,out," ","\n");
}

/// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows,
                      const size_t cols) {
  ucl_print(mat,rows,cols,std::cout," ","\n");
}

/// Outputs mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(mat_type &mat) {
  ucl_print(mat,std::cout);
}

/// Outputs mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, std::ostream &out) {
  if (mat_type::VECTOR==1)
    ucl_print(mat,mat.cols(),out," ");
  else
    ucl_print(mat,mat.rows(),mat.cols(),out," ","\n");
}

// -------------------------------------------------------------------------
// - Const routines that require a device object
// -------------------------------------------------------------------------

/// Outputs n elements of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
                      const std::string delim, UCL_Device &dev) {
  if (n>mat.numel()) {
    std::cerr << "Attempted to ucl_print " << n << " elements of matrix "
              << "that only has " << mat.numel() << " elements.";
    exit(1);
  }
  _ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim,dev);
}

/// Outputs n elements of mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
                      UCL_Device &dev) {
  ucl_print(mat,n,out," ",dev);
}

/// Outputs n elements of mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n,
                      UCL_Device &dev) {
  ucl_print(mat,n,std::cout," ",dev);
}

/// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
                      std::ostream &out, const std::string delim,
                      const std::string row_delim, UCL_Device &dev) {
  if (rows*cols>mat.numel()) {
    std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
              << "that only has " << mat.numel() << " elements.";
    exit(1);
  }
  _ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim,dev);
}

/// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
                      std::ostream &out, UCL_Device &dev) {
  ucl_print(mat,rows,cols,out," ","\n",dev);
}

/// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t rows,
                      const size_t cols, UCL_Device &dev) {
  ucl_print(mat,rows,cols,std::cout," ","\n",dev);
}

/// Outputs mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(const mat_type &mat, UCL_Device &dev) {
  ucl_print(mat,std::cout,dev);
}

/// Outputs mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat, std::ostream &out, UCL_Device &dev) {
  if (mat_type::VECTOR==1)
    ucl_print(mat,mat.cols(),out," ",dev);
  else
    ucl_print(mat,mat.rows(),mat.cols(),out," ","\n",dev);
}

// -------------------------------------------------------------------------
// - Operator << Overloading
// -------------------------------------------------------------------------

template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Vec<numtyp> &mat)
  { ucl_print(mat,out); return out; }

template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Mat<numtyp> &mat)
  { ucl_print(mat,out); return out; }

template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Vec<numtyp> &mat)
  { ucl_print(mat,out); return out; }

template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
  { ucl_print(mat,out); return out; }

#endif
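
// Usage sketch (illustrative; dev_v and n are assumed to exist): device
// containers are staged through a host copy automatically, so
//
//   UCL_D_Vec<float> dev_v;            // allocated and filled elsewhere
//   ucl_print(dev_v,n,std::cout,",");  // n comma-delimited elements
//   std::cout << dev_v << std::endl;   // operator<< prints the whole vector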
@ -0,0 +1,121 @@
/***************************************************************************
                                 ucl_types.h
                             -------------------
                               W. Michael Brown

  Data type definitions for Coprocessor library

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Mon Jan 4 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef UCL_TYPES_H
#define UCL_TYPES_H

// Assign an integer id based on the data type: (int, float, double, etc)
template <class eltype> struct _UCL_DATA_ID;
template <> struct _UCL_DATA_ID<double> {
  enum { id=1 };
  static inline const char * name() { return "double"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=double"; }
};
template <> struct _UCL_DATA_ID<float> {
  enum { id=2 };
  static inline const char * name() { return "float"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=float"; }
};
template <> struct _UCL_DATA_ID<unsigned> {
  enum { id=3 };
  static inline const char * name() { return "unsigned"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; }
};
template <> struct _UCL_DATA_ID<int> {
  enum { id=4 };
  static inline const char * name() { return "int"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=int"; }
};
template <> struct _UCL_DATA_ID<char> {
  enum { id=5 };
  static inline const char * name() { return "char"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=char"; }
};
template <> struct _UCL_DATA_ID<unsigned char> {
  enum { id=6 };
  static inline const char * name() { return "unsigned char"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; }
};
template <> struct _UCL_DATA_ID<short> {
  enum { id=7 };
  static inline const char * name() { return "short"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=short"; }
};
template <> struct _UCL_DATA_ID<unsigned short> {
  enum { id=8 };
  static inline const char * name() { return "unsigned short"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; }
};
template <> struct _UCL_DATA_ID<long> {
  enum { id=9 };
  static inline const char * name() { return "long"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=long"; }
};
template <> struct _UCL_DATA_ID<unsigned long> {
  enum { id=10 };
  static inline const char * name() { return "unsigned long"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; }
};
template <> struct _UCL_DATA_ID<long double> {
  enum { id=11 };
  static inline const char * name() { return "long double"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; }
};
template <class eltype> struct _UCL_DATA_ID {
  enum { id=0 };
  static inline const char * name() { return "error_type"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; }
};

// Host memory allocation types
enum UCL_MEMOPT {
  UCL_WRITE_ONLY,     ///< Allow any optimizations for memory that is write only
  UCL_READ_ONLY,      ///< Allow any optimizations for memory that is read only
  UCL_READ_WRITE,     ///< Allow read and write
  UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined)
  UCL_RW_OPTIMIZED,   ///< Allow host memory to be pinned
  UCL_NOT_PINNED,     ///< Host memory is not to be pinned
  UCL_VIEW            ///< View of another memory allocation
};
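
// Example (illustrative): an option above is supplied when a container is
// allocated, e.g. a pinned, write-combined host staging buffer, assuming an
// existing UCL_Device named device:
//
//   UCL_H_Vec<float> host_buf(1000,device,UCL_WRITE_OPTIMIZED);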

enum UCL_DEVICE_TYPE {
  UCL_DEFAULT,     ///< Unknown device type
  UCL_CPU,         ///< Device is a CPU
  UCL_GPU,         ///< Device is a GPU
  UCL_ACCELERATOR  ///< Device is an Accelerator
};

enum UCL_ERROR_FLAG {
  UCL_SUCCESS,            ///< No error
  UCL_ERROR,              ///< Unqualified error
  UCL_FILE_NOT_FOUND,     ///< File not found
  UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found
  UCL_COMPILE_ERROR,      ///< Error compiling kernel
  UCL_MEMORY_ERROR        ///< Error allocating or accessing memory
};

template <class numtyp>
const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }

#endif
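
// Usage sketch (illustrative): the traits above let host code key run-time
// kernel compilation off a template type, e.g.
//
//   const char *tname = ucl_template_name<float>();         // "float"
//   const char *flag  = _UCL_DATA_ID<float>::numtyp_flag(); // "-D NUMTYP=float"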

@ -0,0 +1,123 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include <iostream>
#include <cassert>
#include <math.h>

#include "lj96_cut_gpu_memory.h"

using namespace std;

static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen) {
  LJ96MF.clear();
  gpu_mode=LJ96MF.device->gpu_mode();
  double gpu_split=LJ96MF.device->particle_split();
  int first_gpu=LJ96MF.device->first_device();
  int last_gpu=LJ96MF.device->last_device();
  int world_me=LJ96MF.device->world_me();
  int gpu_rank=LJ96MF.device->gpu_rank();
  int procs_per_gpu=LJ96MF.device->procs_per_gpu();

  LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu);

  bool message=false;
  if (world_me==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }
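
  // Process 0 initializes and compiles first; the loop below then brings up
  // the remaining processes one at a time per GPU, behind barriers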
  if (world_me==0) {
    bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                             host_lj4, offset, special_lj, inum, nall, 300,
                             maxspecial, cell_size, gpu_split, screen);
    if (!init_ok)
      return false;
  }

  MPI_Barrier(MPI_COMM_WORLD);
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                               host_lj4, offset, special_lj, inum,
                               nall, 300, maxspecial, cell_size, gpu_split,
                               screen);
      if (!init_ok)
        return false;
    }
    MPI_Barrier(LJ96MF.device->gpu_comm);
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;
}

void lj96_gpu_clear() {
  LJ96MF.clear();
}

int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *boxlo, double *boxhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         const double cpu_time, bool &success) {
  return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, cpu_time, success);
}

void lj96_gpu_compute(const int timestep, const int ago, const int inum_full,
                      const int nall, double **host_x, int *host_type,
                      int *ilist, int *numj, int **firstneigh,
                      const bool eflag, const bool vflag, const bool eatom,
                      const bool vatom, int &host_start, const double cpu_time,
                      bool &success) {
  LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

double lj96_gpu_bytes() {
  return LJ96MF.host_memory_usage();
}
@ -0,0 +1,281 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef LJ96_GPU_KERNEL
#define LJ96_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif

#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif

#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
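
// Note: precision is selected at compile time; -D_DOUBLE_DOUBLE computes and
// accumulates in double, -D_SINGLE_DOUBLE computes in single but accumulates
// in double, and with neither flag the all-float defaults above apply.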

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;

#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
  return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
  return tex1Dfetch(pos_tex, i);
}
#endif

#else

#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline

#define fetch_pos(i,y) x_[i]

#endif

__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;
  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {

      int j=*nbor;
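      // Neighbor indices >= nall flag special (1-2/1-3/1-4) bonded pairs:
      // j/nall selects the special_lj scale factor and j%nall recovers the
      // actual neighbor index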
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (r2inv<lj1[mtype].z) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp r3inv = sqrt(r6inv);
        numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
        force*=factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
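  // Threads in the block cooperatively stage the per-type coefficients and
  // special-bond factors into local (shared) memory before computing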
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (ii<4)
    sp_lj[ii]=sp_lj_in[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
    if (eflag>0)
      lj3[ii]=lj3_in[ii];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<lj1[mtype].z) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp r3inv = sqrt(r6inv);
        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}

#endif
@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifdef USE_OPENCL
#include "lj96_cut_gpu_cl.h"
#else
#include "lj96_cut_gpu_ptx.h"
#endif

#include "lj96_cut_gpu_memory.h"
#include <cassert>
#define LJ96_GPU_MemoryT LJ96_GPU_Memory<numtyp, acctyp>

extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::LJ96_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}

template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::~LJ96_GPU_Memory() {
  clear();
}

template <class numtyp, class acctyp>
int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
bool LJ96_GPU_MemoryT::init(const int ntypes,
                            double **host_cutsq, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,
                            double *host_special_lj, const int nlocal,
                            const int nall, const int max_nbors,
                            const int maxspecial, const double cell_size,
                            const double gpu_split, FILE *_screen) {
  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                    _screen,lj96_cut_gpu_kernel);

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                               UCL_WRITE_OPTIMIZED);

  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                         host_cutsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                         host_offset);

  UCL_H_Vec<double> dview;
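  // Wrap the host special_lj array in a zero-copy view so it can be uploaded
  // without an extra staging allocation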
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
  dview.view(host_special_lj,4,*(this->ucl_device));
  ucl_copy(sp_lj,dview,false);

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
  return true;
}

template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  lj1.clear();
  lj3.clear();
  sp_lj.clear();
  this->clear_atomic();
}

template <class numtyp, class acctyp>
double LJ96_GPU_MemoryT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(LJ96_GPU_Memory<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag, vflag;
  if (_eflag)
    eflag=1;
  else
    eflag=0;

  if (_vflag)
    vflag=1;
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));

  int ainum=this->atom->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                          &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch);
  }
  this->time_pair.stop();
}

template class LJ96_GPU_Memory<PRECISION,ACC_PRECISION>;
@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef LJ96_GPU_MEMORY_H
#define LJ96_GPU_MEMORY_H

#include "atomic_gpu_memory.h"

template <class numtyp, class acctyp>
class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
 public:
  LJ96_GPU_Memory();
  ~LJ96_GPU_Memory();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
  UCL_D_Vec<numtyp4> lj1;
  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
  UCL_D_Vec<numtyp4> lj3;
  /// Special LJ values
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
};

#endif
@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include <iostream>
#include <cassert>
#include <math.h>

#include "lj_cut_gpu_memory.h"

using namespace std;

static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljl_gpu_init(const int ntypes, double **cutsq,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int inum, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode,
                  FILE *screen) {
  LJLMF.clear();
  gpu_mode=LJLMF.device->gpu_mode();
  double gpu_split=LJLMF.device->particle_split();
  int first_gpu=LJLMF.device->first_device();
  int last_gpu=LJLMF.device->last_device();
  int world_me=LJLMF.device->world_me();
  int gpu_rank=LJLMF.device->gpu_rank();
  int procs_per_gpu=LJLMF.device->procs_per_gpu();

  LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu);

  bool message=false;
  if (world_me==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                            host_lj4, offset, special_lj, inum, nall, 300,
                            maxspecial, cell_size, gpu_split, screen);
    if (!init_ok)
      return false;
  }

  MPI_Barrier(MPI_COMM_WORLD);
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                              host_lj4, offset, special_lj, inum, nall, 300,
                              maxspecial, cell_size, gpu_split,
                              screen);
      if (!init_ok)
        return false;
    }
    MPI_Barrier(LJLMF.device->gpu_comm);
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;
}

void ljl_gpu_clear() {
  LJLMF.clear();
}

int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
                        const int nall, double **host_x, int *host_type,
                        double *boxlo, double *boxhi, int *tag, int **nspecial,
                        int **special, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        const double cpu_time, bool &success) {
  return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, cpu_time, success);
}

void ljl_gpu_compute(const int timestep, const int ago, const int inum_full,
                     const int nall, double **host_x, int *host_type,
                     int *ilist, int *numj, int **firstneigh,
                     const bool eflag, const bool vflag, const bool eatom,
                     const bool vatom, int &host_start, const double cpu_time,
                     bool &success) {
  LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

double ljl_gpu_bytes() {
  return LJLMF.host_memory_usage();
}
@ -0,0 +1,279 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL

#define MAX_SHARED_TYPES 8

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif

#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif

#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif

#ifdef NV_KERNEL

#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;

#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
  return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
  return tex1Dfetch(pos_tex, i);
}
#endif

#else

#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline

#define fetch_pos(i,y) x_[i]

#endif

__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;
  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (r2inv<lj1[mtype].z) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        force*=factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch) {
  // ii indexes the two interacting particles in gi
  int ii=THREAD_ID_X;
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (ii<4)
    sp_lj[ii]=sp_lj_in[ii];
  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[ii]=lj1_in[ii];
    if (eflag>0)
      lj3[ii]=lj3_in[ii];
  }
  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
  __syncthreads();

  if (ii<inum) {

    acctyp energy=(numtyp)0;
    acctyp4 f;
    f.x=(numtyp)0;
    f.y=(numtyp)0;
    f.z=(numtyp)0;
    acctyp virial[6];
    for (int i=0; i<6; i++)
      virial[i]=(numtyp)0;

    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
    __global int *list_end=nbor+mul24(numj,nbor_pitch);

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=nbor_pitch) {

      int j=*nbor;
      if (j < nall)
        factor_lj = (numtyp)1.0;
      else {
        factor_lj = sp_lj[j/nall];
        j %= nall;
      }
      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<lj1[mtype].z) {
        r2inv=(numtyp)1.0/r2inv;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor

    // Store answers
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}

#endif
@@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifdef USE_OPENCL
#include "lj_cut_gpu_cl.h"
#else
#include "lj_cut_gpu_ptx.h"
#endif

#include "lj_cut_gpu_memory.h"
#include <cassert>
#define LJL_GPU_MemoryT LJL_GPU_Memory<numtyp, acctyp>

extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

template <class numtyp, class acctyp>
LJL_GPU_MemoryT::LJL_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}

template <class numtyp, class acctyp>
LJL_GPU_MemoryT::~LJL_GPU_Memory() {
  clear();
}

template <class numtyp, class acctyp>
int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
bool LJL_GPU_MemoryT::init(const int ntypes,
                           double **host_cutsq, double **host_lj1,
                           double **host_lj2, double **host_lj3,
                           double **host_lj4, double **host_offset,
                           double *host_special_lj, const int nlocal,
                           const int nall, const int max_nbors,
                           const int maxspecial, const double cell_size,
                           const double gpu_split, FILE *_screen) {
  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                    _screen,lj_cut_gpu_kernel);

  // If atom type constants fit in shared memory, use the fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                               UCL_WRITE_OPTIMIZED);

  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                         host_cutsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                         host_offset);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
  dview.view(host_special_lj,4,*(this->ucl_device));
  ucl_copy(sp_lj,dview,false);

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
  return true;
}

template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::clear() {
  if (!_allocated)
    return;
  _allocated=false;

  lj1.clear();
  lj3.clear();
  sp_lj.clear();
  this->clear_atomic();
}

template <class numtyp, class acctyp>
double LJL_GPU_MemoryT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(LJL_GPU_Memory<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int eflag, vflag;
  if (_eflag)
    eflag=1;
  else
    eflag=0;

  if (_vflag)
    vflag=1;
  else
    vflag=0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));

  int ainum=this->atom->inum();
  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
    this->k_pair_fast.set_size(GX,BX);
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                          &lj3.begin(), &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->atom->dev_ans.begin(),
                          &this->atom->dev_engv.begin(), &eflag, &vflag,
                          &ainum, &anall, &nbor_pitch);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->atom->dev_ans.begin(),
                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
                     &anall, &nbor_pitch);
  }
  this->time_pair.stop();
}

template class LJL_GPU_Memory<PRECISION,ACC_PRECISION>;
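init() above relies on type_pack4 to flatten the per-type-pair host tables into vec4 entries (lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, per the header that follows). A rough sketch of the packing that implies; pack4_sketch is a hypothetical stand-in for the library routine, and the 1-based type indexing is an assumption about the LAMMPS host tables:

// Hypothetical packer: flattens host tables into a lj_types*lj_types vec4 array
// so the kernel fetches all coefficients for a pair type in one load.
struct numtyp4_sketch { double x, y, z, w; };

static void pack4_sketch(int ntypes, int lj_types, numtyp4_sketch *dest,
                         double **a, double **b, double **c) {
  for (int i = 0; i < lj_types; i++)
    for (int j = 0; j < lj_types; j++) {
      numtyp4_sketch v = {0.0, 0.0, 0.0, 0.0};
      if (i >= 1 && i <= ntypes && j >= 1 && j <= ntypes) {  // assumed 1-based tables
        v.x = a[i][j];  // e.g. lj1
        v.y = b[i][j];  // e.g. lj2
        v.z = c[i][j];  // e.g. cutsq
      }
      dest[i * lj_types + j] = v;  // padded slots beyond ntypes stay zero
    }
}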
@@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifndef LJL_GPU_MEMORY_H
#define LJL_GPU_MEMORY_H

#include "atomic_gpu_memory.h"

template <class numtyp, class acctyp>
class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
 public:
  LJL_GPU_Memory();
  ~LJL_GPU_Memory();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device **/
  bool init(const int ntypes, double **host_cutsq,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
  UCL_D_Vec<numtyp4> lj1;
  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
  UCL_D_Vec<numtyp4> lj3;
  /// Special LJ values
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

 private:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
};

#endif
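A sketch of the call sequence this class expects; the pair-style wrapper that actually drives it is not part of this hunk, so run_ljl and the example argument values (300 neighbors, cell size 2.5, full device split) are placeholders:

// Hypothetical driver showing the init/clear lifecycle only; assumes the
// header above is included.
#include <cstdio>

template <class numtyp, class acctyp>
bool run_ljl(LJL_GPU_Memory<numtyp,acctyp> &ljl, const int ntypes,
             double **cutsq, double **lj1, double **lj2, double **lj3,
             double **lj4, double **offset, double *special_lj,
             const int nlocal, const int nall, FILE *screen) {
  // one-time setup: copies coefficients to the device (see init() above);
  // 300/0/2.5/1.0 are example max_nbors/maxspecial/cell_size/gpu_split values
  if (!ljl.init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                nlocal, nall, 300, 0, 2.5, 1.0, screen))
    return false;
  // per-timestep work runs through the base-class compute path, which
  // internally calls the private loop(eflag, vflag) shown in the .cpp above
  ljl.clear();  // releases device buffers; also invoked by the destructor
  return true;
}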
@@ -16,206 +16,270 @@
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#include <iostream>
#include <cassert>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "pair_gpu_texture.h"
#include "pair_gpu_cell.h"
#include "lj_gpu_memory.cu"
#include <string.h>
#include "cudatimer.h"
#include "lj_tex.h"
#include "neigh.h"
#include "cell.h"
#include "lj_gpu_kernel.h"

#ifdef WINDLL
#define EXTERN extern "C" __declspec(dllexport)
#else
#define EXTERN
#endif

static float h_boxlo[3], h_boxhi[3];
static float cell_size;
static float *energy = NULL, *d_energy = NULL;
static float3 *d_force = NULL, *f_temp = NULL, *v_temp = NULL, *d_virial = NULL;
static float4 *d_pos = NULL, *temp_pos = NULL;
static int *d_type = NULL;
static int ncellx, ncelly, ncellz;

static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
static neigh_list_gpu d_neigh_list;
static cell_list_gpu d_cell_list;

// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>

template <class t>
inline string lj_gpu_toa(const t& in) {
  ostringstream o;
  o.precision(2);
  o << in;
  return o.str();
}
#define TIMING(x)

// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name) {
  string sname=LJMF.gpu.name(id)+", "+
               lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
               lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
               lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ";
  strcpy(name,sname.c_str());
EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name)
{
  struct cudaDeviceProp prop;
  CUDA_SAFE_CALL( cudaGetDeviceProperties(&prop, id) );
#ifdef _WIN32
  strcpy_s(name, strlen(prop.name)+1, prop.name);
#else
  strncpy(name, prop.name, strlen(prop.name)+1);
#endif
}

static bool _pc_cell_alloc;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
EXTERN bool lj_gpu_init(int &ij_size, const int ntypes,
                        double **cutsq, double **sigma,
                        double **epsilon, double **host_lj1, double **host_lj2,
                        double **host_lj3, double **host_lj4, double **offset,
                        double *special_lj, double *boxlo, double *boxhi,
                        double cellsize, double skin,
                        const int max_nbors, const int gpu_id)
{
  int num_devices;

inline void _lj_gpu_clear() {
  if (_pc_cell_alloc) {
    free(energy);
    free(v_temp);
    cudaFreeHost(f_temp);
    cudaFree(d_force);
    cudaFree(d_energy);
    cudaFree(d_virial);
    clear_cell_list(cell_list_gpu);
    _pc_cell_alloc=false;
  /* get device count */
  CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
  if (num_devices == 0) {
    printf("NO CUDA-capable GPU detected.\n");
    exit(1);
  }

  if (gpu_id > num_devices) {
    printf("gpu_id %d is larger than the number of GPUs %d\n",
           gpu_id, num_devices);
    exit(1);
  }

  /* set CUDA device to the specified GPU */
  cudaThreadExit();
  CUDA_SAFE_CALL( cudaSetDevice(gpu_id) );

  ij_size=0;

  cell_size = cellsize;
  ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
  ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
  ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);

  for (int i = 0; i < 3; i++) {
    h_boxhi[i] = boxhi[i];
    h_boxlo[i] = boxlo[i];
  }

  init_force_const(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);

  init_cell_list_const(cellsize, skin, boxlo, boxhi);

  return true;
}

// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
EXTERN void lj_gpu_clear() {
  _lj_gpu_clear();
  LJMF.clear();

  free(energy);
  free(v_temp);
  CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
  if (d_force) CUDA_SAFE_CALL( cudaFree(d_force) );
  if (d_energy) CUDA_SAFE_CALL( cudaFree(d_energy) );
  if (d_virial) CUDA_SAFE_CALL( cudaFree(d_virial) );
  if (d_pos) CUDA_SAFE_CALL( cudaFree(d_pos) );
  if (d_type) CUDA_SAFE_CALL( cudaFree(d_type) );
  if (temp_pos) CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
  clear_neigh_list_gpu(d_neigh_list);
  clear_cell_list_gpu(d_cell_list);

  if (useCache) {
    unbind_pos();
    unbind_type();
  }

  //LJMF.clear();
}

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq, double **sigma,
                        double **epsilon, double **host_lj1, double **host_lj2,
                        double **host_lj3, double **host_lj4, double **offset,
                        double *special_lj, double *boxlo, double *boxhi,
                        double cell_size, double skin,
                        const int max_nbors, const int gpu_id) {
  if (LJMF.is_allocated())
    lj_gpu_clear();
  else
    _pc_cell_alloc=false;

  LJMF.gpu.init();
  if (LJMF.gpu.num_devices()==0)
    return false;

  ij_size=IJ_SIZE;

  bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
                       host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id,
                       0,0);

  ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
  ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
  ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);

  init_cell_list_const(cell_size, skin, boxlo, boxhi);

  return ret;
}

template <class numtyp, class acctyp>
double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
                    double **host_x, int *host_type, const int inum,
                    const int nall, const int ago, const bool eflag, const bool vflag,
                    const double *boxlo, const double *boxhi)
double _lj_gpu_neigh(double **force, double *virial,
                     double **host_x, int *host_type, const int inum,
                     const int nall, const int ago, const bool eflag, const bool vflag,
                     const double *boxlo, const double *boxhi)
{
  cudaError_t err;

  ljm.atom.nall(nall);
  ljm.atom.inum(inum);

  ljm.nbor.time_nbor.start();
  ljm.nbor.time_nbor.stop();

  double evdwl=0.0;

  static int blockSize = BLOCK_1D;
  static int ncell = ncellx*ncelly*ncellz;

  static int first_call = 1;

  TIMING( static CUDATimer cuTimer );
  TIMING( static CTimer cTimer );
  TIMING( static CTimer cTimer2 );

  double *atom_pos = host_x[0];

  static int szTailList = inum*32;

  TIMING( cTimer.Start() );
  TIMING( cTimer2.Start() );

  /* MPI communication just happened; reallocate space using the new inum & nall.
     FIXME: this is costly: ~ total kernel time! Use a DIY GPU memory allocator. */

  if (first_call || ago == 0) {
    first_call = 0;
    _lj_gpu_clear();

    if (!first_call) {
      if (useCache) {
        unbind_pos();
        unbind_type();
      }

      CUDA_SAFE_CALL( cudaFree(d_force) );
      CUDA_SAFE_CALL( cudaFree(d_energy) );
      CUDA_SAFE_CALL( cudaFree(d_virial) );
      CUDA_SAFE_CALL( cudaFree(d_pos) );
      CUDA_SAFE_CALL( cudaFree(d_type) );

      clear_neigh_list_gpu(d_neigh_list);

      CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
      CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );

      free(energy);
      free(v_temp);
    }

    CUDA_SAFE_CALL( cudaMalloc((void**)&d_force, inum*sizeof(float3)) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_energy, inum*sizeof(float)) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) );

    init_neigh_list_gpu(d_neigh_list, inum, NEIGH_BIN_SIZE, szTailList);

    CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) );
    CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) );

    energy = (float*) malloc(inum*sizeof(float));
    v_temp = (float3*)malloc(inum*2*sizeof(float3));
    cudaMallocHost((void**)&f_temp, inum*sizeof(float3));

    cudaMalloc((void**)&d_force, inum*sizeof(float3));
    cudaMalloc((void**)&d_energy, inum*sizeof(float));
    cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
    if (useCache) {
      bind_pos(d_pos, nall);
      bind_type(d_type, nall);
    }

    first_call = 0;
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUDA_SAFE_CALL( cudaGetLastError() );
    CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int),
                               cudaMemcpyHostToDevice) );

    init_cell_list(cell_list_gpu, nall, ncell, blockSize);
    _pc_cell_alloc=true;
  }

  // build cell-list on GPU
  ljm.atom.time_atom.start();
  build_cell_list(host_x[0], host_type, cell_list_gpu,
                  ncell, ncellx, ncelly, ncellz, blockSize, inum, nall, ago);
  ljm.atom.time_atom.stop();
  TIMING( static double mallocTime = 0. );
  TIMING( mallocTime += cTimer2.GetET() );
  TIMING( printf("malloc time = %f ms\n", mallocTime*1e3) );

  ljm.time_pair.start();
  TIMING( cTimer2.Start() );
  for (int i = 0; i < 3*nall; i+=3) {
    temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f);
  }

#ifdef TIMING
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);
#endif
  TIMING( static double copyTime = 0. );
  TIMING( copyTime += cTimer2.GetET() );
  TIMING( printf("position copy time = %f ms\n", copyTime*1e3) );

#define KERNEL_LJ_CELL(e, v, b, s) kernel_lj_cell<e,v,b><<<GX, BX, s>>> \
  (d_force, d_energy, d_virial, \
   cell_list_gpu.pos, \
   cell_list_gpu.idx, \
   cell_list_gpu.type, \
   cell_list_gpu.natom, \
   inum, nall, ncell, ncellx, ncelly, ncellz);

  // call the cell-list force kernel
  const int BX=blockSize;
  dim3 GX(ncellx, ncelly*ncellz);

  if (eflag == 0 && vflag == 0) {
    if (blockSize == 64 ) KERNEL_LJ_CELL(false, false, 64, 0);
    if (blockSize == 128) KERNEL_LJ_CELL(false, false, 128, 0);
    if (blockSize == 256) KERNEL_LJ_CELL(false, false, 256, 0);
  } else {
    if (blockSize == 64)  KERNEL_LJ_CELL(true, true, 64, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
    if (blockSize == 128) KERNEL_LJ_CELL(true, true, 128, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
    if (blockSize == 256) KERNEL_LJ_CELL(true, true, 256, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
    TIMING( cTimer2.Start() );
    CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4), cudaMemcpyHostToDevice) );

    TIMING( static double h2dTime = 0. );
    TIMING( h2dTime += cTimer2.GetET() );
    TIMING( printf("h2d copy time = %f ms\n", h2dTime*1e3) );

    TIMING( cTimer2.Start() );
    if (ago == 0) {
      build_neigh_list_gpu(d_pos,
                           d_neigh_list,
                           h_boxlo, h_boxhi, cell_size,
                           inum, nall);
    }

    err = cudaGetLastError();
    if (err != cudaSuccess) {
      printf("LJ force kernel launch error: %d\n", err);
      exit(1);
    TIMING( static double neighTime = 0. );
    TIMING( neighTime += cTimer2.GetET() );
    TIMING( printf("Neigh List time = %f ms\n", neighTime*1e3) );

    TIMING( cTimer2.Start() );
    calc_lj_neigh_gpu(d_force, d_energy, d_virial,
                      d_pos, d_type,
                      d_neigh_list,
                      inum, nall,
                      eflag, vflag);
    TIMING( static double forceTime = 0. );
    TIMING( forceTime += cTimer2.GetET() );
    TIMING( printf("Force time = %f ms\n", forceTime*1e3) );
    TIMING( printf("GPU kernel time = %f ms\n", (forceTime + neighTime)*1e3) );

    TIMING( cTimer2.Start() );
    CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost) );
    TIMING( static double d2hTime = 0. );
    TIMING( d2hTime += cTimer2.GetET() );
    TIMING( printf("d2h copy time = %f ms\n", d2hTime*1e3) );
    TIMING( printf("GPU-CPU data transfer time = %f ms\n", (h2dTime+d2hTime)*1e3) );

    TIMING( cTimer2.Start() );

    for (int i = 0; i < inum; i++) {
      force[i][0] += f_temp[i].x;
      force[i][1] += f_temp[i].y;
      force[i][2] += f_temp[i].z;
    }

#ifdef TIMING
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  float kTime;
  cudaEventElapsedTime(&kTime, start, stop);
  kernelTime += kTime;
  printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
#endif

  // copy results from GPU to CPU
  cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
  if (eflag) {
    cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
    CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy,
                               inum*sizeof(float), cudaMemcpyDeviceToHost) );
    for (int i = 0; i < inum; i++) {
      evdwl += energy[i];
    }
    evdwl *= 0.5f;
  }

  if (vflag) {
    cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
    CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3),
                               cudaMemcpyDeviceToHost) );
    for (int i = 0; i < inum; i++) {
      virial[0] += v_temp[2*i].x;
      virial[1] += v_temp[2*i].y;
@@ -228,43 +292,175 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
      virial[i] *= 0.5f;
    }

  for (int i = 0; i < inum; i++) {
    force[i][0] += f_temp[i].x;
    force[i][1] += f_temp[i].y;
    force[i][2] += f_temp[i].z;
  }

  ljm.time_pair.stop();

  ljm.atom.time_atom.add_to_total();
  ljm.nbor.time_nbor.add_to_total();
  ljm.time_pair.add_to_total();
  TIMING( static double postTime = 0. );
  TIMING( postTime += cTimer2.GetET() );
  TIMING( printf("postprocess Time = %f ms\n", postTime*1e3) );
  TIMING( printf("Data process time = %f ms\n", (postTime+copyTime)*1e3) );

  TIMING( static double totalTime = 0. );
  TIMING( totalTime += cTimer.GetET() );
  TIMING( printf("lj_gpu time = %f ms\n", totalTime*1e3) );

  return evdwl;
}

EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
                          const int ago, const bool eflag, const bool vflag,
                          const double *boxlo, const double *boxhi)
EXTERN double lj_gpu_neigh(double **force, double *virial,
                           double **host_x, int *host_type,
                           const int inum, const int nall,
                           const int ago, const bool eflag, const bool vflag,
                           const double *boxlo, const double *boxhi)
{
  return _lj_gpu_cell<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
                                               ago, eflag, vflag, boxlo, boxhi);
  return _lj_gpu_neigh<float,float>(force, virial,
                                    host_x, host_type, inum, nall,
                                    ago, eflag, vflag, boxlo, boxhi);
}

template <class numtyp, class acctyp>
double _lj_gpu_cell(double **force, double *virial,
                    double **host_x, int *host_type, const int inum,
                    const int nall, const int ago,
                    const bool eflag, const bool vflag,
                    const double *boxlo, const double *boxhi)
{
  double evdwl=0.0;

  static int ncell = ncellx*ncelly*ncellz;

  static int first_call = 1;

  // allocate memory on CPU and GPU
  if (first_call || ago == 0) {
    if (!first_call) {
      if (useCache) {
        unbind_pos();
        unbind_type();
      }

      free(energy);
      free(v_temp);

      CUDA_SAFE_CALL( cudaFree(d_force) );
      CUDA_SAFE_CALL( cudaFree(d_energy) );
      CUDA_SAFE_CALL( cudaFree(d_virial) );

      CUDA_SAFE_CALL( cudaFree(d_pos) );
      CUDA_SAFE_CALL( cudaFree(d_type) );
      CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
      CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );

      clear_cell_list_gpu(d_cell_list);
    }

    energy = (float*) malloc(inum*sizeof(float));
    v_temp = (float3*)malloc(inum*2*sizeof(float3));

    cudaMalloc((void**)&d_force, inum*sizeof(float3));
    cudaMalloc((void**)&d_energy, inum*sizeof(float));
    cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));

    CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) );

    CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) );
    CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) );

    init_cell_list_gpu(d_cell_list, nall, ncell);

    CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int),
                               cudaMemcpyHostToDevice) );

    if (useCache) {
      bind_pos(d_pos, nall);
      bind_type(d_type, nall);
    }

    first_call = 0;
  }

  /* build cell-list on GPU */
  double *atom_pos = host_x[0];
  for (int i = 0; i < 3*nall; i+=3) {
    temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f);
  }
  CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4),
                             cudaMemcpyHostToDevice) );
  if (ago == 0) {
    build_cell_list_gpu(d_pos, d_cell_list, h_boxlo, h_boxhi,
                        cell_size, inum, nall);
  }

  calc_lj_cell_gpu(d_force, d_energy, d_virial,
                   d_pos, d_type, d_cell_list,
                   inum, nall, ncellx,
                   ncelly, ncellz, cell_size,
                   eflag, vflag);

  CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3),
                             cudaMemcpyDeviceToHost) );

  for (int i = 0; i < inum; i++) {
    force[i][0] += f_temp[i].x;
    force[i][1] += f_temp[i].y;
    force[i][2] += f_temp[i].z;
  }

  if (eflag) {
    CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy,
                               inum*sizeof(float), cudaMemcpyDeviceToHost) );
    for (int i = 0; i < inum; i++) {
      evdwl += energy[i];
    }
    evdwl *= 0.5f;
  }

  if (vflag) {
    CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3),
                               cudaMemcpyDeviceToHost) );
    for (int i = 0; i < inum; i++) {
      virial[0] += v_temp[2*i].x;
      virial[1] += v_temp[2*i].y;
      virial[2] += v_temp[2*i].z;
      virial[3] += v_temp[2*i+1].x;
      virial[4] += v_temp[2*i+1].y;
      virial[5] += v_temp[2*i+1].z;
    }
    for (int i = 0; i < 6; i++)
      virial[i] *= 0.5f;
  }

  return evdwl;
}

EXTERN double lj_gpu_cell(double **force, double *virial,
                          double **host_x, int *host_type,
                          const int inum, const int nall,
                          const int ago, const bool eflag, const bool vflag,
                          const double *boxlo, const double *boxhi)
{
  return _lj_gpu_cell<float,float>(force, virial,
                                   host_x, host_type, inum, nall,
                                   ago, eflag, vflag, boxlo, boxhi);
}

EXTERN void lj_gpu_time() {
  cout.precision(4);
  cout << "Atom copy:     " << LJMF.atom.time_atom.total_seconds() << " s.\n";
  cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
  cout << "LJ calc:       " << LJMF.time_pair.total_seconds() << " s.\n";
  cout << "Answer copy:   " << LJMF.atom.time_answer.total_seconds() << " s.\n";
  /* cout.precision(4);
  cout << "Atom copy:     " << LJMF.time_atom.total_seconds() << " s.\n";
  cout << "Neighbor copy: " << LJMF.time_nbor.total_seconds() << " s.\n";
  cout << "LJ calc:       " << LJMF.time_pair.total_seconds() << " s.\n";*/
  //cout << "Answer copy:   " << LJMF.time_answer.total_seconds() << " s.\n";
}

EXTERN int lj_gpu_num_devices() {
  return LJMF.gpu.num_devices();
  int num_devices;
  CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
  return num_devices;
}

EXTERN double lj_gpu_bytes() {
  return LJMF.host_memory_usage();
  return 0.0;
}
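Both init paths above size the cell grid as (box length + 2*cell_size)/cell_size, i.e. one ghost-cell layer per side so ghost atoms just outside the box still land in a valid cell. A small standalone check of that arithmetic; the box and cell-size values are made up for illustration:

#include <math.h>
#include <stdio.h>

int main() {
  double boxlo[3] = {0.0, 0.0, 0.0}, boxhi[3] = {30.0, 30.0, 30.0};
  double cell_size = 2.8;  // cutoff + skin, as passed to lj_gpu_init
  for (int d = 0; d < 3; d++) {
    // +2*cell_size adds a one-cell ghost shell on each side of the box
    int ncell = (int) ceil(((boxhi[d] - boxlo[d]) + 2.0 * cell_size) / cell_size);
    printf("dim %d: %d cells\n", d, ncell);  // 30/2.8 -> 11 interior + 2 ghost = 13
  }
  return 0;
}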
@@ -1,220 +0,0 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL

/* Cell list version of LJ kernel */
template<bool eflag, bool vflag, int blockSize>
__global__ void kernel_lj_cell(float3 *force3,
                               float *energy, float3 *virial,
                               float3 *cell_list, unsigned int *cell_idx,
                               int *cell_type, int *cell_atom,
                               const int inum, const int nall, const int ncell,
                               const int ncellx, const int ncelly, const int ncellz)
{
  // calculate 3D block idx from 2D block
  int bx = blockIdx.x;
  int by = blockIdx.y % ncelly;
  int bz = blockIdx.y / ncelly;

  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));

  __shared__ int typeSh[blockSize];
  __shared__ float posSh[blockSize*3];
  __shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];

  extern __shared__ float smem[];

  __shared__ float *lj3Sh;
  __shared__ float *lj4Sh;
  __shared__ float *offsetSh;

  // load force parameters into shared memory
  for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
    int itype = i/MAX_SHARED_TYPES;
    int jtype = i%MAX_SHARED_TYPES;
    cutsqSh[i] = _cutsq_<float>(itype,jtype);
    lj1Sh[i]   = _lj1_<float>(itype,jtype).x;
    lj2Sh[i]   = _lj1_<float>(itype,jtype).y;
  }

  // Only allocate shared memory when needed;
  // this reduces the shared-memory limitation on occupancy
  if (eflag || vflag) {
    lj3Sh = smem;
    lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
    offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
    for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
      int itype = i/MAX_SHARED_TYPES;
      int jtype = i%MAX_SHARED_TYPES;
      lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
      lj4Sh[i] = _lj3_<float>(itype,jtype).y;
      offsetSh[i]= _offset_<float>(itype,jtype);
    }
  }

  __syncthreads();

  int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1),
      nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1),
      nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1);

  for (int ii = 0; ii < ceil((float)(cell_atom[cid])/blockSize); ii++) {
    float3 f = {0.0f, 0.0f, 0.0f};
    float ener = 0.0f;
    float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
    int itype;
    float ix, iy, iz;
    int i = tid + ii*blockSize;
    unsigned int answer_pos = cell_idx[cid*blockSize+i];

    // load current cell atom position and type into sMem
    for (int j = tid; j < cell_atom[cid]; j += blockSize) {
      int pid = cid*blockSize + j;
      float3 pos = cell_list[pid];
      posSh[j            ] = pos.x;
      posSh[j+  blockSize] = pos.y;
      posSh[j+2*blockSize] = pos.z;
      typeSh[j]            = cell_type[pid];
    }
    __syncthreads();
    if (answer_pos < inum) {
      itype = typeSh[i];
      ix = posSh[i            ];
      iy = posSh[i+  blockSize];
      iz = posSh[i+2*blockSize];

      // compute force from current cell
      for (int j = 0; j < cell_atom[cid]; j++) {
        if (j == i) continue;
        float delx = ix - posSh[j            ];
        float dely = iy - posSh[j+  blockSize];
        float delz = iz - posSh[j+2*blockSize];
        int jtype = typeSh[j];
        int mtype = itype + jtype*MAX_SHARED_TYPES;
        float r2inv = delx*delx + dely*dely + delz*delz;

        if (r2inv < cutsqSh[mtype]) {
          r2inv = 1.0f/r2inv;
          float r6inv = r2inv * r2inv * r2inv;
          float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
          f.x += delx * force;
          f.y += dely * force;
          f.z += delz * force;

          if (eflag) {
            float e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
            ener += (e - offsetSh[mtype]);
          }

          if (vflag) {
            v0.x += delx*delx*force;
            v0.y += dely*dely*force;
            v0.z += delz*delz*force;
            v1.x += delx*dely*force;
            v1.y += delx*delz*force;
            v1.z += dely*delz*force;
          }
        }
      }
    }
    __syncthreads();

    // compute force from neighboring cells
    for (int nborz = nborz0; nborz <= nborz1; nborz++) {
      for (int nbory = nbory0; nbory <= nbory1; nbory++) {
        for (int nborx = nborx0; nborx <= nborx1; nborx++) {
          if (nborz == bz && nbory == by && nborx == bx) continue;

          // compute cell id
          int cid_nbor = nborx + INT_MUL(nbory,ncellx) +
                         INT_MUL(nborz,INT_MUL(ncellx,ncelly));

          // load neighbor cell position and type into smem
          for (int j = tid; j < cell_atom[cid_nbor]; j += blockSize) {
            int pid = INT_MUL(cid_nbor,blockSize) + j;
            float3 pos = cell_list[pid];
            posSh[j            ] = pos.x;
            posSh[j+  blockSize] = pos.y;
            posSh[j+2*blockSize] = pos.z;
            typeSh[j]            = cell_type[pid];
          }
          __syncthreads();
          // compute force
          if (answer_pos < inum) {
            for (int j = 0; j < cell_atom[cid_nbor]; j++) {
              float delx = ix - posSh[j            ];
              float dely = iy - posSh[j+  blockSize];
              float delz = iz - posSh[j+2*blockSize];
              int jtype = typeSh[j];
              int mtype = itype + jtype*MAX_SHARED_TYPES;
              float r2inv = delx*delx + dely*dely + delz*delz;

              if (r2inv < cutsqSh[mtype]) {
                r2inv = 1.0f/r2inv;
                float r6inv = r2inv * r2inv * r2inv;
                float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
                f.x += delx * force;
                f.y += dely * force;
                f.z += delz * force;

                if (eflag) {
                  float e=r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
                  ener += (e-offsetSh[mtype]);
                }
                if (vflag) {
                  v0.x += delx*delx*force;
                  v0.y += dely*dely*force;
                  v0.z += delz*delz*force;
                  v1.x += delx*dely*force;
                  v1.y += delx*delz*force;
                  v1.z += dely*delz*force;
                }
              }
            }
          }
          __syncthreads();
        }
      }
    }

    if (answer_pos < inum) {
      force3[answer_pos] = f;
      if (eflag)
        energy[answer_pos] = ener;
      if (vflag) {
        virial[2*answer_pos]   = v0;
        virial[2*answer_pos+1] = v1;
      }
    }
  }

}

#endif
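kernel_lj_cell maps one thread block to one cell, but CUDA grids of this era were two-dimensional, hence the dim3 GX(ncellx, ncelly*ncellz) launch in lj_gpu.cu and the by = blockIdx.y % ncelly, bz = blockIdx.y / ncelly recovery in the kernel. A host-only sketch verifying that the flatten/recover round trip reproduces the 3D cell id (cell counts are arbitrary example values):

#include <assert.h>

// Mirror of the kernel's 2D-grid -> 3D-cell decoding; pure host code.
int main() {
  const int ncellx = 13, ncelly = 13, ncellz = 13;
  for (int bz = 0; bz < ncellz; bz++)
    for (int by = 0; by < ncelly; by++)
      for (int bx = 0; bx < ncellx; bx++) {
        int blockIdx_y = by + bz * ncelly;   // how the launch flattens y and z
        int by2 = blockIdx_y % ncelly;       // kernel-side recovery
        int bz2 = blockIdx_y / ncelly;
        int cid = bx + by2 * ncellx + bz2 * ncellx * ncelly;
        assert(cid == bx + by * ncellx + bz * ncellx * ncelly);
      }
  return 0;
}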
@@ -1,147 +0,0 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>

template <class numtyp, class acctyp>
int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
  return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
}

template <class numtyp, class acctyp>
bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                          double **host_cutsq, double **host_sigma,
                          double **host_epsilon, double **host_lj1,
                          double **host_lj2, double **host_lj3,
                          double **host_lj4, double **host_offset,
                          double *host_special_lj, const int max_nbors,
                          const int me, const int nlocal, const int nall) {
  if (allocated)
    clear();

  if (me>=gpu.num_devices())
    return false;
  gpu.set(me);
  if (gpu.revision()<1.0)
    return false;

  // Initialize timers for the selected GPU
  time_pair.init();

  // Initialize atom and nbor data
  max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
  if (max_local==0)
    max_local=1000;
  if (nall<=nlocal)
    max_atoms=max_local*2;
  else
    max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);

  if (!atom.init(max_atoms))
    return false;
  if (!nbor.init(ij_size,max_local,max_nbors))
    return false;

  // Get a stream for computing pair potentials
  CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));

  // Use the write buffer from atom for data initialization
  NVC_HostT &host_write=atom.host_write;
  assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);

  // Copy data for bonded interactions
  special_lj.safe_alloc(4);
  special_lj.cast_copy(host_special_lj,host_write);

  // Copy sigma, epsilon, and cutsq onto GPU
  sigma.safe_alloc(ntypes,ntypes,sigma_get_texture<numtyp>());
  sigma.cast_copy(host_sigma[0],host_write);
  epsilon.safe_alloc(ntypes,ntypes,epsilon_get_texture<numtyp>());
  epsilon.cast_copy(host_epsilon[0],host_write);
  cutsq.safe_alloc(ntypes,ntypes,cutsq_get_texture<numtyp>());
  cutsq.cast_copy(host_cutsq[0],host_write);

  // If atom type constants fit in shared memory, use the fast kernel
  int lj_types=ntypes;
  shared_types=false;
  if (lj_types<=MAX_SHARED_TYPES) {
    lj_types=MAX_SHARED_TYPES;
    shared_types=true;
  }
  offset.safe_alloc(lj_types,lj_types,offset_get_texture<numtyp>());
  offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes);
  double *t1=host_lj1[0];
  double *t2=host_lj2[0];
  for (int i=0; i<ntypes*ntypes; i++) {
    host_write[i*2]=t1[i];
    host_write[i*2+1]=t2[i];
  }
  lj1.safe_alloc(lj_types,lj_types,lj1_get_texture<numtyp>());
  lj1.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *>
                       (host_write.begin()),ntypes,ntypes);
  t1=host_lj3[0];
  t2=host_lj4[0];
  for (int i=0; i<ntypes*ntypes; i++) {
    host_write[i*2]=t1[i];
    host_write[i*2+1]=t2[i];
  }
  lj3.safe_alloc(lj_types,lj_types,lj3_get_texture<numtyp>());
  lj3.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *>
                       (host_write.begin()),ntypes,ntypes);

  dev_error.safe_alloc(1);
  dev_error.zero();

  allocated=true;
  return true;
}

template <class numtyp, class acctyp>
void LJ_GPU_MemoryT::clear() {
  if (!allocated)
    return;
  allocated=false;

  // Check for any pair style specific errors here
  int err_flag;
  dev_error.copy_to_host(&err_flag);

  atom.clear();
  nbor.clear();

  CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream));

  dev_error.clear();
  sigma.clear();
  epsilon.clear();
  special_lj.clear();
  cutsq.clear();
  offset.clear();
  lj1.clear();
  lj3.clear();
}

template <class numtyp, class acctyp>
double LJ_GPU_MemoryT::host_memory_usage() const {
  return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+
         sizeof(LJ_GPU_Memory<numtyp,acctyp>);
}

template class LJ_GPU_Memory<PRECISION,ACC_PRECISION>;
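init() above pads the atom buffers by 10% over nlocal (with a floor of 1000, and a doubling fallback when nall <= nlocal) so the buffers survive small fluctuations in atom counts between reneighborings without reallocation. The sizing rule isolated as a sketch; function names are mine, the arithmetic is the file's:

// Standalone restatement of the buffer-sizing rule in LJ_GPU_Memory::init().
static int size_max_local(int nlocal) {
  int max_local = (int)((double)nlocal * 1.10);  // 10% headroom over owned atoms
  if (max_local == 0) max_local = 1000;          // floor for empty subdomains
  return max_local;
}

static int size_max_atoms(int nlocal, int nall) {
  int max_local = size_max_local(nlocal);
  if (nall <= nlocal)
    return max_local * 2;                        // no ghost count yet: guess double
  return (int)((double)nall * 1.10);             // pad the owned+ghost count
}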
@@ -1,87 +0,0 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#ifndef LJ_GPU_MEMORY_H
#define LJ_GPU_MEMORY_H

#include "nvc_device.h"
#include "nvc_traits.h"
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor.h"

#define BLOCK_1D 64 // max value = 256
#define CELL_SIZE BLOCK_1D
#define MAX_SHARED_TYPES 8
#define BIG_NUMBER 100000000

template <class numtyp, class acctyp>
class LJ_GPU_Memory {
 public:
  LJ_GPU_Memory() : allocated(false) {}
  ~LJ_GPU_Memory() { clear(); }

  inline bool is_allocated() { return allocated; }

  /// Allocate memory on host and device
  bool init(const int ij_size, const int ntypes, double **host_cutsq,
            double **host_sigma, double **host_epsilon,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_offset, double *host_special_lj,
            const int max_nbors, const int me, const int nlocal,
            const int nall);
  /// Free any memory on host and device
  void clear();

  /// Returns memory usage on GPU per atom
  int bytes_per_atom(const int max_nbors) const;
  /// Total host memory used by library
  double host_memory_usage() const;

  // ------------------------- DATA -----------------------------

  // Device Properties
  NVCDevice gpu;
  // Device Error Flag
  NVC_VecI dev_error;
  // Stream for asynchronous work
  cudaStream_t pair_stream;

  // Atom Data
  PairGPUAtom<numtyp,acctyp> atom;
  // Neighbor Data
  PairGPUNbor nbor;

  // --------------- Const Data for Atoms
  NVC_ConstMatT sigma, epsilon, cutsq, offset;
  NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3;
  NVC_VecT special_lj;

  size_t max_atoms, max_local;

  // Timing for pair calculation
  NVCTimer time_pair;

  // If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

 protected:
  bool allocated;
};

#endif
@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#include <iostream>
#include <cassert>
#include <math.h>

#include "ljc_cut_gpu_memory.h"

using namespace std;

static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
                  double **offset, double *special_lj, const int inum,
                  const int nall, const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen,
                  double **host_cut_ljsq, double **host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e) {
  LJCMF.clear();
  gpu_mode=LJCMF.device->gpu_mode();
  double gpu_split=LJCMF.device->particle_split();
  int first_gpu=LJCMF.device->first_device();
  int last_gpu=LJCMF.device->last_device();
  int world_me=LJCMF.device->world_me();
  int gpu_rank=LJCMF.device->gpu_rank();
  int procs_per_gpu=LJCMF.device->procs_per_gpu();

  LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu);

  bool message=false;
  if (world_me==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  if (world_me==0) {
    bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                            host_lj4, offset, special_lj, inum, nall, 300,
                            maxspecial, cell_size, gpu_split, screen,
                            host_cut_ljsq, host_cut_coulsq, host_special_coul,
                            qqrd2e);
    if (!init_ok)
      return false;
  }

  MPI_Barrier(MPI_COMM_WORLD);
  if (message)
    fprintf(screen,"Done.\n");

  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0) {
      bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                              host_lj4, offset, special_lj, inum, nall, 300,
                              maxspecial, cell_size, gpu_split,
                              screen, host_cut_ljsq, host_cut_coulsq,
                              host_special_coul, qqrd2e);
      if (!init_ok)
        return false;
    }
    MPI_Barrier(LJCMF.device->gpu_comm);
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return true;
}

void ljc_gpu_clear() {
  LJCMF.clear();
}

int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full,
                        const int nall, double **host_x, int *host_type,
                        double *boxlo, double *boxhi, int *tag, int **nspecial,
                        int **special, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        const double cpu_time, bool &success, double *host_q) {
  return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, cpu_time, success, host_q);
}

void ljc_gpu_compute(const int timestep, const int ago, const int inum_full,
                     const int nall, double **host_x, int *host_type,
                     int *ilist, int *numj, int **firstneigh,
                     const bool eflag, const bool vflag, const bool eatom,
                     const bool vatom, int &host_start, const double cpu_time,
                     bool &success, double *host_q) {
  LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                host_q);
}

double ljc_gpu_bytes() {
  return LJCMF.host_memory_usage();
}
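ljc_gpu_init above compiles on world rank 0 first, then walks the ranks sharing each GPU one at a time with barriers, so co-located processes never build the kernels simultaneously. A stripped-down sketch of that pattern; compile_on_device is a placeholder standing in for the LJCMF.init calls:

#include <mpi.h>

// Pattern sketch: rank 0 warms the compile cache, then ranks sharing a GPU
// initialize one per loop iteration, synchronized on the per-GPU communicator.
bool compile_on_device();  // placeholder for LJCMF.init(...)

bool staggered_init(int world_me, int gpu_rank, int procs_per_gpu,
                    MPI_Comm gpu_comm) {
  if (world_me == 0 && !compile_on_device())
    return false;
  MPI_Barrier(MPI_COMM_WORLD);           // everyone waits for rank 0's compile
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0 && !compile_on_device())
      return false;
    MPI_Barrier(gpu_comm);               // keep co-located ranks in lockstep
  }
  return true;
}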