From 9656958169df763b79a91194e8c67c876c1f7d8e Mon Sep 17 00:00:00 2001
From: sjplimp
Date: Fri, 1 Jul 2016 23:27:26 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 lib/gpu/Makefile.lammps.mingw-cross | 2 +-
 lib/gpu/Makefile.linux | 2 +-
 lib/gpu/Makefile.mingw32-cross | 2 +-
 lib/gpu/Makefile.mingw32-cross-mpi | 2 +-
 lib/gpu/Makefile.mingw64-cross | 2 +-
 lib/gpu/Makefile.mingw64-cross-mpi | 2 +-
 lib/gpu/geryon/nvd_device.h | 92 +++----
 lib/gpu/geryon/nvd_kernel.h | 126 +++++-----
 lib/gpu/geryon/nvd_mat.h | 6 +-
 lib/gpu/geryon/nvd_memory.h | 106 ++++----
 lib/gpu/geryon/nvd_texture.h | 24 +-
 lib/gpu/geryon/nvd_timer.h | 30 +--
 lib/gpu/geryon/ocl_device.h | 120 ++++-----
 lib/gpu/geryon/ocl_kernel.h | 132 +++++-----
 lib/gpu/geryon/ocl_mat.h | 6 +-
 lib/gpu/geryon/ocl_memory.h | 116 ++++-----
 lib/gpu/geryon/ocl_texture.h | 8 +-
 lib/gpu/geryon/ocl_timer.h | 30 +--
 lib/gpu/geryon/ucl_arg_kludge.h | 372 ++++++++++++++--------------
 lib/gpu/geryon/ucl_basemat.h | 16 +-
 lib/gpu/geryon/ucl_copy.h | 162 ++++++------
 lib/gpu/geryon/ucl_d_mat.h | 112 ++++-----
 lib/gpu/geryon/ucl_d_vec.h | 98 ++++----
 lib/gpu/geryon/ucl_h_mat.h | 164 ++++++------
 lib/gpu/geryon/ucl_h_vec.h | 144 +++++------
 lib/gpu/geryon/ucl_matrix.h | 48 ++--
 lib/gpu/geryon/ucl_nv_kernel.h | 6 +-
 lib/gpu/geryon/ucl_print.h | 64 ++---
 lib/gpu/geryon/ucl_s_obj_help.h | 30 +--
 lib/gpu/geryon/ucl_types.h | 78 +++---
 lib/gpu/geryon/ucl_vector.h | 48 ++--
 lib/gpu/lal_answer.cpp | 46 ++--
 lib/gpu/lal_atom.cpp | 46 ++--
 lib/gpu/lal_atom.cu | 6 +-
 lib/gpu/lal_atom.h | 60 ++---
 lib/gpu/lal_balance.h | 18 +-
 lib/gpu/lal_base_atomic.cpp | 20 +-
 lib/gpu/lal_base_atomic.h | 22 +-
 lib/gpu/lal_base_charge.cpp | 16 +-
 lib/gpu/lal_base_charge.h | 12 +-
 lib/gpu/lal_base_dipole.cpp | 18 +-
 lib/gpu/lal_base_dipole.h | 12 +-
 lib/gpu/lal_base_dpd.cpp | 18 +-
 lib/gpu/lal_base_dpd.h | 12 +-
 lib/gpu/lal_base_ellipsoid.cpp | 34 +--
 lib/gpu/lal_base_ellipsoid.h | 28 +--
 lib/gpu/lal_base_three.cpp | 52 ++--
 lib/gpu/lal_base_three.h | 31 +--
 lib/gpu/lal_beck.cpp | 10 +-
 lib/gpu/lal_beck.cu | 26 +-
 lib/gpu/lal_beck.h | 12 +-
 lib/gpu/lal_beck_ext.cpp | 8 +-
 lib/gpu/lal_born.cpp | 26 +-
 lib/gpu/lal_born.cu | 58 ++---
 lib/gpu/lal_born.h | 20 +-
 lib/gpu/lal_born_coul_long.cpp | 28 +--
 lib/gpu/lal_born_coul_long.cu | 268 ++++++++++----------
 lib/gpu/lal_born_coul_long.h | 20 +-
 lib/gpu/lal_born_coul_long_ext.cpp | 32 +--
 lib/gpu/lal_born_coul_wolf.cpp | 30 +--
 lib/gpu/lal_born_coul_wolf.cu | 64 ++---
 lib/gpu/lal_born_coul_wolf.h | 20 +-
 lib/gpu/lal_born_coul_wolf_ext.cpp | 24 +-
 lib/gpu/lal_born_ext.cpp | 30 +--
 lib/gpu/lal_buck.cpp | 30 +--
 lib/gpu/lal_buck.cu | 54 ++--
 lib/gpu/lal_buck.h | 18 +-
 lib/gpu/lal_buck_coul.cpp | 30 +--
 lib/gpu/lal_buck_coul.cu | 80 +++---
 lib/gpu/lal_buck_coul.h | 20 +-
 lib/gpu/lal_buck_coul_ext.cpp | 24 +-
 lib/gpu/lal_buck_coul_long.cpp | 26 +-
 lib/gpu/lal_buck_coul_long.cu | 276 ++++++++++-----------
 lib/gpu/lal_buck_coul_long.h | 14 +-
 lib/gpu/lal_buck_coul_long_ext.cpp | 20 +-
 lib/gpu/lal_buck_ext.cpp | 24 +-
 lib/gpu/lal_cg_cmm.cpp | 24 +-
 lib/gpu/lal_cg_cmm.cu | 44 ++--
 lib/gpu/lal_cg_cmm.h | 10 +-
 lib/gpu/lal_cg_cmm_ext.cpp | 14 +-
 lib/gpu/lal_cg_cmm_long.cpp | 24 +-
 lib/gpu/lal_cg_cmm_long.cu | 38 +--
 lib/gpu/lal_cg_cmm_long.h | 12 +-
 lib/gpu/lal_cg_cmm_long_ext.cpp | 16 +-
 lib/gpu/lal_charmm_long.cpp | 20 +-
 lib/gpu/lal_charmm_long.cu | 42 ++--
 lib/gpu/lal_charmm_long.h | 12 +-
 lib/gpu/lal_charmm_long_ext.cpp | 16 +-
 lib/gpu/lal_colloid.cpp | 30 +--
 lib/gpu/lal_colloid.cu | 116 ++++-----
 lib/gpu/lal_colloid.h | 20 +-
 lib/gpu/lal_colloid_ext.cpp | 22 +-
 lib/gpu/lal_coul.cpp | 18 +-
 lib/gpu/lal_coul.cu | 38 +--
 lib/gpu/lal_coul.h | 12 +-
 lib/gpu/lal_coul_debye.cpp | 16 +-
 lib/gpu/lal_coul_debye.cu | 26 +-
 lib/gpu/lal_coul_debye.h | 12 +-
 lib/gpu/lal_coul_debye_ext.cpp | 16 +-
 lib/gpu/lal_coul_dsf.cpp | 18 +-
 lib/gpu/lal_coul_dsf.cu | 56 ++---
 lib/gpu/lal_coul_dsf.h | 10 +-
 lib/gpu/lal_coul_dsf_ext.cpp | 22 +-
 lib/gpu/lal_coul_ext.cpp | 18 +-
 lib/gpu/lal_coul_long.cpp | 14 +-
 lib/gpu/lal_coul_long.cu | 18 +-
 lib/gpu/lal_coul_long.h | 12 +-
 lib/gpu/lal_coul_long_ext.cpp | 42 ++--
 lib/gpu/lal_device.cpp | 112 ++++-----
 lib/gpu/lal_device.cu | 6 +-
 lib/gpu/lal_device.h | 68 ++---
 lib/gpu/lal_dipole_lj.cpp | 16 +-
 lib/gpu/lal_dipole_lj.cu | 90 +++----
 lib/gpu/lal_dipole_lj.h | 8 +-
 lib/gpu/lal_dipole_lj_ext.cpp | 12 +-
 lib/gpu/lal_dipole_lj_sf.cpp | 20 +-
 lib/gpu/lal_dipole_lj_sf.cu | 122 ++++-----
 lib/gpu/lal_dipole_lj_sf.h | 8 +-
 lib/gpu/lal_dipole_lj_sf_ext.cpp | 12 +-
 lib/gpu/lal_dpd.cpp | 30 +--
 lib/gpu/lal_dpd.cu | 84 +++----
 lib/gpu/lal_dpd.h | 18 +-
 lib/gpu/lal_dpd_ext.cpp | 22 +-
 lib/gpu/lal_eam.cpp | 134 +++++-----
 lib/gpu/lal_eam.cu | 136 +++++-----
 lib/gpu/lal_eam.h | 54 ++--
 lib/gpu/lal_eam_alloy_ext.cpp | 32 +--
 lib/gpu/lal_eam_ext.cpp | 32 +--
 lib/gpu/lal_eam_fs_ext.cpp | 32 +--
 lib/gpu/lal_ellipsoid_extra.h | 16 +-
 lib/gpu/lal_ellipsoid_nbor.cu | 34 +--
 lib/gpu/lal_gauss.cpp | 22 +-
 lib/gpu/lal_gauss.cu | 60 ++---
 lib/gpu/lal_gauss.h | 18 +-
 lib/gpu/lal_gauss_ext.cpp | 24 +-
 lib/gpu/lal_gayberne.cpp | 80 +++---
 lib/gpu/lal_gayberne.cu | 144 +++++------
 lib/gpu/lal_gayberne.h | 26 +-
 lib/gpu/lal_gayberne_ext.cpp | 20 +-
 lib/gpu/lal_gayberne_lj.cu | 130 +++++-----
 lib/gpu/lal_lj.cpp | 30 +--
 lib/gpu/lal_lj.cu | 60 ++---
 lib/gpu/lal_lj.h | 16 +-
 lib/gpu/lal_lj96.cpp | 18 +-
 lib/gpu/lal_lj96.cu | 50 ++--
 lib/gpu/lal_lj96.h | 10 +-
 lib/gpu/lal_lj96_ext.cpp | 8 +-
 lib/gpu/lal_lj_class2_long.cpp | 16 +-
 lib/gpu/lal_lj_class2_long.cu | 42 ++--
 lib/gpu/lal_lj_class2_long.h | 8 +-
 lib/gpu/lal_lj_class2_long_ext.cpp | 8 +-
 lib/gpu/lal_lj_coul.cpp | 24 +-
 lib/gpu/lal_lj_coul.cu | 46 ++--
 lib/gpu/lal_lj_coul.h | 8 +-
 lib/gpu/lal_lj_coul_debye.cpp | 22 +-
 lib/gpu/lal_lj_coul_debye.cu | 42 ++--
 lib/gpu/lal_lj_coul_debye.h | 8 +-
 lib/gpu/lal_lj_coul_debye_ext.cpp | 12 +-
 lib/gpu/lal_lj_coul_ext.cpp | 10 +-
 lib/gpu/lal_lj_coul_long.cpp | 22 +-
 lib/gpu/lal_lj_coul_long.cu | 38 +--
 lib/gpu/lal_lj_coul_long.h | 10 +-
 lib/gpu/lal_lj_coul_long_ext.cpp | 18 +-
 lib/gpu/lal_lj_coul_msm.cpp | 24 +-
 lib/gpu/lal_lj_coul_msm.cu | 30 +--
 lib/gpu/lal_lj_coul_msm.h | 14 +-
 lib/gpu/lal_lj_coul_msm_ext.cpp | 10 +-
 lib/gpu/lal_lj_cubic.cpp | 26 +-
 lib/gpu/lal_lj_cubic.cu | 64 ++---
 lib/gpu/lal_lj_cubic.h | 16 +-
 lib/gpu/lal_lj_cubic_ext.cpp | 16 +-
 lib/gpu/lal_lj_dsf.cpp | 24 +-
 lib/gpu/lal_lj_dsf.cu | 46 ++--
 lib/gpu/lal_lj_dsf.h | 8 +-
 lib/gpu/lal_lj_dsf_ext.cpp | 10 +-
 lib/gpu/lal_lj_expand.cpp | 28 +--
 lib/gpu/lal_lj_expand.cu | 70 +++---
 lib/gpu/lal_lj_expand.h | 14 +-
 lib/gpu/lal_lj_expand_ext.cpp | 14 +-
 lib/gpu/lal_lj_ext.cpp | 14 +-
 lib/gpu/lal_lj_gromacs.cpp | 22 +-
 lib/gpu/lal_lj_gromacs.cu | 26 +-
 lib/gpu/lal_lj_gromacs.h | 12 +-
 lib/gpu/lal_lj_gromacs_ext.cpp | 16 +-
 lib/gpu/lal_mie.cpp | 16 +-
 lib/gpu/lal_mie.cu | 42 ++--
 lib/gpu/lal_mie.h | 12 +-
 lib/gpu/lal_mie_ext.cpp | 8 +-
 lib/gpu/lal_morse.cpp | 24 +-
 lib/gpu/lal_morse.cu | 48 ++--
 lib/gpu/lal_morse.h | 12 +-
 lib/gpu/lal_morse_ext.cpp | 14 +-
 lib/gpu/lal_neighbor.cpp | 119 +++++----
 lib/gpu/lal_neighbor.h | 62 ++---
 lib/gpu/lal_neighbor_cpu.cu | 6 +-
 lib/gpu/lal_neighbor_gpu.cu | 118 ++++-----
 lib/gpu/lal_neighbor_shared.cpp | 6 +-
 lib/gpu/lal_neighbor_shared.h | 6 +-
 lib/gpu/lal_pppm.cpp | 44 ++--
 lib/gpu/lal_pppm.cu | 68 ++---
 lib/gpu/lal_pppm.h | 28 +--
 lib/gpu/lal_pppm_ext.cpp | 18 +-
 lib/gpu/lal_precision.h | 16 +-
 lib/gpu/lal_preprocessor.h | 20 +-
 lib/gpu/lal_re_squared.cpp | 68 ++---
 lib/gpu/lal_re_squared.cu | 40 +--
 lib/gpu/lal_re_squared.h | 20 +-
 lib/gpu/lal_re_squared_ext.cpp | 20 +-
 lib/gpu/lal_re_squared_lj.cu | 140 +++++------
 lib/gpu/lal_soft.cpp | 18 +-
 lib/gpu/lal_soft.cu | 32 +--
 lib/gpu/lal_soft.h | 14 +-
 lib/gpu/lal_soft_ext.cpp | 16 +-
 lib/gpu/lal_sw.cpp | 64 ++---
 lib/gpu/lal_sw.cu | 128 +++++-----
 lib/gpu/lal_sw.h | 12 +-
 lib/gpu/lal_sw_ext.cpp | 26 +-
 lib/gpu/lal_table.cpp | 88 +++----
 lib/gpu/lal_table.cu | 328 ++++++++++------------
 lib/gpu/lal_table.h | 36 +--
 lib/gpu/lal_table_ext.cpp | 14 +-
 lib/gpu/lal_tersoff.cpp | 12 +-
 lib/gpu/lal_tersoff.cu | 30 ++-
 lib/gpu/lal_tersoff_ext.cpp | 2 +-
 lib/gpu/lal_tersoff_extra.h | 2 +-
 lib/gpu/lal_tersoff_mod.cpp | 12 +-
 lib/gpu/lal_tersoff_mod.cu | 32 ++-
 lib/gpu/lal_tersoff_mod_ext.cpp | 2 +-
 lib/gpu/lal_tersoff_mod_extra.h | 4 +-
 lib/gpu/lal_tersoff_zbl.cpp | 12 +-
 lib/gpu/lal_tersoff_zbl.cu | 30 ++-
 lib/gpu/lal_tersoff_zbl_ext.cpp | 2 +-
 lib/gpu/lal_tersoff_zbl_extra.h | 2 +-
 lib/gpu/lal_yukawa.cpp | 18 +-
 lib/gpu/lal_yukawa.cu | 52 ++--
 lib/gpu/lal_yukawa.h | 16 +-
 lib/gpu/lal_yukawa_colloid.cpp | 64 ++---
 lib/gpu/lal_yukawa_colloid.cu | 74 +++---
 lib/gpu/lal_yukawa_colloid.h | 28 +--
 lib/gpu/lal_yukawa_colloid_ext.cpp | 26 +-
 lib/gpu/lal_yukawa_ext.cpp | 22 +-
 lib/gpu/lal_zbl.cpp | 28 +--
 lib/gpu/lal_zbl.cu | 108 ++++----
 lib/gpu/lal_zbl.h | 20 +-
 lib/gpu/lal_zbl_ext.cpp | 20 +-
 245 files changed, 4890 insertions(+), 4832 deletions(-)

diff --git a/lib/gpu/Makefile.lammps.mingw-cross b/lib/gpu/Makefile.lammps.mingw-cross
index e92c3e9d0d..12d833c744 100644
--- a/lib/gpu/Makefile.lammps.mingw-cross
+++ b/lib/gpu/Makefile.lammps.mingw-cross
@@ -1,6 +1,6 @@
 # Settings that the LAMMPS build will import when this package library is used
 # settings for OpenCL builds
 gpu_SYSINC =
-gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
+gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
 gpu_SYSPATH =
diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index 1e689a355c..d72c0ba437 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -7,7 +7,7 @@
 EXTRAMAKE = Makefile.lammps.standard
 
-ifeq($(CUDA_HOME),)
+ifeq ($(CUDA_HOME),)
 CUDA_HOME = /usr/local/cuda
 endif
 
diff --git a/lib/gpu/Makefile.mingw32-cross b/lib/gpu/Makefile.mingw32-cross
index 3f1240af1a..6f77634755 100644
--- a/lib/gpu/Makefile.mingw32-cross
+++ b/lib/gpu/Makefile.mingw32-cross
@@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
 OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
           -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
           -I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
 EXTRAMAKE = Makefile.lammps.mingw-cross
diff --git a/lib/gpu/Makefile.mingw32-cross-mpi b/lib/gpu/Makefile.mingw32-cross-mpi
index 6dae2d0604..94099cd90b 100644
--- a/lib/gpu/Makefile.mingw32-cross-mpi
+++ b/lib/gpu/Makefile.mingw32-cross-mpi
@@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
           -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
           -I../../tools/mingw-cross/mpich2-win32/include/ \
           -DMPICH_IGNORE_CXX_SEEK
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
           -L../../tools/mingw-cross/mpich2-win32/lib -lmpi
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
diff --git a/lib/gpu/Makefile.mingw64-cross b/lib/gpu/Makefile.mingw64-cross
index 606b0309cb..54f6af8c65 100644
--- a/lib/gpu/Makefile.mingw64-cross
+++ b/lib/gpu/Makefile.mingw64-cross
@@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
 OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
           -msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
           -I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
           -L../../src/STUBS -lmpi_mingw64
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
diff --git a/lib/gpu/Makefile.mingw64-cross-mpi b/lib/gpu/Makefile.mingw64-cross-mpi
index cea8155efd..2ff72d98b1 100644
--- a/lib/gpu/Makefile.mingw64-cross-mpi
+++ b/lib/gpu/Makefile.mingw64-cross-mpi
@@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
           -I../../tools/mingw-cross/mpich2-win64/include/ \
           -DMPICH_IGNORE_CXX_SEEK
 
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
           -L../../tools/mingw-cross/mpich2-win64/lib -lmpi
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index 3b7781753c..2d2a751f85 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -35,7 +35,7 @@ namespace ucl_cudadr {
 // --------------------------------------------------------------------------
 // - COMMAND QUEUE STUFF
 // --------------------------------------------------------------------------
-typedef CUstream command_queue; 
+typedef CUstream command_queue;
 
 inline void ucl_sync(CUstream &stream) {
   CU_SAFE_CALL(cuStreamSynchronize(stream));
@@ -59,21 +59,21 @@ struct NVDProperties {
 
 /// Class for looking at device properties
 /** \note Calls to change the device outside of the class results in incorrect
- *        behavior 
+ *        behavior
  *  \note There is no error checking for indexing past the number of devices **/
 class UCL_Device {
  public:
   /// Collect properties for every GPU on the node
   /** \note You must set the active GPU with set() before using the device **/
   inline UCL_Device();
-  
+
   inline ~UCL_Device();
 
   /// Returns 1 (For compatibility with OpenCL)
   inline int num_platforms() { return 1; }
 
   /// Return a string with name and info of the current platform
-  inline std::string platform_name() 
+  inline std::string platform_name()
     { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
 
   /// Delete any contexts/data and set the platform number to be used
@@ -97,24 +97,24 @@ class UCL_Device {
 
   /// Returns the default stream for the current device
   inline command_queue & cq() { return cq(0); }
-  
+
   /// Returns the stream indexed by i
   inline command_queue & cq(const int i) { return _cq[i]; }
-  
+
   /// Block until all commands in the default stream have completed
   inline void sync() { sync(0); }
-  
+
   /// Block until all commands in the specified stream have completed
   inline void sync(const int i) { ucl_sync(cq(i)); }
-  
+
   /// Get the number of command queues currently available on device
-  inline int num_queues() 
+  inline int num_queues()
     { return _cq.size(); }
-  
+
   /// Add a stream for device computations
   inline void push_command_queue() {
-    _cq.push_back(CUstream()); 
-    CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0)); 
+    _cq.push_back(CUstream());
+    CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
   }
 
   /// Remove a stream for device computations
@@ -124,19 +124,19 @@ class UCL_Device {
     CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back()));
     _cq.pop_back();
   }
-  
+
   /// Set the default command queue (by default this is the null stream)
-  /** \param i index of the command queue (as added by push_command_queue()) 
+  /** \param i index of the command queue (as added by push_command_queue())
       If i is 0, the default command queue is set to the null stream **/
   inline void set_command_queue(const int i) {
     if (i==0) _cq[0]=0;
     else _cq[0]=_cq[i];
   }
-  
+
   /// Get the current CUDA device name
   inline std::string name() { return name(_device); }
   /// Get the CUDA device name
-  inline std::string name(const int i) 
+  inline std::string name(const int i)
     { return std::string(_properties[i].name); }
 
   /// Get a string telling the type of the current device
@@ -148,38 +148,38 @@ class UCL_Device {
   inline int device_type() { return device_type(_device); }
   /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
   inline int device_type(const int i) { return UCL_GPU; }
-  
+
   /// Returns true if host memory is efficiently addressable from device
   inline bool shared_memory() { return shared_memory(_device); }
   /// Returns true if host memory is efficiently addressable from device
   inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
-  
+
   /// Returns true if double precision is support for the current device
   inline bool double_precision() { return double_precision(_device); }
   /// Returns true if double precision is support for the device
   inline bool double_precision(const int i) {return arch(i)>=1.3;}
-  
+
   /// Get the number of compute units on the current device
   inline unsigned cus() { return cus(_device); }
   /// Get the number of compute units
-  inline unsigned cus(const int i) 
+  inline unsigned cus(const int i)
     { return _properties[i].multiProcessorCount; }
 
   /// Get the number of cores in the current device
   inline unsigned cores() { return cores(_device); }
   /// Get the number of cores
-  inline unsigned cores(const int i) 
-    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; 
+  inline unsigned cores(const int i)
+    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
       else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
       else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
       else return _properties[i].multiProcessorCount*192; }
-  
+
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }
   /// Get the gigabytes of global memory
-  inline double gigabytes(const int i) 
+  inline double gigabytes(const int i)
     { return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
-  
+
   /// Get the bytes of global memory in the current device
   inline size_t bytes() { return bytes(_device); }
   /// Get the bytes of global memory
@@ -188,13 +188,13 @@ class UCL_Device {
   // Get the gigabytes of free memory in the current device
   inline double free_gigabytes() { return free_gigabytes(_device); }
   // Get the gigabytes of free memory
-  inline double free_gigabytes(const int i) 
+  inline double free_gigabytes(const int i)
     { return static_cast<double>(free_bytes(i))/1073741824; }
-  
+
   // Get the bytes of free memory in the current device
   inline size_t free_bytes() { return free_bytes(_device); }
   // Get the bytes of free memory
-  inline size_t free_bytes(const int i) { 
+  inline size_t free_bytes(const int i) {
     CUDA_INT_TYPE dfree, dtotal;
     CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal));
     return static_cast<size_t>(dfree);
@@ -203,21 +203,21 @@ class UCL_Device {
   /// Return the GPGPU compute capability for current device
   inline double arch() { return arch(_device); }
   /// Return the GPGPU compute capability
-  inline double arch(const int i) 
+  inline double arch(const int i)
     { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
-  
+
   /// Clock rate in GHz for current device
   inline double clock_rate() { return clock_rate(_device); }
   /// Clock rate in GHz
-  inline double clock_rate(const int i) 
+  inline double clock_rate(const int i)
     { return _properties[i].p.clockRate*1e-6;}
-  
+
   /// Get the maximum number of threads per block
   inline size_t group_size() { return group_size(_device); }
   /// Get the maximum number of threads per block
-  inline size_t group_size(const int i) 
+  inline size_t group_size(const int i)
     { return _properties[i].p.maxThreadsPerBlock; }
-  
+
   /// Return the maximum memory pitch in bytes for current device
   inline size_t max_pitch() { return max_pitch(_device); }
   /// Return the maximum memory pitch in bytes
@@ -242,7 +242,7 @@ class UCL_Device {
     { return fission_by_counts(_device); }
   /// True if splitting device into subdevices by specified counts supported
   inline bool fission_by_counts(const int i)
-    { return false; } 
+    { return false; }
   /// True if splitting device into subdevices by affinity domains supported
   inline bool fission_by_affinity()
     { return fission_by_affinity(_device); }
@@ -259,7 +259,7 @@ class UCL_Device {
   /// List all devices along with all properties
   inline void print_all(std::ostream &out);
-  
+
  private:
   int _device, _num_devices;
   std::vector<NVDProperties> _properties;
@@ -279,16 +279,16 @@ UCL_Device::UCL_Device() {
     CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
     if (major==9999)
       continue;
-    
+
     _properties.push_back(NVDProperties());
     _properties.back().device_id=dev;
    _properties.back().major=major;
     _properties.back().minor=minor;
-    
+
     char namecstr[1024];
     CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
     _properties.back().name=namecstr;
-    
+
     CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
                                          CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@@ -296,23 +296,23 @@ UCL_Device::UCL_Device() {
     CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
 #if CUDA_VERSION >= 2020
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
-                          &_properties.back().kernelExecTimeoutEnabled, 
+                          &_properties.back().kernelExecTimeoutEnabled,
                           CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
                           &_properties.back().integrated,
                           CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
-                          &_properties.back().canMapHostMemory, 
+                          &_properties.back().canMapHostMemory,
                           CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
-    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode, 
+    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode,
                                          CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
 #endif
 #if CUDA_VERSION >= 3010
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
-                          &_properties.back().concurrentKernels, 
+                          &_properties.back().concurrentKernels,
                           CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
-                          &_properties.back().ECCEnabled, 
+                          &_properties.back().ECCEnabled,
                           CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
 #endif
   }
@@ -365,7 +365,7 @@ void UCL_Device::print_all(std::ostream &out) {
   cuDriverGetVersion(&driver_version);
   out << "CUDA Driver Version: "
       << driver_version/1000 << "." << driver_version%100
-      << std::endl; 
+      << std::endl;
 #endif
 
   if (num_devices() == 0)
diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h
index e0bfb1bb5e..d03a715e1b 100644
--- a/lib/gpu/geryon/nvd_kernel.h
+++ b/lib/gpu/geryon/nvd_kernel.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -35,15 +35,15 @@ template <class numtyp> class UCL_D_Mat;
 template <class hosttype, class devtype> class UCL_Vector;
 template <class hosttype, class devtype> class UCL_Matrix;
 #define UCL_MAX_KERNEL_ARGS 256
-  
+
 /// Class storing 1 or more kernel functions from a single string or file
 class UCL_Program {
  public:
   inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
-  inline UCL_Program(UCL_Device &device, const void *program, 
-                     const char *flags="", std::string *log=NULL) { 
+  inline UCL_Program(UCL_Device &device, const void *program,
+                     const char *flags="", std::string *log=NULL) {
     _cq=device.cq();
-    init(device); 
+    init(device);
     load_string(program,flags,log);
   }
 
@@ -61,20 +61,20 @@ class UCL_Program {
                        std::string *log=NULL) {
     std::ifstream in(filename);
     if (!in || in.is_open()==false) {
-      #ifndef UCL_NO_EXIT 
-      std::cerr << "UCL Error: Could not open kernel file: " 
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not open kernel file: "
                 << filename << std::endl;
       UCL_GERYON_EXIT;
       #endif
       return UCL_FILE_NOT_FOUND;
     }
-    
+
     std::string program((std::istreambuf_iterator<char>(in)),
                         std::istreambuf_iterator<char>());
     in.close();
     return load_string(program.c_str(),flags,log);
   }
-  
+
   /// Load a program from a string and compile with flags
   inline int load_string(const void *program, const char *flags="",
                          std::string *log=NULL) {
@@ -94,12 +94,12 @@ class UCL_Program {
     CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
                                     options,(void **)values);
-    
+
     if (log!=NULL)
       *log=std::string(clog);
-    
+
     if (err != CUDA_SUCCESS) {
-      #ifndef UCL_NO_EXIT 
+      #ifndef UCL_NO_EXIT
       std::cerr << std::endl
                 << "----------------------------------------------------------\n"
                 << " UCL Error: Error compiling PTX Program...\n"
@@ -108,24 +108,24 @@ class UCL_Program {
       #endif
       return UCL_COMPILE_ERROR;
     }
-    
+
     return UCL_SUCCESS;
-  } 
-  
+  }
+
   /// Load a precompiled program from a file
   inline int load_binary(const char *filename) {
     CUmodule _module;
     CUresult err = cuModuleLoad(&_module,filename);
     if (err==301) {
-      #ifndef UCL_NO_EXIT 
-      std::cerr << "UCL Error: Could not open binary kernel file: " 
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not open binary kernel file: "
                 << filename << std::endl;
       UCL_GERYON_EXIT;
       #endif
       return UCL_FILE_NOT_FOUND;
     } else if (err!=CUDA_SUCCESS) {
-      #ifndef UCL_NO_EXIT 
-      std::cerr << "UCL Error: Error loading binary kernel file: " 
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Error loading binary kernel file: "
                 << filename << std::endl;
       UCL_GERYON_EXIT;
       #endif
@@ -138,7 +138,7 @@ class UCL_Program {
     //  return UCL_ERROR;
     return UCL_SUCCESS;
   }
-  
+
   friend class UCL_Kernel;
  private:
   CUmodule _module;
@@ -149,23 +149,23 @@ class UCL_Program {
 /// Class for dealing with CUDA Driver kernels
 class UCL_Kernel {
  public:
-  UCL_Kernel() : _dimensions(1), _num_args(0) { 
+  UCL_Kernel() : _dimensions(1), _num_args(0) {
     #if CUDA_VERSION < 4000
     _param_size=0;
     #endif
-    _num_blocks[0]=0; 
+    _num_blocks[0]=0;
   }
-  
-  UCL_Kernel(UCL_Program &program, const char *function) : 
+
+  UCL_Kernel(UCL_Program &program, const char *function) :
     _dimensions(1), _num_args(0) {
     #if CUDA_VERSION < 4000
     _param_size=0;
     #endif
-    _num_blocks[0]=0; 
-    set_function(program,function); 
-    _cq=program._cq; 
+    _num_blocks[0]=0;
+    set_function(program,function);
+    _cq=program._cq;
   }
-  
+
   ~UCL_Kernel() {}
 
   /// Clear any function associated with the kernel
@@ -189,7 +189,7 @@ class UCL_Kernel {
   /// Set the kernel argument.
   /** If not a device pointer, this must be repeated each time the argument
-    * changes 
+    * changes
     * \note To set kernel parameter i (i>0), parameter i-1 must be set **/
   template <class dtype>
   inline void set_arg(const unsigned index, const dtype * const arg) {
@@ -202,27 +202,27 @@ class UCL_Kernel {
       CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
     #endif
     else
-      assert(0==1); // Must add kernel parameters in sequential order 
+      assert(0==1); // Must add kernel parameters in sequential order
   }
-  
+
   /// Set a geryon container as a kernel argument.
   template <class numtyp>
-  inline void set_arg(const UCL_D_Vec<numtyp> * const arg) 
+  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
     { set_arg(&arg->begin()); }
 
   /// Set a geryon container as a kernel argument.
   template <class numtyp>
-  inline void set_arg(const UCL_D_Mat<numtyp> * const arg) 
+  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
     { set_arg(&arg->begin()); }
 
   /// Set a geryon container as a kernel argument.
   template <class hosttype, class devtype>
-  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg) 
+  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
     { set_arg(&arg->device.begin()); }
 
   /// Set a geryon container as a kernel argument.
   template <class hosttype, class devtype>
-  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg) 
+  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
     { set_arg(&arg->device.begin()); }
 
   /// Add a kernel argument.
@@ -257,37 +257,37 @@ class UCL_Kernel {
 
   /// Add a geryon container as a kernel argument.
   template <class numtyp>
-  inline void add_arg(const UCL_D_Vec<numtyp> * const arg) 
+  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
     { add_arg(&arg->begin()); }
 
   /// Add a geryon container as a kernel argument.
   template <class numtyp>
-  inline void add_arg(const UCL_D_Mat<numtyp> * const arg) 
+  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
     { add_arg(&arg->begin()); }
 
   /// Add a geryon container as a kernel argument.
   template <class hosttype, class devtype>
-  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg) 
+  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }
 
   /// Add a geryon container as a kernel argument.
   template <class hosttype, class devtype>
-  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg) 
+  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
     { add_arg(&arg->device.begin()); }
 
   /// Set the number of thread blocks and the number of threads in each block
   /** \note This should be called before any arguments have been added
       \note The default command queue is used for the kernel execution **/
-  inline void set_size(const size_t num_blocks, const size_t block_size) { 
-    _dimensions=1; 
-    _num_blocks[0]=num_blocks; 
+  inline void set_size(const size_t num_blocks, const size_t block_size) {
+    _dimensions=1;
+    _num_blocks[0]=num_blocks;
     _num_blocks[1]=1;
     _num_blocks[2]=1;
     #if CUDA_VERSION >= 4000
     _block_size[0]=block_size;
     _block_size[1]=1;
     _block_size[2]=1;
-    #else 
+    #else
     CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
     #endif
   }
@@ -303,43 +303,43 @@ class UCL_Kernel {
   /** \note This should be called before any arguments have been added
       \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
-                       const size_t block_size_x, const size_t block_size_y) { 
-    _dimensions=2; 
-    _num_blocks[0]=num_blocks_x; 
-    _num_blocks[1]=num_blocks_y; 
+                       const size_t block_size_x, const size_t block_size_y) {
+    _dimensions=2;
+    _num_blocks[0]=num_blocks_x;
+    _num_blocks[1]=num_blocks_y;
     _num_blocks[2]=1;
     #if CUDA_VERSION >= 4000
     _block_size[0]=block_size_x;
     _block_size[1]=block_size_y;
     _block_size[2]=1;
-    #else 
+    #else
     CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
     #endif
   }
-  
+
   /// Set the number of thread blocks and the number of threads in each block
   /** \note This should be called before any arguments have been added
       \note The default command queue for the kernel is changed to cq **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                        const size_t block_size_x, const size_t block_size_y,
-                       command_queue &cq) 
+                       command_queue &cq)
     {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
 
   /// Set the number of thread blocks and the number of threads in each block
   /** \note This should be called before any arguments have been added
       \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
-                       const size_t block_size_x, 
+                       const size_t block_size_x,
                        const size_t block_size_y, const size_t block_size_z) {
-    _dimensions=2; 
-    _num_blocks[0]=num_blocks_x; 
-    _num_blocks[1]=num_blocks_y; 
-    _num_blocks[2]=1; 
+    _dimensions=2;
+    _num_blocks[0]=num_blocks_x;
+    _num_blocks[1]=num_blocks_y;
+    _num_blocks[2]=1;
     #if CUDA_VERSION >= 4000
     _block_size[0]=block_size_x;
     _block_size[1]=block_size_y;
     _block_size[2]=block_size_z;
-    #else 
+    #else
     CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
                                      block_size_z));
     #endif
@@ -352,10 +352,10 @@ class UCL_Kernel {
                        const size_t block_size_x, const size_t block_size_y,
                        const size_t block_size_z, command_queue &cq) {
     _cq=cq;
-    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, 
+    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
              block_size_z);
   }
-  
+
   /// Run the kernel in the default command queue
   inline void run() {
     #if CUDA_VERSION >= 4000
@@ -367,12 +367,12 @@ class UCL_Kernel {
     CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
     #endif
   }
-  
+
   /// Clear any arguments associated with the kernel
-  inline void clear_args() { 
-    _num_args=0; 
+  inline void clear_args() {
+    _num_args=0;
     #if CUDA_VERSION < 4000
-    _offsets.clear(); 
+    _offsets.clear();
     _param_size=0;
     #endif
   }
@@ -390,7 +390,7 @@ class UCL_Kernel {
   unsigned _num_blocks[3];
   unsigned _num_args;
   friend class UCL_Texture;
-  
+
   #if CUDA_VERSION >= 4000
   unsigned _block_size[3];
   void * _kernel_args[UCL_MAX_KERNEL_ARGS];
diff --git a/lib/gpu/geryon/nvd_mat.h b/lib/gpu/geryon/nvd_mat.h
index 51cfe1d56f..042e2978c3 100644
--- a/lib/gpu/geryon/nvd_mat.h
+++ b/lib/gpu/geryon/nvd_mat.h
@@ -17,12 +17,12 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
 
 /*! \file */
-  
+
 #ifndef NVD_MAT_H
 #define NVD_MAT_H
 
@@ -52,6 +52,6 @@ namespace ucl_cudadr {
 #include "ucl_print.h"
 #undef UCL_PRINT_ALLOW
 
-} // namespace ucl_cudadr 
+} // namespace ucl_cudadr
 
 #endif
diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h
index 5f7b98ba5c..0484e33de6 100644
--- a/lib/gpu/geryon/nvd_memory.h
+++ b/lib/gpu/geryon/nvd_memory.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -46,7 +46,7 @@ typedef CUdeviceptr device_ptr;
 // --------------------------------------------------------------------------
 // - HOST MEMORY ALLOCATION ROUTINES
 // --------------------------------------------------------------------------
 template <class mat_type, class copy_type>
-inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, 
+inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
                        const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   CUresult err=CUDA_SUCCESS;
   if (kind==UCL_NOT_PINNED)
@@ -62,7 +62,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
 template <class mat_type>
-inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, 
+inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                        const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   CUresult err=CUDA_SUCCESS;
   if (kind==UCL_NOT_PINNED)
@@ -95,7 +95,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
     *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
   else if (mat.kind()==UCL_WRITE_ONLY)
     err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
-  else 
+  else
     err=cuMemAllocHost((void **)mat.host_ptr(),n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
@@ -130,30 +130,30 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                          const size_t cols, size_t &pitch,
                          const enum UCL_MEMOPT kind) {
   CUresult err;
-  CUDA_INT_TYPE upitch; 
+  CUDA_INT_TYPE upitch;
   err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                       cols*sizeof(typename mat_type::data_type),rows,16);
-  pitch=static_cast<size_t>(upitch); 
+  pitch=static_cast<size_t>(upitch);
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
   mat.cq()=cm.cq();
   return UCL_SUCCESS;
-} 
+}
 
 template <class mat_type>
 inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
                          const size_t cols, size_t &pitch,
                          const enum UCL_MEMOPT kind) {
   CUresult err;
-  unsigned upitch; 
+  unsigned upitch;
   err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                       cols*sizeof(typename mat_type::data_type),rows,16);
-  pitch=static_cast<size_t>(upitch); 
+  pitch=static_cast<size_t>(upitch);
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
   mat.cq()=d.cq();
   return UCL_SUCCESS;
-} 
+}
 
 template <class mat_type>
 inline void _device_free(mat_type &mat) {
@@ -175,33 +175,33 @@ inline int _device_resize(mat_type &mat, const size_t rows,
                           const size_t cols, size_t &pitch) {
   _device_free(mat);
   CUresult err;
-  CUDA_INT_TYPE upitch; 
+  CUDA_INT_TYPE upitch;
   err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                       cols*sizeof(typename mat_type::data_type),rows,16);
-  pitch=static_cast<size_t>(upitch); 
+  pitch=static_cast<size_t>(upitch);
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
   return UCL_SUCCESS;
-} 
+}
 
-inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) { 
+inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
   *ptr=in;
 }
 
 template <class numtyp>
-inline void _device_view(CUdeviceptr *ptr, numtyp *in) { 
-  *ptr=0; 
+inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
+  *ptr=0;
 }
 
-inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in, 
-                         const size_t offset, const size_t numsize) { 
+inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
+                         const size_t offset, const size_t numsize) {
   *ptr=in+offset*numsize;
 }
 
 template <class numtyp>
 inline void _device_view(CUdeviceptr *ptr, numtyp *in,
-                         const size_t offset, const size_t numsize) { 
-  *ptr=0; 
+                         const size_t offset, const size_t numsize) {
+  *ptr=0;
 }
 
 // --------------------------------------------------------------------------
@@ -211,13 +211,13 @@ template <class mat_type, class copy_type>
 inline void _device_image_alloc(mat_type &mat, copy_type &cm,
                                 const size_t rows, const size_t cols) {
   assert(0==1);
-} 
+}
 
 template <class mat_type>
 inline void _device_image_alloc(mat_type &mat, UCL_Device &d,
                                 const size_t rows, const size_t cols) {
   assert(0==1);
-} 
+}
 
 template <class mat_type>
 inline void _device_image_free(mat_type &mat) {
@@ -245,7 +245,7 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
 // --------------------------------------------------------------------------
 // - HELPER FUNCTIONS FOR MEMCPY ROUTINES
 // --------------------------------------------------------------------------
-inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch, 
+inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
                             const size_t spitch, const size_t cols,
                             const size_t rows) {
   ins.srcXInBytes=0;
@@ -257,13 +257,13 @@ inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
   ins.WidthInBytes=cols;
   ins.Height=rows;
 }
-  
+
 template <int mem> struct _nvd_set_2D_mem;
-template <> struct _nvd_set_2D_mem<1> 
+template <> struct _nvd_set_2D_mem<1>
   { static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
-template <> struct _nvd_set_2D_mem<2> 
+template <> struct _nvd_set_2D_mem<2>
   { static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
-template <int mem> struct _nvd_set_2D_mem 
+template <int mem> struct _nvd_set_2D_mem
   { static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };
 
@@ -285,7 +285,7 @@ template<> struct _ucl_memcpy<2,2> {
     assert(0==1);
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -297,7 +297,7 @@ template<> struct _ucl_memcpy<2,2> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -322,7 +322,7 @@ template<> struct _ucl_memcpy<2,0> {
     assert(0==1); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -334,7 +334,7 @@ template<> struct _ucl_memcpy<2,0> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -359,7 +359,7 @@ template<> struct _ucl_memcpy<2,1> {
     assert(0==1); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -371,7 +371,7 @@ template<> struct _ucl_memcpy<2,1> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -396,7 +396,7 @@ template<> struct _ucl_memcpy<0,2> {
     assert(0==1); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -408,7 +408,7 @@ template<> struct _ucl_memcpy<0,2> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -433,7 +433,7 @@ template<> struct _ucl_memcpy<1,2> {
     assert(0==1); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -445,7 +445,7 @@ template<> struct _ucl_memcpy<1,2> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -470,7 +470,7 @@ template <> struct _ucl_memcpy<1,0> {
     CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -482,7 +482,7 @@ template <> struct _ucl_memcpy<1,0> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -507,7 +507,7 @@ template <> struct _ucl_memcpy<0,1> {
     CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -519,7 +519,7 @@ template <> struct _ucl_memcpy<0,1> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -542,7 +542,7 @@ template <> struct _ucl_memcpy<1,1> {
                         CUstream &cq)
     { memcpy(dst.begin(),src.begin(),n); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -554,7 +554,7 @@ template <> struct _ucl_memcpy<1,1> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -579,18 +579,18 @@ template <int mem1, int mem2> struct _ucl_memcpy {
     CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     if (p1::PADDED==0 || p2::PADDED==0) {
       size_t src_offset=0, dst_offset=0;
-      for (size_t i=0; i<rows; i++) { 
+      for (size_t i=0; i<rows; i++) {
       ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
@@ -601,12 +601,12 @@ template <int mem1, int mem2> struct _ucl_memcpy {
     }
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     if (p1::PADDED==0 || p2::PADDED==0) {
       size_t src_offset=0, dst_offset=0;
-      for (size_t i=0; i<rows; i++) { 
+      for (size_t i=0; i<rows; i++) {
 template <class mat1, class mat2>
-inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, 
-                       const size_t spitch, const size_t cols, 
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+                       const size_t spitch, const size_t cols,
                        const size_t rows) {
   _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                  rows);
 }
 template <class mat1, class mat2>
-inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, 
-                       const size_t spitch, const size_t cols, 
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+                       const size_t spitch, const size_t cols,
                        const size_t rows,CUstream &cq) {
   _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                  rows,cq);
 }
 
-} // namespace ucl_cudart 
+} // namespace ucl_cudart
 
 #endif
diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h
index 07650263a5..965595a448 100644
--- a/lib/gpu/geryon/nvd_texture.h
+++ b/lib/gpu/geryon/nvd_texture.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -28,7 +28,7 @@
 #include "nvd_mat.h"
 
 namespace ucl_cudadr {
-  
+
 /// Class storing a texture reference
 class UCL_Texture {
  public:
@@ -38,39 +38,39 @@ class UCL_Texture {
   inline UCL_Texture(UCL_Program &prog, const char *texture_name)
     { get_texture(prog,texture_name); }
   /// Set the texture reference for this object
-  inline void get_texture(UCL_Program &prog, const char *texture_name) 
+  inline void get_texture(UCL_Program &prog, const char *texture_name)
     { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
 
   /// Bind a float array where each fetch grabs a vector of length numel
   template <class numtyp>
-  inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel) 
+  inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
     { _bind_float(vec,numel); }
 
   /// Bind a float array where each fetch grabs a vector of length numel
   template <class numtyp>
-  inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel) 
+  inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
     { _bind_float(vec,numel); }
 
   /// Bind a float array where each fetch grabs a vector of length numel
   template <class hosttype, class devtype>
-  inline void bind_float(UCL_Vector<hosttype, devtype> &vec, const unsigned numel) 
+  inline void bind_float(UCL_Vector<hosttype, devtype> &vec, const unsigned numel)
     { _bind_float(vec.device,numel); }
 
   /// Bind a float array where each fetch grabs a vector of length numel
   template <class hosttype, class devtype>
-  inline void bind_float(UCL_Matrix<hosttype, devtype> &vec, const unsigned numel) 
+  inline void bind_float(UCL_Matrix<hosttype, devtype> &vec, const unsigned numel)
     { _bind_float(vec.device,numel); }
 
   /// Unbind the texture reference from the memory allocation
   inline void unbind() { }
 
-  /// Make a texture reference available to kernel 
-  inline void allow(UCL_Kernel &kernel) { 
+  /// Make a texture reference available to kernel
+  inline void allow(UCL_Kernel &kernel) {
     #if CUDA_VERSION < 4000
-    CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex)); 
+    CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
     #endif
   }
-  
+
  private:
   CUtexref _tex;
   friend class UCL_Kernel;
@@ -80,7 +80,7 @@ class UCL_Texture {
     #ifdef UCL_DEBUG
     assert(numel!=0 && numel<5);
     #endif
-    CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(), 
+    CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
                                     vec.numel()*vec.element_size()));
     if (vec.element_size()==sizeof(float))
       CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
diff --git a/lib/gpu/geryon/nvd_timer.h b/lib/gpu/geryon/nvd_timer.h
index 4c3e993e0d..aefbaea0c3 100644
--- a/lib/gpu/geryon/nvd_timer.h
+++ b/lib/gpu/geryon/nvd_timer.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -41,7 +41,7 @@ class UCL_Timer {
   /// Clear any data associated with timer
   /** \note init() must be called to reuse timer after a clear() **/
   inline void clear() {
-    if (_initialized) { 
+    if (_initialized) {
       CU_DESTRUCT_CALL(cuEventDestroy(start_event));
       CU_DESTRUCT_CALL(cuEventDestroy(stop_event));
       _initialized=false;
@@ -63,16 +63,16 @@ class UCL_Timer {
 
   /// Start timing on command queue
   inline void start() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); }
-  
+
   /// Stop timing on command queue
   inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
-  
+
   /// Block until the start event has been reached on device
-  inline void sync_start() 
+  inline void sync_start()
     { CU_SAFE_CALL(cuEventSynchronize(start_event)); }
 
   /// Block until the stop event has been reached on device
-  inline void sync_stop() 
+  inline void sync_stop()
     { CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
 
   /// Set the time elapsed to zero (not the total_time)
@@ -80,29 +80,29 @@ class UCL_Timer {
     CU_SAFE_CALL(cuEventRecord(start_event,_cq));
     CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
   }
-  
+
   /// Set the total time to zero
   inline void zero_total() { _total_time=0.0; }
-  
+
   /// Add time from previous start and stop to total
   /** Forces synchronization **/
-  inline double add_to_total() 
+  inline double add_to_total()
     { double t=time(); _total_time+=t; return t/1000.0; }
-  
+
   /// Add a user specified time to the total (ms)
   inline void add_time_to_total(const double t) { _total_time+=t; }
-  
+
   /// Return the time (ms) of last start to stop - Forces synchronization
-  inline double time() { 
+  inline double time() {
     float timer;
     CU_SAFE_CALL(cuEventSynchronize(stop_event));
     CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) );
-    return timer; 
+    return timer;
   }
-  
+
   /// Return the time (s) of last start to stop - Forces synchronization
   inline double seconds() { return time()/1000.0; }
-  
+
   /// Return the total time in ms
   inline double total_time() { return _total_time; }
diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index 8dadcf2efd..20656c8489 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
----------------------------------------------------------------------- */ @@ -40,13 +40,13 @@ #include "ucl_types.h" namespace ucl_opencl { - + // -------------------------------------------------------------------------- // - COMMAND QUEUE STUFF // -------------------------------------------------------------------------- -typedef cl_command_queue command_queue; +typedef cl_command_queue command_queue; typedef cl_context context_type; - + inline void ucl_sync(cl_command_queue &cq) { CL_SAFE_CALL(clFinish(cq)); } @@ -76,19 +76,19 @@ struct OCLProperties { /// Class for looking at data parallel device properties /** \note Calls to change the device outside of the class results in incorrect - * behavior + * behavior * \note There is no error checking for indexing past the number of devices **/ class UCL_Device { public: /// Collect properties for every device on the node /** \note You must set the active GPU with set() before using the device **/ inline UCL_Device(); - + inline ~UCL_Device(); /// Return the number of platforms (0 if error or no platforms) inline int num_platforms() { return _num_platforms; } - + /// Return a string with name and info of the current platform inline std::string platform_name(); @@ -104,38 +104,38 @@ class UCL_Device { * be allocated for use. clear() is called to delete any contexts and * associated data from previous calls to set(). **/ inline int set(int num); - + /// Delete any context and associated data stored from a call to set() inline void clear(); /// Get the current device number inline int device_num() { return _device; } - + /// Returns the context for the current device inline cl_context & context() { return _context; } - + /// Returns the default stream for the current device inline command_queue & cq() { return cq(_default_cq); } - + /// Returns the stream indexed by i inline command_queue & cq(const int i) { return _cq[i]; } - + /// Set the default command queue - /** \param i index of the command queue (as added by push_command_queue()) + /** \param i index of the command queue (as added by push_command_queue()) If i is 0, the command queue created with device initialization is used **/ inline void set_command_queue(const int i) { _default_cq=i; } - + /// Block until all commands in the default stream have completed inline void sync() { sync(_default_cq); } - + /// Block until all commands in the specified stream have completed inline void sync(const int i) { ucl_sync(cq(i)); } - + /// Get the number of command queues currently available on device - inline int num_queues() + inline int num_queues() { return _cq.size(); } - + /// Add a command queue for device computations (with profiling enabled) inline void push_command_queue() { cl_int errorv; @@ -143,7 +143,7 @@ class UCL_Device { _cq.back()=clCreateCommandQueue(_context,_cl_device, CL_QUEUE_PROFILING_ENABLE,&errorv); if (errorv!=CL_SUCCESS) { - std::cerr << "Could not create command queue on device: " << name() + std::cerr << "Could not create command queue on device: " << name() << std::endl; UCL_GERYON_EXIT; } @@ -160,76 +160,76 @@ class UCL_Device { /// Get the current OpenCL device name inline std::string name() { return name(_device); } /// Get the OpenCL device name - inline std::string name(const int i) + inline std::string name(const int i) { return std::string(_properties[i].name); } /// Get a string telling the type of the current device inline std::string device_type_name() { return device_type_name(_device); } /// Get a string telling the type of the device inline std::string 
device_type_name(const int i); - + /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type(const int i); - + /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device - inline bool shared_memory(const int i) + inline bool shared_memory(const int i) { return _shared_mem_device(_properties[i].device_type); } - + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device - inline bool double_precision(const int i) + inline bool double_precision(const int i) {return _properties[i].double_precision;} - + /// Get the number of compute units on the current device inline unsigned cus() { return cus(_device); } /// Get the number of compute units - inline unsigned cus(const int i) + inline unsigned cus(const int i) { return _properties[i].compute_units; } /// Get the gigabytes of global memory in the current device inline double gigabytes() { return gigabytes(_device); } /// Get the gigabytes of global memory - inline double gigabytes(const int i) + inline double gigabytes(const int i) { return static_cast(_properties[i].global_mem)/1073741824; } /// Get the bytes of global memory in the current device inline size_t bytes() { return bytes(_device); } /// Get the bytes of global memory inline size_t bytes(const int i) { return _properties[i].global_mem; } - + /// Return the GPGPU revision number for current device //inline double revision() { return revision(_device); } /// Return the GPGPU revision number - //inline double revision(const int i) + //inline double revision(const int i) // { return //static_cast(_properties[i].minor)/10+_properties[i].major;} - + /// Clock rate in GHz for current device inline double clock_rate() { return clock_rate(_device); } /// Clock rate in GHz inline double clock_rate(const int i) { return _properties[i].clock*1e-3;} - + /// Return the address alignment in bytes inline int alignment() { return alignment(_device); } /// Return the address alignment in bytes inline int alignment(const int i) { return _properties[i].alignment; } - + /// Return the timer resolution inline size_t timer_resolution() { return timer_resolution(_device); } /// Return the timer resolution - inline size_t timer_resolution(const int i) + inline size_t timer_resolution(const int i) { return _properties[i].timer_resolution; } - + /// Get the maximum number of threads per block inline size_t group_size() { return group_size(_device); } /// Get the maximum number of threads per block - inline size_t group_size(const int i) + inline size_t group_size(const int i) { return _properties[i].work_group_size; } - + /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes @@ -254,7 +254,7 @@ class UCL_Device { { return fission_by_counts(_device); } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts(const int i) - { return _properties[i].partition_counts; } + { return _properties[i].partition_counts; } /// True if splitting device into subdevices by affinity domains supported inline bool 
fission_by_affinity() { return fission_by_affinity(_device); } @@ -271,10 +271,10 @@ class UCL_Device { /// List all devices along with all properties inline void print_all(std::ostream &out); - + /// Return the OpenCL type for the device inline cl_device_id & cl_device() { return _cl_device; } - + private: int _num_platforms; // Number of platforms int _platform; // UCL_Device ID for current platform @@ -287,7 +287,7 @@ class UCL_Device { std::vector _cl_devices; // OpenCL IDs for all devices int _num_devices; // Number of devices std::vector _properties; // Properties for each device - + inline void add_properties(cl_device_id); inline int create_context(); int _default_cq; @@ -300,7 +300,7 @@ UCL_Device::UCL_Device() { // --- Get Number of Platforms cl_uint nplatforms; cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms); - + if (errorv!=CL_SUCCESS) { _num_platforms=0; return; @@ -328,18 +328,18 @@ void UCL_Device::clear() { int UCL_Device::set_platform(int pid) { clear(); cl_int errorv; - + _cl_device=0; _device=-1; _num_devices=0; _default_cq=0; - + #ifdef UCL_DEBUG assert(pid namespace ucl_opencl { - + class UCL_Texture; template class UCL_D_Vec; template class UCL_D_Mat; @@ -41,10 +41,10 @@ class UCL_Program { public: inline UCL_Program() : _init_done(false) {} inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); } - inline UCL_Program(UCL_Device &device, const void *program, - const char *flags="", std::string *log=NULL) : - _init_done(false) { - init(device); + inline UCL_Program(UCL_Device &device, const void *program, + const char *flags="", std::string *log=NULL) : + _init_done(false) { + init(device); load_string(program,flags,log); } @@ -56,7 +56,7 @@ class UCL_Program { _device=device.cl_device(); _context=device.context(); _cq=device.cq(); - CL_SAFE_CALL(clRetainContext(_context)); + CL_SAFE_CALL(clRetainContext(_context)); CL_SAFE_CALL(clRetainCommandQueue(_cq)); _init_done=true; } @@ -65,7 +65,7 @@ class UCL_Program { /** \note Must call init() after each clear **/ inline void clear() { if (_init_done) { - CL_DESTRUCT_CALL(clReleaseProgram(_program)); + CL_DESTRUCT_CALL(clReleaseProgram(_program)); CL_DESTRUCT_CALL(clReleaseContext(_context)); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); _init_done=false; @@ -77,20 +77,20 @@ class UCL_Program { std::string *log=NULL) { std::ifstream in(filename); if (!in || in.is_open()==false) { - #ifndef UCL_NO_EXIT - std::cerr << "UCL Error: Could not open kernel file: " + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not open kernel file: " << filename << std::endl; UCL_GERYON_EXIT; #endif return UCL_FILE_NOT_FOUND; } - + std::string program((std::istreambuf_iterator(in)), std::istreambuf_iterator()); in.close(); return load_string(program.c_str(),flags,log); } - + /// Load a program from a string and compile with flags inline int load_string(const void *program, const char *flags="", std::string *log=NULL) { @@ -103,23 +103,23 @@ class UCL_Program { CL_CHECK_ERR(error_flag); cl_build_status build_status; CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device, - CL_PROGRAM_BUILD_STATUS, + CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status),&build_status, NULL)); - + if (build_status != CL_SUCCESS || log!=NULL) { size_t ms; - CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, NULL, &ms)); - char build_log[ms]; + char build_log[ms]; 
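// Portability note: "char build_log[ms]" above declares a variable-length
// array, a GCC/Clang extension rather than standard C++; a heap buffer of
// size ms (e.g. std::vector<char>) would be the portable equivalent.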
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms, build_log, NULL)); - + if (log!=NULL) *log=std::string(build_log); - + if (build_status != CL_SUCCESS) { - #ifndef UCL_NO_EXIT + #ifndef UCL_NO_EXIT std::cerr << std::endl << "----------------------------------------------------------\n" << " UCL Error: Error compiling OpenCL Program (" @@ -130,10 +130,10 @@ class UCL_Program { return UCL_COMPILE_ERROR; } } - + return UCL_SUCCESS; } - + /// Return the default command queue/stream associated with this data inline command_queue & cq() { return _cq; } /// Change the default command queue associated with matrix @@ -143,7 +143,7 @@ class UCL_Program { private: bool _init_done; cl_program _program; - cl_device_id _device; + cl_device_id _device; cl_context _context; cl_command_queue _cq; }; @@ -153,7 +153,7 @@ class UCL_Kernel { public: UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0) { _block_size[0]=0; _num_blocks[0]=0; } - + inline UCL_Kernel(UCL_Program &program, const char *function) : _dimensions(1), _function_set(false), _num_args(0) { _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); } @@ -178,48 +178,48 @@ class UCL_Kernel { /** If not a device pointer, this must be repeated each time the argument * changes **/ template - inline void set_arg(const cl_uint index, const dtype * const arg) { - CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); + inline void set_arg(const cl_uint index, const dtype * const arg) { + CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); if (index>_num_args) { _num_args=index; #ifdef UCL_DEBUG if (_num_args>_kernel_info_nargs) { - std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " + std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " << _kernel_info_name << std::endl; assert(0==1); } #endif } } - + /// Set a geryon container as a kernel argument. template - inline void set_arg(const UCL_D_Vec * const arg) + inline void set_arg(const UCL_D_Vec * const arg) { set_arg(&arg->begin()); } /// Set a geryon container as a kernel argument. template - inline void set_arg(const UCL_D_Mat * const arg) + inline void set_arg(const UCL_D_Mat * const arg) { set_arg(&arg->begin()); } /// Set a geryon container as a kernel argument. template - inline void set_arg(const UCL_Vector * const arg) + inline void set_arg(const UCL_Vector * const arg) { set_arg(&arg->device.begin()); } /// Set a geryon container as a kernel argument. template - inline void set_arg(const UCL_Matrix * const arg) + inline void set_arg(const UCL_Matrix * const arg) { set_arg(&arg->device.begin()); } /// Add a kernel argument. template inline void add_arg(const dtype * const arg) { - CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); - _num_args++; + CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); + _num_args++; #ifdef UCL_DEBUG if (_num_args>_kernel_info_nargs) { - std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " + std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " << _kernel_info_name << std::endl; assert(0==1); } @@ -228,31 +228,31 @@ class UCL_Kernel { /// Add a geryon container as a kernel argument. template - inline void add_arg(const UCL_D_Vec * const arg) + inline void add_arg(const UCL_D_Vec * const arg) { add_arg(&arg->begin()); } /// Add a geryon container as a kernel argument. template - inline void add_arg(const UCL_D_Mat * const arg) + inline void add_arg(const UCL_D_Mat * const arg) { add_arg(&arg->begin()); } /// Add a geryon container as a kernel argument. 
template - inline void add_arg(const UCL_Vector * const arg) + inline void add_arg(const UCL_Vector * const arg) { add_arg(&arg->device.begin()); } /// Add a geryon container as a kernel argument. template - inline void add_arg(const UCL_Matrix * const arg) + inline void add_arg(const UCL_Matrix * const arg) { add_arg(&arg->device.begin()); } /// Set the number of thread blocks and the number of threads in each block /** \note This should be called before any arguments have been added \note The default command queue is used for the kernel execution **/ - inline void set_size(const size_t num_blocks, const size_t block_size) { - _dimensions=1; - _num_blocks[0]=num_blocks*block_size; - _block_size[0]=block_size; + inline void set_size(const size_t num_blocks, const size_t block_size) { + _dimensions=1; + _num_blocks[0]=num_blocks*block_size; + _block_size[0]=block_size; } /// Set the number of thread blocks and the number of threads in each block @@ -266,36 +266,36 @@ class UCL_Kernel { /** \note This should be called before any arguments have been added \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, - const size_t block_size_x, const size_t block_size_y) { - _dimensions=2; - _num_blocks[0]=num_blocks_x*block_size_x; - _block_size[0]=block_size_x; - _num_blocks[1]=num_blocks_y*block_size_y; - _block_size[1]=block_size_y; + const size_t block_size_x, const size_t block_size_y) { + _dimensions=2; + _num_blocks[0]=num_blocks_x*block_size_x; + _block_size[0]=block_size_x; + _num_blocks[1]=num_blocks_y*block_size_y; + _block_size[1]=block_size_y; } - + /// Set the number of thread blocks and the number of threads in each block /** \note This should be called before any arguments have been added \note The default command queue for the kernel is changed to cq **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, const size_t block_size_x, const size_t block_size_y, - command_queue &cq) + command_queue &cq) {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);} /// Set the number of thread blocks and the number of threads in each block /** \note This should be called before any arguments have been added \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, - const size_t block_size_x, + const size_t block_size_x, const size_t block_size_y, const size_t block_size_z) { - _dimensions=3; + _dimensions=3; const size_t num_blocks_z=1; - _num_blocks[0]=num_blocks_x*block_size_x; - _block_size[0]=block_size_x; - _num_blocks[1]=num_blocks_y*block_size_y; - _block_size[1]=block_size_y; - _num_blocks[2]=num_blocks_z*block_size_z; - _block_size[2]=block_size_z; + _num_blocks[0]=num_blocks_x*block_size_x; + _block_size[0]=block_size_x; + _num_blocks[1]=num_blocks_y*block_size_y; + _block_size[1]=block_size_y; + _num_blocks[2]=num_blocks_z*block_size_z; + _block_size[2]=block_size_z; } /// Set the number of thread blocks and the number of threads in each block @@ -305,13 +305,13 @@ class UCL_Kernel { const size_t block_size_x, const size_t block_size_y, const size_t block_size_z, command_queue &cq) { _cq=cq; - set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, + set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, block_size_z); } - + /// Run the kernel in the default command queue inline void run(); - + /// Clear any arguments associated with the kernel 
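/// (Typical use is to reset the argument list before a relaunch with new
///  values; for an illustrative kernel object k with hypothetical
///  arguments d_x and n: k.clear_args(); k.add_arg(&d_x); k.add_arg(&n);
///  k.run(); )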
inline void clear_args() { _num_args=0; } @@ -320,7 +320,7 @@ /// Change the default command queue associated with matrix inline void cq(command_queue &cq_in) { _cq=cq_in; } #include "ucl_arg_kludge.h" - + private: cl_kernel _kernel; cl_program _program; @@ -328,7 +328,7 @@ size_t _block_size[3]; size_t _num_blocks[3]; bool _function_set; - + cl_command_queue _cq; // The default command queue for this kernel unsigned _num_args; @@ -348,7 +348,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) CL_SAFE_CALL(clRetainProgram(_program)); cl_int error_flag; _kernel=clCreateKernel(program._program,function,&error_flag); - + if (error_flag!=CL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not find function: " << function @@ -357,7 +357,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) #endif return UCL_FUNCTION_NOT_FOUND; } - + #ifdef UCL_DEBUG _kernel_info_name=function; cl_uint nargs; @@ -375,7 +375,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) #endif #endif - return UCL_SUCCESS; + return UCL_SUCCESS; } void UCL_Kernel::run() { diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h index 2909d72a72..3135594dc3 100644 --- a/lib/gpu/geryon/ocl_mat.h +++ b/lib/gpu/geryon/ocl_mat.h @@ -17,12 +17,12 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ /*! \file */ - + #ifndef OCL_MAT_H #define OCL_MAT_H @@ -54,6 +54,6 @@ namespace ucl_opencl { #include "ucl_print.h" #undef UCL_PRINT_ALLOW -} // namespace ucl_cudart +} // namespace ucl_opencl #endif diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index 7aed0a1a8c..28bb88941f 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */ @@ -36,10 +36,10 @@ namespace ucl_opencl { // -------------------------------------------------------------------------- struct ocl_kernel_dim { size_t x,y,z; - ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) : + ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) : x(_x), y(_y), z(_z) {} operator size_t * () { return (size_t *)this; } - operator const size_t * () const { return (const size_t *)this; } + operator const size_t * () const { return (const size_t *)this; } }; typedef ocl_kernel_dim ucl_kernel_dim; @@ -53,13 +53,13 @@ typedef cl_mem device_ptr; // -------------------------------------------------------------------------- template -inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, +inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ cl_int error_flag; cl_context context; CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), &context,NULL)); - + cl_mem_flags buffer_perm; cl_map_flags map_perm; if (kind2==UCL_NOT_SPECIFIED) { @@ -88,7 +88,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; else buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; - + if (kind==UCL_READ_ONLY) { #ifdef CL_VERSION_1_2 buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY; @@ -102,9 +102,9 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, } else map_perm=CL_MAP_READ | CL_MAP_WRITE; } - + mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; *mat.host_ptr() = (typename mat_type::data_type*) clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, @@ -125,7 +125,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags), &orig_flags,NULL)); orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR; - + mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n, *mat.host_ptr(), &error_flag); @@ -135,7 +135,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { } template -inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, +inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ cl_mem_flags buffer_perm; cl_map_flags map_perm; @@ -160,7 +160,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, cl_int error_flag; mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; *mat.host_ptr() = (typename mat_type::data_type*) @@ -210,7 +210,7 @@ inline int _host_resize(mat_type &mat, const size_t n) { map_perm=CL_MAP_READ | CL_MAP_WRITE; mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; *mat.host_ptr() = (typename mat_type::data_type*) clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, @@ -248,7 +248,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n, else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; mat.cq()=cm.cq(); 
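// The new allocation adopts the source container's command queue; the
// retain call below bumps the queue's OpenCL reference count so the
// handle stays valid for as long as this matrix holds it.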
CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -278,7 +278,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n, assert(0==1); mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL, &error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; mat.cq()=dev.cq(); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -304,7 +304,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows, if (dev.device_type()!=UCL_CPU && cols%256!=0) padded_cols+=256-cols%256; pitch=padded_cols*sizeof(typename mat_type::data_type); - return _device_alloc(mat,dev,pitch*rows,kind); + return _device_alloc(mat,dev,pitch*rows,kind); } template @@ -342,7 +342,7 @@ inline int _device_resize(mat_type &mat, const size_t n) { else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; return UCL_SUCCESS; } @@ -380,7 +380,7 @@ inline int _device_resize(mat_type &mat, const size_t rows, else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; return UCL_SUCCESS; } @@ -396,21 +396,21 @@ inline void _host_zero(void *ptr, const size_t n) { inline void _ocl_build(cl_program &program, cl_device_id &device, const char* options = "") { clBuildProgram(program,1,&device,options,NULL,NULL); - + cl_build_status build_status; - CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, + CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status),&build_status, NULL)); if (build_status == CL_SUCCESS) return; - + size_t ms; - CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0, + CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0, NULL, &ms)); - char build_log[ms]; + char build_log[ms]; CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms, build_log, NULL)); - + std::cerr << std::endl << "----------------------------------------------------------\n" << " Error compiling OpenCL Program...\n" @@ -423,13 +423,13 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device, cl_kernel &kernel, const char *function, const char *options="") { cl_int error_flag; - + cl_program program=clCreateProgramWithSource(context,lines,source, NULL,&error_flag); - CL_CHECK_ERR(error_flag); + CL_CHECK_ERR(error_flag); _ocl_build(program,device,options); kernel=clCreateKernel(program,function,&error_flag); - CL_CHECK_ERR(error_flag); + CL_CHECK_ERR(error_flag); } template @@ -452,17 +452,17 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { cl_device_id device; CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES, sizeof(cl_device_id),&device,NULL)); - + const char * szero[3]={ "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void _device_zero(__global NUMTYP *a, const int offset)", " { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }" }; - + cl_kernel kzero; _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero", _UCL_DATA_ID::numtyp_flag()); - + cl_int offset=mat.offset(); CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin())); CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset)); @@ -486,7 +486,7 @@ template<> struct _ucl_memcpy<2,2> { assert(0==1); } template - static inline void mc(p1 &dst, const 
size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -504,7 +504,7 @@ template<> struct _ucl_memcpy<2,0> { assert(0==1); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -522,7 +522,7 @@ template<> struct _ucl_memcpy<2,1> { assert(0==1); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -540,7 +540,7 @@ template<> struct _ucl_memcpy<0,2> { assert(0==1); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -558,7 +558,7 @@ template<> struct _ucl_memcpy<1,2> { assert(0==1); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -587,9 +587,9 @@ template <> struct _ucl_memcpy<1,0> { dst.begin(),0,NULL,NULL)); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, - const size_t rows, cl_command_queue &cq, + const size_t rows, cl_command_queue &cq, const cl_bool block, size_t dst_offset, size_t src_offset) { if (src.cbegin()==dst.cbegin()) { @@ -602,20 +602,20 @@ template <> struct _ucl_memcpy<1,0> { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 2NS\n"; #endif - if (spitch==dpitch && dst.cols()==src.cols() && + if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset, spitch*rows, (char *)dst.begin()+dst_offset,0,NULL, NULL)); else - for (size_t i=0; i struct _ucl_memcpy<0,1> { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 3S\n"; #endif - return; + return; } #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 3NS\n"; @@ -639,9 +639,9 @@ template <> struct _ucl_memcpy<0,1> { src.begin(),0,NULL,NULL)); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, - const size_t rows, cl_command_queue &cq, + const size_t rows, cl_command_queue &cq, const cl_bool block, size_t dst_offset, size_t src_offset) { if (src.cbegin()==dst.cbegin()) { @@ -649,12 +649,12 @@ template <> struct _ucl_memcpy<0,1> { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 4S\n"; #endif - return; + return; } #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 4NS\n"; #endif - if (spitch==dpitch && dst.cols()==src.cols() && + if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset, spitch*rows, @@ -667,7 +667,7 @@ template <> struct _ucl_memcpy<0,1> { NULL)); src_offset+=spitch; dst_offset+=dpitch; - } + } } }; @@ -687,33 +687,33 @@ template 
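// Primary template: covers the remaining memory-type combinations, in
// practice device-to-device transfers. The specializations above handle
// host reads/writes and reject unsupported image copies; here the copy
// is forwarded to clEnqueueCopyBuffer unless src and dst alias the same
// buffer and offset, in which case nothing needs to be moved.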
struct _ucl_memcpy { #ifdef UCL_DBG_MEM_TRACE else std::cerr << "UCL_COPY 6S\n"; #endif - + if (block==CL_TRUE) ucl_sync(cq); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, size_t dst_offset, size_t src_offset) { - if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) { + if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 7NS\n"; #endif - if (spitch==dpitch && dst.cols()==src.cols() && + if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset, dst_offset,spitch*rows,0,NULL,NULL)); - + else - for (size_t i=0; i -inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, - const size_t spitch, const size_t cols, + const size_t spitch, const size_t cols, const size_t rows) { _ucl_memcpy::mc(dst,dpitch,src,spitch,cols, rows,dst.cq(),CL_TRUE, @@ -745,15 +745,15 @@ inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, } template -inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, - const size_t spitch, const size_t cols, +inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, + const size_t spitch, const size_t cols, const size_t rows,cl_command_queue &cq) { _ucl_memcpy::mc(dst,dpitch,src,spitch,cols, rows,cq,CL_FALSE, dst.byteoff(),src.byteoff()); } -} // namespace ucl_cudart +} // namespace ucl_opencl #endif diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 8e72c51730..0e60045f55 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -28,7 +28,7 @@ #include "ocl_mat.h" namespace ucl_opencl { - + /// Class storing a texture reference class UCL_Texture { public: @@ -46,9 +46,9 @@ class UCL_Texture { /// Unbind the texture reference from the memory allocation inline void unbind() { } - /// Make a texture reference available to kernel + /// Make a texture reference available to kernel inline void allow(UCL_Kernel &kernel) { } - + private: friend class UCL_Kernel; }; diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 627d19d66f..66b79dcab1 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */ @@ -67,33 +67,33 @@ class UCL_Timer { clRetainCommandQueue(_cq); _initialized=true; } - + /// Start timing on default command queue inline void start() { UCL_OCL_MARKER(_cq,&start_event); } - + /// Stop timing on default command queue inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); } - + /// Block until the start event has been reached on device - inline void sync_start() + inline void sync_start() { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); } /// Block until the stop event has been reached on device - inline void sync_stop() + inline void sync_stop() { CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); } /// Set the time elapsed to zero (not the total_time) - inline void zero() - { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); } - + inline void zero() + { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); } + /// Set the total time to zero inline void zero_total() { _total_time=0.0; } - + /// Add time from previous start and stop to total /** Forces synchronization **/ - inline double add_to_total() + inline double add_to_total() { double t=time(); _total_time+=t; return t/1000.0; } - + /// Add a user specified time to the total (ms) inline void add_time_to_total(const double t) { _total_time+=t; } @@ -107,12 +107,12 @@ class UCL_Timer { CL_SAFE_CALL(clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tstart, NULL)); - return (tend-tstart)*t_factor; + return (tend-tstart)*t_factor; } - + /// Return the time (s) of last start to stop - Forces synchronization inline double seconds() { return time()/1000.0; } - + /// Return the total time in ms inline double total_time() { return _total_time; } diff --git a/lib/gpu/geryon/ucl_arg_kludge.h b/lib/gpu/geryon/ucl_arg_kludge.h index 646aa4d68f..eea913863d 100644 --- a/lib/gpu/geryon/ucl_arg_kludge.h +++ b/lib/gpu/geryon/ucl_arg_kludge.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. 
----------------------------------------------------------------------- */ @@ -38,47 +38,47 @@ template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8, t9 *a9) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); } template inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); run(); } @@ -434,8 +434,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); run(); } @@ -444,8 +444,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); run(); } @@ -454,8 +454,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); run(); } @@ -464,8 +464,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8, t9 *a9) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); run(); } @@ -474,8 +474,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { clear_args(); - add_arg(a1); 
add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); run(); } @@ -486,9 +486,9 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); run(); } @@ -499,8 +499,8 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11, t12 *a12) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a11); add_arg(a12); run(); } @@ -512,9 +512,9 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11, t12 *a12, t13 *a13) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); run(); } @@ -525,9 +525,9 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11, t12 *a12, t13 *a13, t14 *a14) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); run(); } @@ -538,9 +538,9 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); run(); } @@ -553,10 +553,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); run(); } @@ -569,10 +569,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16, t17 *a17) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); + add_arg(a1); add_arg(a2); add_arg(a3); 
add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); run(); } @@ -585,10 +585,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16, t17 *a17, t18 *a18) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); run(); } @@ -601,10 +601,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16, t17 *a17, t18 *a18, t19 *a19) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); run(); } @@ -617,10 +617,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); run(); } @@ -635,10 +635,10 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a21); run(); } @@ -654,10 +654,10 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21, t22 *a22) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
add_arg(a21); add_arg(a22); run(); } @@ -673,10 +673,10 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21, t22 *a22, t23 *a23) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a21); add_arg(a22); add_arg(a23); run(); } @@ -692,10 +692,10 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21, t22 *a22, t23 *a23, t24 *a24) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); run(); } @@ -711,11 +711,11 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); run(); } @@ -732,11 +732,11 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a26); run(); } @@ -754,11 +754,11 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26, t27 *a27) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); 
add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a26); add_arg(a27); run(); } @@ -776,12 +776,12 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26, t27 *a27, t28 *a28) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - add_arg(a26); add_arg(a27); add_arg(a28); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); run(); } @@ -798,11 +798,11 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26, t27 *a27, t28 *a28, t29 *a29) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); run(); } @@ -820,11 +820,11 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); run(); } diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h index 4edf83e057..1ded9f043b 100644 --- a/lib/gpu/geryon/ucl_basemat.h +++ b/lib/gpu/geryon/ucl_basemat.h @@ -17,7 +17,7 @@ /* 
----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -52,10 +52,10 @@ /// Base class for vector/matrix containers /** All containers are associated with a default command queue. * For CUDA, this is the default stream. - * - * The default queue is used for asynchonrous operations on the container + * + * The default queue is used for asynchronous operations on the container * that do not specify a queue. For OpenCL, this queue is also used in - * calls for reserving and copying memory **/ + * calls for reserving and copying memory **/ class UCL_BaseMat { public: UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { } @@ -68,8 +68,8 @@ class UCL_BaseMat { inline void sync() { ucl_sync(_cq); } /// Return the type/permissions of memory allocation /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED - * or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { return _kind; } + * or UCL_VIEW **/ + inline enum UCL_MEMOPT kind() const { return _kind; } inline bool shared_mem_device() { #ifdef _OCL_MAT @@ -79,12 +79,12 @@ class UCL_BaseMat { cl_device_type device_type; CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, sizeof(device_type),&device_type,NULL)); - return _shared_mem_device(device_type); + return _shared_mem_device(device_type); #else return false; #endif } - + protected: command_queue _cq; enum UCL_MEMOPT _kind; diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c6bff97a8c..c906a14f30 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -17,33 +17,33 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + /*************************************************************************** The ucl_copy and ucl_cast_copy routines provide a general prototype for copying data between host and device memory (including texture memory) for the matrix and vector types in nvc_memory. - - For host/host and host/device transfers, typecasting is performed - automatically as necessary. - - The routines are written so that all branches can be removed by the + + For host/host and host/device transfers, typecasting is performed + automatically as necessary. + + The routines are written so that all branches can be removed by the compiler during template instantiation. - + The routines currently assume row-major ordering for all types. - + For asynchronous copy in the default command queue, async is boolean true; For asynchronous copy in a specified command queue, async is command queue Otherwise, set async to boolean false; - + When performing frequent data copies that require casting, it is more efficient to allocate a casting buffer once and then pass that buffer to the copy routine. This can be accomplished with the ucl_cast_copy routines.
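  For instance, with d_v a device-side container, h_v a host-side
  container of a different data type, and buf a host-side cast buffer
  allocated beforehand with at least the same number of elements (all
  three names are placeholders):

     ucl_copy(d_v,h_v,true);                  // temporary cast buffer each call
     ucl_cast_copy(d_v,h_v,h_v.numel(),buf);  // reuses buf on every call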
- - Examples + + Examples (x's represent alignment padding - to maintain alignment) (o's represent a larger matrix in memory) (vectors represented as single row) @@ -51,18 +51,18 @@ dst src command ---------------------------------------------------------------- 0 1 2 3 4 <-- 0 1 2 3 4 ucl_copy(dst,src,async) - + 0 1 2 3 <-- 0 1 2 3 4 ucl_copy(dst,src,4,async) - + 0 1 2 <-- 0 1 2 3 4 5 ucl_copy(dst,src,async) - 3 4 5 - + 3 4 5 + 0 1 2 3 4 5 <-- 0 1 2 ucl_copy(dst,src,async) 3 4 5 - + 0 1 2 <-- 0 1 2 ucl_copy(dst,src,async) 3 4 5 3 4 5 - + 0 1 2 <-- 0 1 2 ucl_copy(dst,src,6,async) 3 4 5 3 4 5 5 6 7 @@ -70,33 +70,33 @@ 0 1 2 <-- 0 1 2 3 ucl_copy(dst,src,2,3,async) 4 5 6 4 5 6 7 8 9 10 11 - + 0 1 2 x x <-- 0 1 2 ucl_copy(dst,src,async) 3 4 5 x x 3 4 5 - + 0 1 2 <-- 0 1 2 x x ucl_copy(dst,src,async) 3 4 5 3 4 5 x x - + 0 1 2 o o <-- 0 1 2 ucl_copy(dst,src,2,3,async) 3 4 5 o o 3 4 5 - o o o o o + o o o o o 0 1 2 o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,3,async) - 3 4 5 o o - o o o o o + 3 4 5 o o + o o o o o 0 1 o o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,2,async) - 2 3 o o o - o o o o o + 2 3 o o o + o o o o o 0 1 2 o o <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async) 5 6 7 o o 5 6 7 8 9 o o o o o 10 11 12 13 14 - + 0 1 2 5 6 7 <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async) 5 6 7 8 9 10 11 12 13 14 - + ***************************************************************************/ // Only allow this file to be included by nvc_memory.h and ocl_memory.h @@ -124,7 +124,7 @@ inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) { assert(0==1); } } -} +} // -------------------------------------------------------------------------- // - HOST-HOST COPY ROUTINES @@ -182,7 +182,7 @@ template <> struct _host_host_copy<1,1> { return; } #endif - + #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 8NS\n"; #endif @@ -212,7 +212,7 @@ template struct _host_host_copy { static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols) { assert(0==1); - } + } }; // -------------------------------------------------------------------------- @@ -242,20 +242,20 @@ template struct _ucl_cast_copy<1,host_type2> { template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer) { - // Asynchronous currently pointless here + // Asynchronous currently pointless here #ifdef UCL_DEBUG assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); - if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); - if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); - #endif + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + #endif if (mat1::VECTOR) { ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, src.row_bytes(),cols*sizeof(typename mat2::data_type),rows); for (size_t i=0; i(cast_buffer[i]); } else { - if (mat2::VECTOR) + if (mat2::VECTOR) ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, cols*sizeof(typename mat2::data_type), cols*sizeof(typename mat2::data_type),rows); @@ -276,23 +276,23 @@ template struct _ucl_cast_copy<1,host_type2> { } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, + const size_t cols, mat3 &cast_buffer, command_queue &cq) { - // Asynchronous currently pointless here + // Asynchronous currently pointless here #ifdef UCL_DEBUG assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); 
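// Debug-mode sanity checks: both operands must be row-major, and the
// destination and cast buffer must hold at least rows*cols elements;
// true 2D matrices get additional rows/cols bounds checks below.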
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); - if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); - if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); - #endif + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + #endif if (mat1::VECTOR) { ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq); - cast_buffer.sync(); + cast_buffer.sync(); for (size_t i=0; i(cast_buffer[i]); } else { - if (mat2::VECTOR) + if (mat2::VECTOR) ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, cols*sizeof(typename mat2::data_type), cols*sizeof(typename mat2::data_type),rows,cq); @@ -338,7 +338,7 @@ template struct _ucl_cast_copy { assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); - if (mat3::VECTOR==0) { + if (mat3::VECTOR==0) { assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); assert(dst.rows()>=rows && dst.cols()>=cols); } @@ -404,9 +404,9 @@ template struct _ucl_cast_copy { #ifdef UCL_DEBUG assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); - if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); - if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); - if (mat3::VECTOR==0) { + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + if (mat3::VECTOR==0) { assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); assert(dst.rows()>=rows && dst.cols()>=cols); } @@ -472,23 +472,23 @@ template <> struct _ucl_cast_copy<1,1> { template static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, mat3 &cast_buffer, command_queue &cq) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, mat3 &cast_buffer) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer, command_queue &cq) { - assert(0==1); + assert(0==1); } }; @@ -497,23 +497,23 @@ template <> struct _ucl_cast_copy<0,0> { template static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, mat3 &cast_buffer, command_queue &cq) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, mat3 &cast_buffer) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer, command_queue &cq) { - assert(0==1); + assert(0==1); } }; @@ -525,7 +525,7 @@ template <> struct _ucl_cast_copy<0,0> { /** \param numel Number of elements (not bytes) to copy * \param cast_buffer Buffer on host with enough storage for casting * - If the data types for the two matrices are same, no cast performed - * - Padding for 2D matrices is not considered in this routine. 
+ * - Padding for 2D matrices is not considered in this routine. * - Currently does not handle textures **/ template inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, @@ -551,7 +551,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, * \param async Perform non-blocking copy on default stream * \param cast_buffer Buffer on host with enough storage for casting * - If the data types for the two matrices are same, no cast performed - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Currently does not handle textures **/ template inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, @@ -580,7 +580,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, * buffer is created for copy. When multiple casts occur, it is * more efficient to create a permanent casting buffer that can * be passed to an alternative copy routine. - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Currently does not handle textures **/ template inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, @@ -593,7 +593,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,numel); - else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; @@ -606,8 +606,8 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, _ucl_cast_copy::cc(dst,src,numel, cast_buffer,cq); } - } else - ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq); + } else + ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq); } /// Copy matrix/vector (memory already allocated) @@ -619,7 +619,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, * buffer is created for copy. When multiple casts occur, it is * more efficient to create a permanent casting buffer that can * be passed to an alternative copy routine. - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. 
* - The default stream is used for asynchronous copy * - Currently does not handle textures **/ template @@ -648,7 +648,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, cast_buffer); } } else - ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type)); + ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type)); } // -------------------------------------------------------------------------- @@ -659,11 +659,11 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, /** \param async Perform non-blocking copy on default stream * \param cast_buffer Buffer on host with enough storage for casting * - If src is a vector, routine assumes row-major rows by cols copy - * - If src is a matrix, routine will copy upper left tile of matrix + * - If src is a matrix, routine will copy upper left tile of matrix * - If dst is a vector, routine assumes row-major rows by cols copy - * - If dst is a matrix, routine will copy into left tile of matrix + * - If dst is a matrix, routine will copy into left tile of matrix * - If the data types for the two matrices are same, no cast performed - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Copy from vector to matrix and vice versa allowed * - Currently does not handle textures **/ template @@ -686,16 +686,16 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, /// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer) /** \param cast_buffer Buffer on host with enough storage for casting * - If src is a vector, routine assumes row-major rows by cols copy - * - If src is a matrix, routine will copy upper left tile of matrix + * - If src is a matrix, routine will copy upper left tile of matrix * - If dst is a vector, routine assumes row-major rows by cols copy - * - If dst is a matrix, routine will copy into upper left tile of matrix + * - If dst is a matrix, routine will copy into upper left tile of matrix * - If the data types for the two matrices are same, no cast performed - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Copy from vector to matrix and vice versa allowed * - Currently does not handle textures **/ template inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, + const size_t cols, mat3 &cast_buffer, command_queue &cq) { if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,rows,cols,cq); @@ -710,11 +710,11 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, /// Asynchronous copy of subset matrix rows,cols (memory already allocated) /** - If src is a vector, routine assumes row-major rows by cols copy - * - If src is a matrix, routine will copy upper left tile of matrix + * - If src is a matrix, routine will copy upper left tile of matrix * - If dst is a vector, routine assumes row-major rows by cols copy - * - If dst is a matrix, routine will copy into left tile of matrix + * - If dst is a matrix, routine will copy into left tile of matrix * - If the data types of the two matrices are not the same, - * casting will be performed automatically as long as the copy is + * casting will be performed automatically as long as the copy is * not device to device. For host/device transfers, a temporary * buffer is created for copy. 
When multiple casts occur, it is * more efficient to create a permanent casting buffer that can @@ -730,7 +730,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,rows,cols); - else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; @@ -773,9 +773,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, /// Copy subset of matrix rows,cols (memory already allocated) /** \param async Perform non-blocking copy (ignored for host to host copy) * - If src is a vector, routine assumes row-major rows by cols copy - * - If src is a matrix, routine will copy upper left tile of matrix + * - If src is a matrix, routine will copy upper left tile of matrix * - If dst is a vector, routine assumes row-major rows by cols copy - * - If dst is a matrix, routine will copy into left tile of matrix + * - If dst is a matrix, routine will copy into left tile of matrix * - If the data types of the two matrices are not the same, * casting will be performed automatically as long as the copy is * not device to device. For host/device transfers, a temporary @@ -796,7 +796,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, ucl_copy(dst,src,rows,cols,dst.cq()); else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,rows,cols); - else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; @@ -846,7 +846,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, * \param cast_buffer Buffer on host with enough storage for casting * - If the data types for the two matrices are same, no cast performed * - The number of bytes copied is determined by entire src data - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Copy from vector to matrix and vice versa allowed * - Currently does not handle textures **/ template @@ -866,7 +866,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, /** \param cast_buffer Buffer on host with enough storage for casting * - If the data types for the two matrices are same, no cast performed * - The number of bytes copied is determined by entire src data - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Copy from vector to matrix and vice versa allowed * - Currently does not handle textures **/ template @@ -885,7 +885,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, /// Asynchronous copy of matrix/vector (memory already allocated) /** - The number of bytes copied is determined by entire src data * - If the data types of the two matrices are not the same, - * casting will be performed automatically as long as the copy is + * casting will be performed automatically as long as the copy is * not device to device. For host/device transfers, a temporary * buffer is created for copy. 
When multiple casts occur, it is * more efficient to create a permanent casting buffer that can @@ -924,7 +924,7 @@ template inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) { if (async) ucl_copy(dst,src,dst.cq()); - else if (dst.row_bytes()==src.row_bytes() && + else if (dst.row_bytes()==src.row_bytes() && src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW && (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,src.row_size()*src.rows(),async); diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h index f1aaa27903..da55cc6ebc 100644 --- a/lib/gpu/geryon/ucl_d_mat.h +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -37,23 +37,23 @@ class UCL_D_Mat : public UCL_BaseMat { ROW_MAJOR = 1, VECTOR = 0 }; - typedef numtyp data_type; + typedef numtyp data_type; UCL_D_Mat() : _cols(0) {} ~UCL_D_Mat() { _device_free(*this); } - + /// Construct with specified rows and cols /** \sa alloc() **/ UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_READ_WRITE) : + const enum UCL_MEMOPT kind=UCL_READ_WRITE) : _cols(0) { alloc(rows,cols,device,kind); } - + /// Row major matrix on device /** The kind parameter controls memory optimizations as follows: * - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels - * \param cq Default command queue for operations copied from another mat + * \param cq Default command queue for operations copied from another mat * \note - Coalesced access using adjacent cols on same row * UCL_D_Mat(row,col) given by array[row*row_size()+col] * \return UCL_SUCCESS if the memory allocation is successful **/ @@ -65,7 +65,7 @@ class UCL_D_Mat : public UCL_BaseMat { int err=_device_alloc(*this,cq,rows,cols,_pitch,kind); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << "UCL Error: Could not allocate " + std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; UCL_GERYON_EXIT; #endif @@ -82,9 +82,9 @@ class UCL_D_Mat : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } - + /// Row major matrix on device /** The kind parameter controls memory optimizations as follows: * - UCL_READ_WRITE - Specify that you will read and write in kernels @@ -118,15 +118,15 @@ class UCL_D_Mat : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
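As a usage note for the UCL_D_Mat allocation path above, a hedged sketch (hypothetical names; assumes UCL_NO_EXIT is defined so failures are reported through the return code instead of aborting):

    UCL_Device dev;
    UCL_D_Mat<float> m;
    if (m.alloc(128, 128, dev) != UCL_SUCCESS) {
      // handle the failure; without UCL_NO_EXIT the library prints
      // "UCL Error: Could not allocate ..." and exits instead
    }
    m.zero();              // asynchronous zero on m's default command queue
    m.resize_ib(64, 64);   // no-op here: resize_ib only grows the allocation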
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, const size_t stride) { @@ -145,7 +145,7 @@ class UCL_D_Mat : public UCL_BaseMat { #else _device_view(&_array,input.begin()); #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif @@ -157,39 +157,39 @@ class UCL_D_Mat : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) + inline void view(ucl_type &input, const size_t rows, const size_t cols) { view(input,rows,cols,input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template inline void view(ucl_type &input, const size_t cols) { view(input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template - inline void view(ucl_type &input) + inline void view(ucl_type &input) { view(input,input.rows(),input.cols()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, - const size_t stride, UCL_Device &dev) { + const size_t stride, UCL_Device &dev) { clear(); _kind=UCL_VIEW; _cols=cols; @@ -215,7 +215,7 @@ class UCL_D_Mat : public UCL_BaseMat { template inline void view(ptr_type input, const size_t rows, const size_t cols, UCL_Device &dev) { view(input,rows,cols,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the @@ -223,13 +223,13 @@ class UCL_D_Mat : public UCL_BaseMat { template inline void view(ptr_type input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, const size_t cols, const size_t stride) { @@ -248,7 +248,7 @@ class UCL_D_Mat : public UCL_BaseMat { #else _device_view(&_array,input.begin(),offset,sizeof(numtyp)); #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif @@ -261,45 +261,45 @@ class UCL_D_Mat : public UCL_BaseMat { * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols) + const size_t cols) { view_offset(offset,input,rows,cols,input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) { view_offset(offset,input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template - inline void view_offset(const size_t offset, ucl_type &input) { - if (input.rows()==1) + inline void view_offset(const size_t offset, ucl_type &input) { + if (input.rows()==1) view_offset(offset,input,1,input.cols()-offset); - else + else view_offset(offset,input,input.rows()-offset/input.row_size(), input.cols()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
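A sketch of the view_offset() idiom documented above for UCL_D_Mat, aliasing a 3x3 tile inside a larger pitched device matrix (hypothetical names; offsets are in elements, and passing the parent's row_size() as the stride keeps row padding consistent):

    UCL_Device dev;
    UCL_D_Mat<float> big(8, 8, dev);
    UCL_D_Mat<float> tile;
    // Tile starting at row 2, column 2 of 'big'. 'tile' aliases 'big';
    // no memory is freed when 'tile' is destructed, and 'big' must
    // outlive any use of the view.
    tile.view_offset(2*big.row_size() + 2, big, 3, 3, big.row_size());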
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, const size_t cols,const size_t stride, - UCL_Device &dev) { + UCL_Device &dev) { clear(); _kind=UCL_VIEW; _cols=cols; @@ -307,7 +307,7 @@ class UCL_D_Mat : public UCL_BaseMat { _pitch=stride*sizeof(numtyp); _row_size=stride; this->_cq=dev.cq(); - + #ifdef _OCL_MAT _array=input; _offset=offset; @@ -320,7 +320,7 @@ class UCL_D_Mat : public UCL_BaseMat { _array=input+offset; #endif #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif @@ -332,20 +332,20 @@ class UCL_D_Mat : public UCL_BaseMat { * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, - const size_t cols, UCL_Device &dev) + const size_t cols, UCL_Device &dev) { view_offset(offset,input,rows,cols,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view_offset(const size_t offset, ptr_type input, + inline void view_offset(const size_t offset, ptr_type input, const size_t cols, UCL_Device &dev) { view_offset(offset,input,1,cols,dev); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { _device_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to contain cols elements @@ -356,7 +356,7 @@ class UCL_D_Mat : public UCL_BaseMat { int err=_device_resize(*this,rows,cols,_pitch); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << "UCL Error: Could not allocate " + std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; UCL_GERYON_EXIT; #endif @@ -372,13 +372,13 @@ class UCL_D_Mat : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } - + /// Resize (only if bigger) the allocation to contain rows x cols elements /** \note Cannot be used on views **/ inline int resize_ib(const int rows, const int cols) - { if (cols>_cols || rows>_rows) return resize(rows,cols); + { if (cols>_cols || rows>_rows) return resize(rows,cols); else return UCL_SUCCESS; } /// Set each element to zero asynchronously in the default command_queue @@ -386,10 +386,10 @@ class UCL_D_Mat : public UCL_BaseMat { /// Set first n elements to zero asynchronously in the default command_queue inline void zero(const int n) { zero(n,_cq); } /// Set each element to zero asynchronously - inline void zero(command_queue &cq) + inline void zero(command_queue &cq) { _device_zero(*this,row_bytes()*_rows,cq); } /// Set first n elements to zero asynchronously - inline void zero(const int n, command_queue &cq) + inline void zero(const int n, command_queue &cq) { _device_zero(*this,n*sizeof(numtyp),cq); } @@ -445,7 +445,7 @@ class UCL_D_Mat : public UCL_BaseMat { inline size_t row_bytes() const { return _pitch; } /// Get the size in bytes of 1 element inline int element_size() const { return sizeof(numtyp); } - + #ifdef _OCL_MAT /// Return the offset (in elements) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ @@ -459,7 +459,7 @@ 
class UCL_D_Mat : public UCL_BaseMat { /// Return the offset (in bytes) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ inline size_t byteoff() const { return offset()*sizeof(numtyp); } - + private: size_t _pitch, _row_size, _rows, _cols; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index fc1977f4b5..99a6c939c6 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -17,14 +17,14 @@ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ // Only allow this file to be included by CUDA and OpenCL specific headers #ifdef _UCL_MAT_ALLOW -/// Row vector on device +/// Row vector on device template class UCL_D_Vec : public UCL_BaseMat { public: @@ -37,7 +37,7 @@ class UCL_D_Vec : public UCL_BaseMat { ROW_MAJOR = 1, VECTOR = 1 }; - typedef numtyp data_type; + typedef numtyp data_type; UCL_D_Vec() : _cols(0) {} ~UCL_D_Vec() { _device_free(*this); } @@ -45,7 +45,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Construct with n columns /** \sa alloc() **/ UCL_D_Vec(const size_t n, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_READ_WRITE) : + const enum UCL_MEMOPT kind=UCL_READ_WRITE) : _cols(0) { alloc(n,device,kind); } /// Set up host vector with 'cols' columns and reserve memory @@ -58,7 +58,7 @@ class UCL_D_Vec : public UCL_BaseMat { template inline int alloc(const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { - + clear(); _row_bytes=cols*sizeof(numtyp); @@ -82,8 +82,8 @@ class UCL_D_Vec : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; - } + return err; + } /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory optimizations as follows: @@ -116,7 +116,7 @@ class UCL_D_Vec : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } /// Do not allocate memory, instead use an existing allocation from Geryon @@ -142,18 +142,18 @@ class UCL_D_Vec : public UCL_BaseMat { #else _device_view(&_array,input.begin()); #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, const size_t stride) { view(input,rows,cols); } @@ -162,24 +162,24 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template inline void view(ucl_type &input, const size_t cols) { view(input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template - inline void view(ucl_type &input) + inline void view(ucl_type &input) { view(input,input.rows()*input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the @@ -205,15 +205,15 @@ class UCL_D_Vec : public UCL_BaseMat { CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #endif } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, - const size_t stride, UCL_Device &dev) + const size_t stride, UCL_Device &dev) { view(input,rows,cols,stride); } /// Do not allocate memory, instead use an existing allocation @@ -223,7 +223,7 @@ class UCL_D_Vec : public UCL_BaseMat { template inline void view(ptr_type input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. @@ -248,45 +248,45 @@ class UCL_D_Vec : public UCL_BaseMat { #else _device_view(&_array,input.begin(),offset,sizeof(numtyp)); #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols, const size_t stride) + const size_t cols, const size_t stride) { view_offset(offset,input,rows,cols); } /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
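Combining the vector-view overloads above, a hypothetical sketch that aliases a single row of a device matrix as a UCL_D_Vec:

    UCL_Device dev;
    UCL_D_Mat<float> m(4, 100, dev);
    UCL_D_Vec<float> row;
    // Row r starts r*row_size() elements into the allocation (row_size()
    // is the pitch in elements) and spans m.cols() contiguous elements.
    const size_t r = 2;
    row.view_offset(r*m.row_size(), m, m.cols());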
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) { view_offset(offset,input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template - inline void view_offset(const size_t offset, ucl_type &input) + inline void view_offset(const size_t offset, ucl_type &input) { view_offset(offset,input,input.rows()*input.row_size()-offset); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the @@ -302,7 +302,7 @@ class UCL_D_Vec : public UCL_BaseMat { _cols=cols; _row_bytes=_cols*sizeof(numtyp); this->_cq=dev.cq(); - + #ifdef _OCL_MAT _array=input; _offset=offset; @@ -315,20 +315,20 @@ class UCL_D_Vec : public UCL_BaseMat { _array=input+offset; #endif #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, - const size_t cols,const size_t stride,UCL_Device &dev) + const size_t cols,const size_t stride,UCL_Device &dev) { view_offset(offset,input,rows,cols,stride); } /// Do not allocate memory, instead use an existing allocation @@ -336,12 +336,12 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view_offset(const size_t offset, ptr_type input, + inline void view_offset(const size_t offset, ptr_type input, const size_t cols, UCL_Device &dev) { view_offset(offset,input,1,cols,dev); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { _device_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to contain cols elements @@ -369,9 +369,9 @@ class UCL_D_Vec : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } - + /// Resize (only if bigger) the allocation to contain cols elements /** \note Cannot be used on views **/ inline int resize_ib(const int cols) @@ -384,7 +384,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Set each element to zero asynchronously inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); } /// Set first n elements to zero asynchronously - inline void zero(const int n, command_queue &cq) + inline void zero(const int n, command_queue &cq) { _device_zero(*this,n*sizeof(numtyp),cq); } #ifdef _UCL_DEVICE_PTR_MAT @@ -402,7 +402,7 @@ 
class UCL_D_Vec : public UCL_BaseMat { /// For CUDA-RT, get device pointer to one past last element inline numtyp * end() const { return _end; } #endif - + #ifdef _UCL_DEVICE_PTR_MAT /// Returns an API specific device pointer /** - For OpenCL, returns a &cl_mem object @@ -427,10 +427,10 @@ class UCL_D_Vec : public UCL_BaseMat { inline const numtyp ** cbegin() const { return &_array; } /// For CUDA-RT, allocate row vector and bind texture inline void safe_alloc(const size_t cols, UCL_Device &dev, - textureReference *t) + textureReference *t) { alloc(cols,dev); assign_texture(t); bind(); } /// For CUDA-RT, assign a texture to matrix - inline void assign_texture(textureReference *t) { _tex_ptr=t; } + inline void assign_texture(textureReference *t) { _tex_ptr=t; } /// For CUDA-RT, bind to texture inline void bind() { cuda_gb_get_channel(_channel); @@ -456,7 +456,7 @@ class UCL_D_Vec : public UCL_BaseMat { inline size_t row_bytes() const { return _row_bytes; } /// Get the size in bytes of 1 element inline int element_size() const { return sizeof(numtyp); } - + #ifdef _OCL_MAT /// Return the offset (in elements) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ @@ -473,7 +473,7 @@ class UCL_D_Vec : public UCL_BaseMat { private: size_t _row_bytes, _row_size, _rows, _cols; - + #ifdef _UCL_DEVICE_PTR_MAT device_ptr _array; #else diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index dc6da3de0c..1df3c2de4b 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -37,21 +37,21 @@ class UCL_H_Mat : public UCL_BaseMat { ROW_MAJOR = 1, VECTOR = 0 }; - typedef numtyp data_type; - + typedef numtyp data_type; + UCL_H_Mat() : _cols(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif } ~UCL_H_Mat() { _host_free(*this); } - + /// Construct with specied number of rows and columns /** \sa alloc() **/ - UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_READ_WRITE) + UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); } - + /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind parameter controls memory pinning as follows: * - UCL_READ_WRITE - Specify that you will read and write from host @@ -74,7 +74,7 @@ class UCL_H_Mat : public UCL_BaseMat { << " bytes on host.\n"; _row_bytes=0; UCL_GERYON_EXIT; - #endif + #endif _row_bytes=0; return err; } @@ -84,7 +84,7 @@ class UCL_H_Mat : public UCL_BaseMat { _kind=kind; _end=_array+rows*cols; return err; - } + } /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind parameter controls memory pinning as follows: @@ -117,15 +117,15 @@ class UCL_H_Mat : public UCL_BaseMat { _kind=kind; _end=_array+rows*cols; return err; - } - + } + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. 
* No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, const size_t stride) { @@ -149,45 +149,45 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) + inline void view(ucl_type &input, const size_t rows, const size_t cols) { view(input,rows,cols,input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t cols) { view(input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * will be used for view when using CUDA APIs + * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input) + inline void view(ucl_type &input) { view(input,input.rows(),input.cols()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, - const size_t stride, UCL_Device &dev) { + const size_t stride, UCL_Device &dev) { assert(rows==1 || stride==cols); clear(); _kind=UCL_VIEW; @@ -197,40 +197,40 @@ class UCL_H_Mat : public UCL_BaseMat { this->_cq=dev.cq(); _array=input; _end=_array+_cols; - + #ifdef _OCL_MAT _host_view(*this,dev,_row_bytes*rows); - #endif + #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, UCL_Device &dev) { view(input,rows,cols,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols, const size_t stride) { + const size_t cols, const size_t stride) { assert(rows==1 || stride==cols); clear(); _kind=UCL_VIEW; @@ -244,81 +244,81 @@ class UCL_H_Mat : public UCL_BaseMat { _host_view(*this,input,_row_bytes*_rows); #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols) + const size_t cols) { view_offset(offset,input,rows,cols,input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) { view_offset(offset,input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template - inline void view_offset(const size_t offset, ucl_type &input) { - if (input.rows()==1) + inline void view_offset(const size_t offset, ucl_type &input) { + if (input.rows()==1) view_offset(offset,input,1,input.cols()-offset); - else + else view_offset(offset,input,input.rows()-offset/input.row_size(), input.cols()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container - * - Viewing a device pointer on the host is not supported **/ + * allocating container + * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, const size_t cols, UCL_Device &dev) { view(input+offset,rows,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
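A short sketch of the host-matrix staging pattern UCL_H_Mat supports: allocate pinned host memory, fill it through the 2D accessor, then push it to the device (hypothetical names; the UCL_MEMOPT argument only controls how the host allocation is pinned):

    UCL_Device dev;
    UCL_H_Mat<double> h(4, 4, dev, UCL_WRITE_ONLY);  // host staging matrix
    for (int i=0; i<4; i++)
      for (int j=0; j<4; j++)
        h(i,j) = i*4 + j;                            // operator()(row,col)
    UCL_D_Mat<double> d(4, 4, dev);
    ucl_copy(d, h, true);                            // async copy on d's queue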
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, - const size_t cols,const size_t stride,UCL_Device &dev) + const size_t cols,const size_t stride,UCL_Device &dev) { view(input+offset,rows,cols,stride,dev); } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template - inline void view_offset(const size_t offset, ptr_type *input, + inline void view_offset(const size_t offset, ptr_type *input, const size_t cols, UCL_Device &dev) { view(input+offset,1,cols,dev); } - + /// Free memory and set size to 0 - inline void clear() - { _host_free(*this); _cols=0; _kind=UCL_VIEW; } + inline void clear() + { _host_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to rows x cols elements /** \note Cannot be used on views **/ @@ -333,7 +333,7 @@ class UCL_H_Mat : public UCL_BaseMat { << " bytes on host.\n"; _row_bytes=0; UCL_GERYON_EXIT; - #endif + #endif _row_bytes=0; return err; } @@ -347,7 +347,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Resize (only if bigger) the allocation to contain rows x cols elements /** \note Cannot be used on views **/ inline int resize_ib(const int rows, const int cols) - { if (cols>_cols || rows>_rows) return resize(rows,cols); + { if (cols>_cols || rows>_rows) return resize(rows,cols); else return UCL_SUCCESS; } /// Set each element to zero @@ -376,21 +376,21 @@ class UCL_H_Mat : public UCL_BaseMat { inline size_t row_bytes() const { return _row_bytes; } /// Get the size in bytes of 1 element inline int element_size() const { return sizeof(numtyp); } - + /// Get element at index i inline numtyp & operator[](const int i) { return _array[i]; } /// Get element at index i inline const numtyp & operator[](const int i) const { return _array[i]; } - /// 2D access (row should always be 0) - inline numtyp & operator()(const int row, const int col) + /// 2D access (row should always be 0) + inline numtyp & operator()(const int row, const int col) { return _array[row*_cols+col]; } - /// 2D access (row should always be 0) + /// 2D access (row should always be 0) inline const numtyp & operator()(const int row, const int col) const { return _array[row*_cols+col]; } - + /// Returns pointer to memory pointer for allocation on host inline numtyp ** host_ptr() { return &_array; } - + /// Return the offset (in elements) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ inline size_t offset() const { return 0; } @@ -409,14 +409,14 @@ class UCL_H_Mat : public UCL_BaseMat { /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA) inline const void ** cbegin() const { return (const void **)&_array; } #endif - + private: numtyp *_array, *_end; size_t _row_bytes, _rows, _cols; #ifdef _OCL_MAT device_ptr 
_carray; - #endif + #endif }; #endif diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 773facdea0..a9d64349d9 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -37,21 +37,21 @@ class UCL_H_Vec : public UCL_BaseMat { ROW_MAJOR = 1, VECTOR = 1 }; - typedef numtyp data_type; - + typedef numtyp data_type; + UCL_H_Vec() : _cols(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif } ~UCL_H_Vec() { _host_free(*this); } - + /// Construct with n columns /** \sa alloc() **/ - UCL_H_Vec(const size_t n, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_READ_WRITE) + UCL_H_Vec(const size_t n, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); } - + /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory pinning as follows: * - UCL_READ_WRITE - Specify that you will read and write from host @@ -84,7 +84,7 @@ class UCL_H_Vec : public UCL_BaseMat { _kind=kind; _end=_array+cols; return err; - } + } /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory pinning as follows: @@ -108,7 +108,7 @@ class UCL_H_Vec : public UCL_BaseMat { << " bytes on host.\n"; _row_bytes=0; UCL_GERYON_EXIT; - #endif + #endif _row_bytes=0; return err; } @@ -118,13 +118,13 @@ class UCL_H_Vec : public UCL_BaseMat { _end=_array+cols; return err; } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) { #ifdef UCL_DEBUG @@ -143,14 +143,14 @@ class UCL_H_Vec : public UCL_BaseMat { CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, const size_t stride) { view(input,rows,cols); } @@ -159,31 +159,31 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t cols) { view(input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input) + inline void view(ucl_type &input) { view(input,input.rows()*input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, UCL_Device &dev) { @@ -197,38 +197,38 @@ class UCL_H_Vec : public UCL_BaseMat { this->_cq=dev.cq(); _array=input; _end=_array+_cols; - + #ifdef _OCL_MAT _host_view(*this,dev,_row_bytes); - #endif + #endif } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, - const size_t stride, UCL_Device &dev) + const size_t stride, UCL_Device &dev) { view(input,rows,cols,stride); } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
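The raw-pointer view overloads above let Geryon wrap host memory it did not allocate; a hedged sketch (assumes <vector> is included; names hypothetical):

    std::vector<float> buf(1000, 1.0f);   // pre-existing host allocation
    UCL_Device dev;
    UCL_H_Vec<float> v;
    v.view(buf.data(), buf.size(), dev);  // no copy; v aliases buf
    // v can serve as the host side of ucl_copy(); buf must outlive the view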
* - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, const size_t cols) { @@ -246,76 +246,76 @@ class UCL_H_Vec : public UCL_BaseMat { _host_view(*this,input,_row_bytes); #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols, const size_t stride) + const size_t cols, const size_t stride) { view_offset(offset,input,rows,cols); } /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) { view_offset(offset,input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template - inline void view_offset(const size_t offset, ucl_type &input) + inline void view_offset(const size_t offset, ucl_type &input) { view_offset(offset,input,input.rows()*input.row_size()-offset); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, const size_t cols, UCL_Device &dev) { view(input+offset,rows,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, - const size_t cols,const size_t stride,UCL_Device &dev) + const size_t cols,const size_t stride,UCL_Device &dev) { view(input+offset,rows,cols,stride,dev); } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template - inline void view_offset(const size_t offset, ptr_type *input, + inline void view_offset(const size_t offset, ptr_type *input, const size_t cols, UCL_Device &dev) { view(input+offset,1,cols,dev); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { _host_free(*this); _kind=UCL_VIEW; _cols=0; } /// Resize the allocation to contain cols elements @@ -324,7 +324,7 @@ class UCL_H_Vec : public UCL_BaseMat { assert(_kind!=UCL_VIEW); _row_bytes=cols*sizeof(numtyp); int err=_host_resize(*this,_row_bytes); - + if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes @@ -340,7 +340,7 @@ class UCL_H_Vec : public UCL_BaseMat { _end=_array+cols; return err; } - + /// Resize (only if bigger) the allocation to contain cols elements /** \note Cannot be used on views **/ inline int resize_ib(const int cols) @@ -348,7 +348,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Set each element to zero inline void zero() { _host_zero(_array,row_bytes()); } - + /// Set first n elements to zero inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); } @@ -373,35 +373,35 @@ class UCL_H_Vec : public UCL_BaseMat { inline size_t row_bytes() const { return _row_bytes; } /// Get the size in bytes of 1 element inline int element_size() const { return sizeof(numtyp); } - + /// Get element at index i inline numtyp & operator[](const int i) { return _array[i]; } /// Get element at index i inline const numtyp & operator[](const int i) const { return _array[i]; } - /// 2D access (row should always be 0) - inline numtyp & operator()(const int row, const int col) + /// 2D access (row should always be 0) + inline numtyp & operator()(const int row, const int col) { return _array[col]; } - /// 2D access (row should always be 0) + /// 2D access (row should always be 0) inline const numtyp & operator()(const int row, const int col) const { return _array[col]; } - + /// Returns pointer to memory pointer for allocation on host inline numtyp ** host_ptr() { return &_array; } - + /// Return the offset (in elements) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ inline size_t offset() const { return 0; } /// Return the offset (in bytes) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ inline size_t byteoff() const { return 0; } - + #ifdef _OCL_MAT /// For OpenCL, returns a reference to the cl_mem 
object inline device_ptr & cbegin() { return _carray; } /// For OpenCL, returns a reference to the cl_mem object inline const device_ptr & cbegin() const { return _carray; } #endif - + private: numtyp *_array, *_end; size_t _row_bytes, _cols; diff --git a/lib/gpu/geryon/ucl_matrix.h b/lib/gpu/geryon/ucl_matrix.h index 301325b454..b93d1c7f68 100644 --- a/lib/gpu/geryon/ucl_matrix.h +++ b/lib/gpu/geryon/ucl_matrix.h @@ -34,25 +34,25 @@ class UCL_Matrix { ROW_MAJOR = 1, VECTOR = 0 }; - typedef hosttype data_type; + typedef hosttype data_type; /// Host Allocation UCL_H_Mat host; - + /// Device Allocation UCL_D_Mat device; UCL_Matrix() { } ~UCL_Matrix() { } - + /// Construct with specied number of rows and columns /** \sa alloc() **/ - UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc, + UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc, const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } - + /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind1 parameter controls memory access from the host * - UCL_READ_WRITE - Specify that you will read and write from host @@ -74,7 +74,7 @@ class UCL_Matrix { const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); } - + /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind1 parameter controls memory access from the host * - UCL_READ_WRITE - Specify that you will read and write from host @@ -92,9 +92,9 @@ class UCL_Matrix { const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { host.clear(); device.clear(); } /// Resize the allocation to contain cols elements @@ -106,10 +106,10 @@ class UCL_Matrix { return _ucl_s_obj_help< ucl_same_type::ans >:: dev_resize(device,host,_buffer,rows,cols); } - + /// Resize (only if bigger) the allocation to contain cols elements inline int resize_ib(const int new_rows, const int new_cols) - { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols); + { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols); else return UCL_SUCCESS; } /// Set each element to zero (asynchronously on device) @@ -118,14 +118,14 @@ class UCL_Matrix { inline void zero(const int n) { zero(n,cq()); } /// Set each element to zero (asynchronously on device) inline void zero(command_queue &cq) { - host.zero(); + host.zero(); if (device.kind()!=UCL_VIEW) device.zero(cq); else if (_buffer.numel()>0) _buffer.zero(); } /// Set first n elements to zero (asynchronously on device) - inline void zero(const int n, command_queue &cq) { - host.zero(n); - if (device.kind()!=UCL_VIEW) device.zero(n,cq); + inline void zero(const int n, command_queue &cq) { + host.zero(n); + if (device.kind()!=UCL_VIEW) device.zero(n,cq); else if (_buffer.numel()>0) _buffer.zero(); } @@ -136,26 +136,26 @@ class UCL_Matrix { /// Get the number of columns inline size_t cols() const { return host.cols(); } /// Get the memory usage (bytes) of the s-object (including any buffers) - inline size_t host_mem_usage() + inline size_t host_mem_usage() { return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); } /// Get the memory usage (bytes) of the s-object 
(including any buffers) - inline size_t device_mem_usage() + inline size_t device_mem_usage() { return device.row_bytes()*device.rows(); } - + /// Get element at index i inline hosttype & operator[](const int i) { return host[i]; } /// Get element at index i inline const hosttype & operator[](const int i) const { return host[i]; } - /// 2D access (row should always be 0) - inline hosttype & operator()(const int row, const int col) + /// 2D access (row should always be 0) + inline hosttype & operator()(const int row, const int col) { return host(row,col); } - /// 2D access (row should always be 0) + /// 2D access (row should always be 0) inline const hosttype & operator()(const int row, const int col) const { return host(row,col); } - + /// Returns pointer to memory pointer for allocation on host inline hosttype ** host_ptr() { return host.host_ptr(); } - + /// Return the default command queue/stream associated with this data inline command_queue & cq() { return host.cq(); } /// Change the default command queue associated with this data @@ -172,7 +172,7 @@ class UCL_Matrix { /// Update the allocation on the host asynchronously - inline void update_host() + inline void update_host() { _ucl_s_obj_help< ucl_same_type::ans >:: copy(host,device,_buffer,true); } /// Update the allocation on the host (true for asynchronous copy) @@ -202,7 +202,7 @@ class UCL_Matrix { /// Update the allocation on the device asynchronously - inline void update_device() + inline void update_device() { _ucl_s_obj_help< ucl_same_type::ans >:: copy(device,host,_buffer,true); } /// Update the allocation on the device (true for asynchronous copy) diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h index bdba8ff7ae..437631ec3a 100644 --- a/lib/gpu/geryon/ucl_nv_kernel.h +++ b/lib/gpu/geryon/ucl_nv_kernel.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -53,9 +53,9 @@ typedef struct _double4 double4; #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ -#define __global +#define __global #define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ +#define ucl_inline static __inline__ __device__ #endif diff --git a/lib/gpu/geryon/ucl_print.h b/lib/gpu/geryon/ucl_print.h index 87b3d3d7ff..98ae8a8c06 100644 --- a/lib/gpu/geryon/ucl_print.h +++ b/lib/gpu/geryon/ucl_print.h @@ -17,10 +17,10 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. 
----------------------------------------------------------------------- */ - + // Only allow this file to be included by nvc_memory.h and ocl_memory.h #ifdef UCL_PRINT_ALLOW @@ -40,7 +40,7 @@ template <> struct _ucl_print<1> { } template static inline void p(mat_type &mat, const size_t rows, const size_t cols, - std::ostream &out, const std::string delim, + std::ostream &out, const std::string delim, const std::string row_delim) { int offset=0; int row_size=cols; @@ -58,12 +58,12 @@ template <> struct _ucl_print<1> { } template static inline void p(const mat_type &mat,const size_t rows,const size_t cols, - std::ostream &out,const std::string delim, + std::ostream &out,const std::string delim, const std::string row_delim, UCL_Device &dev) { - p(mat,rows,cols,out,delim,row_delim); + p(mat,rows,cols,out,delim,row_delim); } }; - + template struct _ucl_print { template static inline void p(mat_type &mat, const size_t n, std::ostream &out, @@ -83,7 +83,7 @@ template struct _ucl_print { } template static inline void p(mat_type &mat, const size_t rows, const size_t cols, - std::ostream &out, const std::string delim, + std::ostream &out, const std::string delim, const std::string row_delim) { UCL_H_Vec temp; temp.alloc(mat.rows()*mat.cols(),mat); @@ -91,12 +91,12 @@ template struct _ucl_print { ucl_copy(temp,mat,rows*cols,false); else ucl_copy(temp,mat,rows,cols,false); - _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); + _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); } template - static inline void p(const mat_type &mat, const size_t rows, + static inline void p(const mat_type &mat, const size_t rows, const size_t cols,std::ostream &out, - const std::string delim, + const std::string delim, const std::string row_delim, UCL_Device &dev) { UCL_H_Vec temp; temp.alloc(mat.rows()*mat.cols(),dev); @@ -104,9 +104,9 @@ template struct _ucl_print { ucl_copy(temp,mat,rows*cols,false); else ucl_copy(temp,mat,rows,cols,false); - _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); + _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); } -}; +}; // ------------------------------------------------------------------------- // - Non-const routines that do not require a device object @@ -123,13 +123,13 @@ inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out, } _ucl_print::p(mat,n,out,delim); } - + /// Outputs n elements of mat delimited by a space template inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) { ucl_print(mat,n,out," "); } - + /// Outputs n elements of mat delimited by a space to standard out template inline void ucl_print(mat_type &mat, const size_t n) { @@ -139,8 +139,8 @@ inline void ucl_print(mat_type &mat, const size_t n) { /// Outputs upper left rows and cols of mat delimited by the string delim template inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, - std::ostream &out, const std::string delim, - const std::string row_delim) { + std::ostream &out, const std::string delim, + const std::string row_delim) { if (rows*cols>mat.numel()) { std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix " << "that only has " << mat.numel() << " elements."; @@ -148,17 +148,17 @@ inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, } _ucl_print::p(mat,rows,cols,out,delim,row_delim); } - + /// Outputs upper left rows and cols of mat delimited by a space template inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, std::ostream &out) { ucl_print(mat,rows,cols,out," 
","\n"); } - + /// Outputs upper left rows and cols of mat delimited by a space to std out template -inline void ucl_print(mat_type &mat, const size_t rows, +inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols) { ucl_print(mat,rows,cols,std::cout," ","\n"); } @@ -177,7 +177,7 @@ inline void ucl_print(mat_type &mat, std::ostream &out) { else ucl_print(mat,mat.rows(),mat.cols(),out," ","\n"); } - + // ------------------------------------------------------------------------- // - Const routines that do not require a device object // ------------------------------------------------------------------------- @@ -193,14 +193,14 @@ inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out, } _ucl_print::p(mat,n,out,delim,dev); } - + /// Outputs n elements of mat delimited by a space template inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out, UCL_Device &dev) { ucl_print(mat,n,out," ",dev); } - + /// Outputs n elements of mat delimited by a space to standard out template inline void ucl_print(const mat_type &mat, const size_t n, @@ -211,7 +211,7 @@ inline void ucl_print(const mat_type &mat, const size_t n, /// Outputs upper left rows and cols of mat delimited by the string delim template inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, - std::ostream &out, const std::string delim, + std::ostream &out, const std::string delim, const std::string row_delim, UCL_Device &dev) { if (rows*cols>mat.numel()) { std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix " @@ -220,17 +220,17 @@ inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, } _ucl_print::p(mat,rows,cols,out,delim,row_delim,dev); } - + /// Outputs upper left rows and cols of mat delimited by a space template inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, std::ostream &out, UCL_Device &dev) { ucl_print(mat,rows,cols,out," ","\n",dev); } - + /// Outputs upper left rows and cols of mat delimited by a space to std out template -inline void ucl_print(const mat_type &mat, const size_t rows, +inline void ucl_print(const mat_type &mat, const size_t rows, const size_t cols, UCL_Device &dev) { ucl_print(mat,rows,cols,std::cout," ","\n",dev); } @@ -256,27 +256,27 @@ inline void ucl_print(const mat_type &mat, std::ostream &out, UCL_Device &dev) { template inline std::ostream & operator << (std::ostream &out, UCL_H_Vec &mat) - { ucl_print(mat,out); return out; } + { ucl_print(mat,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_H_Mat &mat) - { ucl_print(mat,out); return out; } + { ucl_print(mat,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_D_Vec &mat) - { ucl_print(mat,out); return out; } + { ucl_print(mat,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_D_Mat &mat) - { ucl_print(mat,out); return out; } + { ucl_print(mat,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_Vector &mat) - { ucl_print(mat.host,out); return out; } + { ucl_print(mat.host,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_Matrix &mat) - { ucl_print(mat.host,out); return out; } + { ucl_print(mat.host,out); return out; } #endif diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h index 0b8c0251c1..a10f3cdb3f 100644 --- a/lib/gpu/geryon/ucl_s_obj_help.h +++ b/lib/gpu/geryon/ucl_s_obj_help.h @@ 
-3,7 +3,7 @@ ------------------- W. Michael Brown - Helper routines for allocating memory for s-objects and performing + Helper routines for allocating memory for s-objects and performing host/device updates. (Different routines depending on whether the same type is used on the host and device). @@ -141,29 +141,29 @@ template <> struct _ucl_s_obj_help<1> { } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, + static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, const bool async) { ucl_copy(dst,src,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, + static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, command_queue &cq) { ucl_copy(dst,src,cols,cq); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 &buffer, const bool async) { ucl_copy(dst,src,rows,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 &buffer, command_queue &cq) { ucl_copy(dst,src,rows,cols,cq); } - + template static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { if (device.kind()==UCL_VIEW) { @@ -181,7 +181,7 @@ template <> struct _ucl_s_obj_help<1> { } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, + static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, const int cols) { if (device.kind()==UCL_VIEW) { device.view(host); @@ -255,7 +255,7 @@ template struct _ucl_s_obj_help { e1=_buffer.alloc(cols,cq,kind1); if (e1!=UCL_SUCCESS) return e1; - return device.alloc(cols,cq,kind2); + return device.alloc(cols,cq,kind2); } } @@ -314,7 +314,7 @@ template struct _ucl_s_obj_help { e1=_buffer.alloc(rows,cols,cq,kind1); if (e1!=UCL_SUCCESS) return e1; - return device.alloc(rows,cols,cq,kind2); + return device.alloc(rows,cols,cq,kind2); } } @@ -329,25 +329,25 @@ template struct _ucl_s_obj_help { } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, + static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, const bool async) { ucl_cast_copy(dst,src,cols,buffer,async); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, + static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, command_queue &cq) { ucl_cast_copy(dst,src,cols,buffer,cq); } - + template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 &buffer, const bool async) { ucl_cast_copy(dst,src,rows,cols,buffer,async); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 &buffer, command_queue &cq) { ucl_cast_copy(dst,src,rows,cols,buffer,cq); } @@ -373,7 +373,7 @@ template struct _ucl_s_obj_help { } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, + static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, const int cols) { int err=buff.resize(rows,cols); if (err!=UCL_SUCCESS) diff --git a/lib/gpu/geryon/ucl_types.h b/lib/gpu/geryon/ucl_types.h index 46be4bcb06..cb3dce8430 100644 --- a/lib/gpu/geryon/ucl_types.h +++ b/lib/gpu/geryon/ucl_types.h @@ -17,7 +17,7 @@ /* 
----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -26,65 +26,65 @@ // Assign an integer id based on the data type: (int, float, double, etc) template struct _UCL_DATA_ID; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=1 }; - static inline const char * name() { return "double"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=double"; } + static inline const char * name() { return "double"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=double"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=2 }; - static inline const char * name() { return "float"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=float"; } + static inline const char * name() { return "float"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=float"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=3 }; - static inline const char * name() { return "unsigned"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; } + static inline const char * name() { return "unsigned"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=4 }; - static inline const char * name() { return "int"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=int"; } + static inline const char * name() { return "int"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=int"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=5 }; - static inline const char * name() { return "char"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=char"; } + static inline const char * name() { return "char"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=char"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=6 }; - static inline const char * name() { return "unsigned char"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; } + static inline const char * name() { return "unsigned char"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=7 }; - static inline const char * name() { return "short"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=short"; } + static inline const char * name() { return "short"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=short"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=8 }; - static inline const char * name() { return "unsigned short"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; } + static inline const char * name() { return "unsigned short"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=9 }; - static inline const char * name() { return "long"; } 
- static inline const char * numtyp_flag() { return "-D NUMTYP=long"; } + static inline const char * name() { return "long"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=long"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=10 }; - static inline const char * name() { return "unsigned long"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; } + static inline const char * name() { return "unsigned long"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=11 }; - static inline const char * name() { return "long double"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; } + static inline const char * name() { return "long double"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; } }; -template struct _UCL_DATA_ID { +template struct _UCL_DATA_ID { enum { id=0 }; - static inline const char * name() { return "error_type"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; } + static inline const char * name() { return "error_type"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; } }; // Host memory allocation types @@ -97,7 +97,7 @@ enum UCL_MEMOPT { UCL_NOT_SPECIFIED }; -enum UCL_DEVICE_TYPE { +enum UCL_DEVICE_TYPE { UCL_DEFAULT, ///< Unknown device type UCL_CPU, ///< Device is a CPU UCL_GPU, ///< Device is a GPU @@ -111,7 +111,7 @@ enum UCL_ERROR_FLAG { UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found UCL_COMPILE_ERROR, ///< Error compiling kernel UCL_MEMORY_ERROR -}; +}; template const char * ucl_template_name() { return _UCL_DATA_ID::name(); } diff --git a/lib/gpu/geryon/ucl_vector.h b/lib/gpu/geryon/ucl_vector.h index 89f1528969..7fe2604de6 100644 --- a/lib/gpu/geryon/ucl_vector.h +++ b/lib/gpu/geryon/ucl_vector.h @@ -34,25 +34,25 @@ class UCL_Vector { ROW_MAJOR = 1, VECTOR = 1 }; - typedef hosttype data_type; + typedef hosttype data_type; /// Host Allocation UCL_H_Vec host; - + /// Device Allocation UCL_D_Vec device; - + UCL_Vector() { } ~UCL_Vector() { } /// Construct with n columns /** \sa alloc() **/ - UCL_Vector(const size_t cols, UCL_Device &acc, + UCL_Vector(const size_t cols, UCL_Device &acc, const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,acc,kind1,kind2); } - + /// Set up the vector with 'cols' columns and reserve memory /** The kind1 parameter controls memory access from the host * - UCL_READ_WRITE - Specify that you will read and write from host @@ -89,12 +89,12 @@ class UCL_Vector { * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t cols, UCL_Device &acc, const enum UCL_MEMOPT kind1=UCL_READ_WRITE, - const enum UCL_MEMOPT kind2=UCL_READ_WRITE) + const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,acc,kind1,kind2); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { host.clear(); device.clear(); } /// Resize the allocation to contain cols elements @@ -106,7 +106,7 @@ class UCL_Vector { return _ucl_s_obj_help< ucl_same_type::ans >:: dev_resize(device,host,_buffer,cols); } - + /// Resize (only if bigger) the allocation to contain cols elements inline int resize_ib(const int new_cols) { if 
(new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; } @@ -117,14 +117,14 @@ class UCL_Vector { inline void zero(const int n) { zero(n,cq()); } /// Set each element to zero (asynchronously on device) inline void zero(command_queue &cq) { - host.zero(); + host.zero(); if (device.kind()!=UCL_VIEW) device.zero(cq); else if (_buffer.numel()>0) _buffer.zero(); } /// Set first n elements to zero (asynchronously on device) - inline void zero(const int n, command_queue &cq) { - host.zero(n); - if (device.kind()!=UCL_VIEW) device.zero(n,cq); + inline void zero(const int n, command_queue &cq) { + host.zero(n); + if (device.kind()!=UCL_VIEW) device.zero(n,cq); else if (_buffer.numel()>0) _buffer.zero(); } @@ -135,27 +135,27 @@ class UCL_Vector { /// Get the number of columns inline size_t cols() const { return host.cols(); } /// Get the memory usage (bytes) of the s-object (including any buffers) - inline size_t host_mem_usage() + inline size_t host_mem_usage() { return host.row_bytes()+_buffer.row_bytes(); } /// Get the memory usage (bytes) of the s-object (including any buffers) - inline size_t device_mem_usage() + inline size_t device_mem_usage() { return device.row_bytes(); } - - + + /// Get element at index i inline hosttype & operator[](const int i) { return host[i]; } /// Get element at index i inline const hosttype & operator[](const int i) const { return host[i]; } - /// 2D access (row should always be 0) - inline hosttype & operator()(const int row, const int col) + /// 2D access (row should always be 0) + inline hosttype & operator()(const int row, const int col) { return host[col]; } - /// 2D access (row should always be 0) + /// 2D access (row should always be 0) inline const hosttype & operator()(const int row, const int col) const { return host[col]; } - + /// Returns pointer to memory pointer for allocation on host inline hosttype ** host_ptr() { return host.host_ptr(); } - + /// Return the default command queue/stream associated with this data inline command_queue & cq() { return host.cq(); } /// Change the default command queue associated with this data @@ -172,7 +172,7 @@ class UCL_Vector { /// Update the allocation on the host asynchronously - inline void update_host() + inline void update_host() { _ucl_s_obj_help< ucl_same_type::ans >:: copy(host,device,_buffer,true); } /// Update the allocation on the host (true for asynchronous copy) @@ -202,7 +202,7 @@ class UCL_Vector { /// Update the allocation on the device asynchronously - inline void update_device() + inline void update_device() { _ucl_s_obj_help< ucl_same_type::ans >:: copy(device,host,_buffer,true); } /// Update the allocation on the device (true for asynchronous copy) diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index dd0b5d2424..bd8c7ef843 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,7 +24,7 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false), } template -int AnswerT::bytes_per_atom() const { +int AnswerT::bytes_per_atom() const { int bytes=11*sizeof(acctyp); if (_rot) bytes+=4*sizeof(acctyp); @@ -38,19 +38,19 @@ bool AnswerT::alloc(const int inum) { _max_local=static_cast(static_cast(inum)*1.10); bool success=true; - + _ans_fields=4; if (_rot) _ans_fields+=4; - + // 
--------------------------- Device allocations success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); - - _allocated=true; + + _allocated=true; return success; } @@ -69,21 +69,21 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot, if (_charge) _e_fields++; _ev_fields=6+_e_fields; - + // Initialize atom and nbor data int ef_inum=inum; if (ef_inum==0) ef_inum=1000; - + // Initialize timers for the selected device time_answer.init(*dev); time_answer.zero(); _time_cast=0.0; _time_cpu_idle=0.0; - + return success && alloc(ef_inum); } - + template bool AnswerT::add_fields(const bool charge, const bool rot) { bool realloc=false; @@ -127,15 +127,15 @@ void AnswerT::clear() { template double AnswerT::host_memory_usage() const { int atom_bytes=4; - if (_charge) + if (_charge) atom_bytes+=1; - if (_rot) + if (_rot) atom_bytes+=4; int ans_bytes=atom_bytes+_ev_fields; return ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(Answer); } - + template void AnswerT::copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom) { @@ -144,8 +144,8 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag, _vflag=vflag; _ef_atom=ef_atom; _vf_atom=vf_atom; - - int csize=_ev_fields; + + int csize=_ev_fields; if (!eflag) csize-=_e_fields; if (!vflag) @@ -180,7 +180,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom, for (int i=0; i<_inum; i++) evdwl+=engv[i]; if (_ef_atom) - if (_ilist==NULL) + if (_ilist==NULL) for (int i=0; i<_inum; i++) eatom[i]+=engv[i]; else @@ -196,18 +196,18 @@ double AnswerT::energy_virial(double *eatom, double **vatom, if (_vf_atom) if (_ilist==NULL) { int ii=0; - for (int i=vstart; i -int AtomT::bytes_per_atom() const { +int AtomT::bytes_per_atom() const { int id_space=0; if (_gpu_nbor==1) id_space=2; @@ -51,7 +51,7 @@ bool AtomT::alloc(const int nall) { _max_atoms=static_cast(static_cast(nall)*1.10); bool success=true; - + // Ignore host/device transfers? _host_view=false; if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) { @@ -60,11 +60,11 @@ bool AtomT::alloc(const int nall) { assert(0==1); #endif } - + // Allocate storage for CUDPP sort #ifdef USE_CUDPP if (_gpu_nbor==1) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -110,7 +110,7 @@ bool AtomT::alloc(const int nall) { } else { success=success && (host_particle_id.alloc(_max_atoms,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && + success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); } if (_gpu_nbor==2 && _host_view) @@ -124,8 +124,8 @@ bool AtomT::alloc(const int nall) { gpu_bytes+=x.device.row_bytes(); if (gpu_bytes>_max_gpu_bytes) _max_gpu_bytes=gpu_bytes; - - _allocated=true; + + _allocated=true; return success; } @@ -135,7 +135,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, bool success=true; // Ignore host/device transfers? 
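Both Answer::alloc() and Atom::alloc() in the hunks above size their device buffers from the requested count padded by 10% (the static_cast of inum or nall times 1.10), so small growth in the atom count does not trigger a reallocation every timestep. A self-contained sketch of that growth policy (helper names are illustrative, not LAMMPS API):

    // Hypothetical sketch of the 10%-headroom growth policy used by
    // Answer::alloc()/Atom::alloc() above.
    #include <cstddef>

    inline std::size_t padded_capacity(std::size_t n) {
      return static_cast<std::size_t>(static_cast<double>(n) * 1.10);
    }

    // Returns true when the caller must reallocate its buffers.
    inline bool ensure_capacity(std::size_t needed, std::size_t &capacity) {
      if (needed <= capacity) return false;  // still fits, no realloc
      capacity = padded_capacity(needed);    // grow with headroom
      return true;
    }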
int gpu_bytes=0; - + if (charge && _charge==false) { _charge=true; _other=true; @@ -179,7 +179,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, _gpu_nbor=gpu_nbor; #ifdef USE_CUDPP if (_gpu_nbor==1) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -198,9 +198,9 @@ bool AtomT::add_fields(const bool charge, const bool rot, } else { success=success && (host_particle_id.alloc(_max_atoms,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && + success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); - } + } } return success; @@ -230,7 +230,7 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - + // Initialize timers for the selected device time_pos.init(*dev); time_q.init(*dev); @@ -241,14 +241,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_quat.zero(); time_vel.zero(); _time_cast=0.0; - + #ifdef GPU_CAST compile_kernels(*dev); #endif - + return success && alloc(ef_nall); } - + template void AtomT::clear_resize() { if (!_allocated) @@ -274,7 +274,7 @@ void AtomT::clear_resize() { #ifdef USE_CUDPP if (_gpu_nbor==1) cudppDestroyPlan(sort_plan); #endif - + if (_gpu_nbor==2) { host_particle_id.clear(); host_cell_id.clear(); @@ -305,21 +305,21 @@ void AtomT::clear() { template double AtomT::host_memory_usage() const { int atom_bytes=4; - if (_charge) + if (_charge) atom_bytes+=1; - if (_rot) + if (_rot) atom_bytes+=4; - if (_vel) + if (_vel) atom_bytes+=4; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } - + // Sort arrays for neighbor list calculation template void AtomT::sort_neighbor(const int num_atoms) { #ifdef USE_CUDPP - CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), - (int *)dev_particle_id.begin(), + CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), + (int *)dev_particle_id.begin(), 8*sizeof(unsigned), num_atoms); if (CUDPP_SUCCESS != result) { printf("Error in cudppSort\n"); diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu index 2a78719ffb..28ff31c566 100644 --- a/lib/gpu/lal_atom.cu +++ b/lib/gpu/lal_atom.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,9 +17,9 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, +__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, const __global double *restrict x, - const __global int *restrict type, + const __global int *restrict type, const int nall) { int ii=GLOBAL_ID_X; diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 23112fe712..1b4e17d972 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -57,19 +57,19 @@ class Atom { /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } - + /// Memory usage per atom in this class - int 
bytes_per_atom() const; + int bytes_per_atom() const; /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions * \param gpu_nbor 0 if neighboring will be performed on host * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ - bool init(const int nall, const bool charge, const bool rot, - UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, + bool init(const int nall, const bool charge, const bool rot, + UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, const bool vel=false); - + /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ inline bool resize(const int nall, bool &success) { @@ -81,7 +81,7 @@ class Atom { } return _resized; } - + /// If already initialized by another LAMMPS style, add fields as necessary /** \param rot True if atom storage needs quaternions * \param gpu_nbor 0 if neighboring will be performed on host @@ -89,28 +89,28 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, const bool bonds, const bool vel=false); - + /// Returns true if GPU is using charges bool charge() { return _charge; } - + /// Returns true if GPU is using quaternions bool quaternion() { return _rot; } - + /// Returns true if GPU is using velocities bool velocity() { return _vel; } /// Only free matrices of length inum or nall for resizing void clear_resize(); - + /// Free all memory on host and device void clear(); - + /// Return the total amount of host memory used by class in bytes double host_memory_usage() const; /// Sort arrays for neighbor list calculation on device void sort_neighbor(const int num_atoms); - + /// Add copy times to timers inline void acc_timers() { time_pos.add_to_total(); @@ -150,18 +150,18 @@ class Atom { total+=time_vel.total_seconds(); time_vel.zero_total(); } - + return total+_time_transfer/1000.0; } - + /// Return the total time for data cast/pack /** Zeros the time so that atom times are only included once **/ - inline double cast_time() + inline double cast_time() { double t=_time_cast; _time_cast=0.0; return t; } /// Pack LAMMPS atom type constants into matrix and copy to device template - inline void type_pack1(const int n, const int m_size, + inline void type_pack1(const int n, const int m_size, UCL_D_Vec &dev_v, UCL_H_Vec &buffer, t1 **one) { int ii=0; @@ -215,7 +215,7 @@ class Atom { view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); ucl_copy(dev_v,view,false); } - + /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device template inline void type_pack4(const int n, const int m_size, @@ -239,7 +239,7 @@ class Atom { /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device template - inline void self_pack2(const int n, UCL_D_Vec &dev_v, + inline void self_pack2(const int n, UCL_D_Vec &dev_v, UCL_H_Vec &buffer, t1 **one, t2 **two) { for (int i=0; i(one[i][i]); @@ -279,7 +279,7 @@ class Atom { /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - inline void add_x_data(double **host_ptr, int *host_type) { + inline void add_x_data(double **host_ptr, int *host_type) { time_pos.start(); if (_x_avail==false) { #ifdef GPU_CAST @@ -376,7 +376,7 @@ class Atom { /// Copy velocities and tags to device asynchronously /** Copies nall() elements **/ - inline void add_v_data(double 
**host_ptr, tagint *host_tag) { + inline void add_v_data(double **host_ptr, tagint *host_tag) { time_vel.start(); if (_v_avail==false) { #ifdef GPU_CAST @@ -407,8 +407,8 @@ class Atom { inline void add_transfer_time(double t) { _time_transfer+=t; } /// Return number of bytes used on device - inline double max_gpu_bytes() - { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } + inline double max_gpu_bytes() + { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } /// Returns true if the device is addressing memory on the host inline bool host_view() { return _host_view; } @@ -422,7 +422,7 @@ class Atom { /// Quaterions UCL_Vector quat; /// Velocities - UCL_Vector v; + UCL_Vector v; #ifdef GPU_CAST UCL_Vector x_cast; @@ -436,7 +436,7 @@ class Atom { /// Atom tag information for device nbor builds UCL_D_Vec dev_tag; - + /// Cell list identifiers for hybrid nbor builds UCL_H_Vec host_cell_id; /// Cell list identifiers for hybrid nbor builds @@ -444,7 +444,7 @@ class Atom { /// Device timers UCL_Timer time_pos, time_q, time_quat, time_vel; - + /// Geryon device UCL_Device *dev; @@ -456,19 +456,19 @@ class Atom { #endif bool _compiled; - + // True if data has been copied to device already bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; bool alloc(const int nall); - + bool _allocated, _rot, _charge, _bonds, _vel, _other; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; - + double _max_gpu_bytes; - + #ifdef USE_CUDPP CUDPPConfiguration sort_config; CUDPPHandle sort_plan; diff --git a/lib/gpu/lal_balance.h b/lib/gpu/lal_balance.h index cf09cf86fb..e90e94bee1 100644 --- a/lib/gpu/lal_balance.h +++ b/lib/gpu/lal_balance.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -44,7 +44,7 @@ class Balance { _init_done=false; } } - + /// Return the timestep since initialization inline int timestep() { return _timestep; } @@ -96,7 +96,7 @@ class Balance { inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } } /// Calculate the new host/device split based on the cpu and device times - /** \note Only does calculation every _HD_BALANCE_EVERY timesteps + /** \note Only does calculation every _HD_BALANCE_EVERY timesteps (and first 10) **/ inline void balance(const double cpu_time); @@ -105,13 +105,13 @@ class Balance { balance(cpu_time); return get_gpu_count(ago,inum_full); } - + private: Device *_device; UCL_Timer _device_time; bool _init_done; int _gpu_nbor; - + bool _load_balance; double _actual_split, _avg_split, _desired_split, _max_split; int _avg_count; @@ -123,15 +123,15 @@ class Balance { #define BalanceT Balance template -void BalanceT::init(Device *gpu, +void BalanceT::init(Device *gpu, const int gpu_nbor, const double split) { clear(); _gpu_nbor=gpu_nbor; _init_done=true; - + _device=gpu; _device_time.init(*gpu->gpu); - + if (split<0.0) { _load_balance=true; _desired_split=0.90; @@ -163,7 +163,7 @@ int BalanceT::get_gpu_count(const int ago, const int inum_full) { _timestep++; return _inum; } - + template void BalanceT::balance(const double cpu_time) { if (_measure_this_step) { diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index 191f218bd8..e59dae1a6f 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -9,10 +9,10 @@ This file is part of the LAMMPS 
Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ - + #include "lal_base_atomic.h" using namespace LAMMPS_AL; #define BaseAtomicT BaseAtomic @@ -63,13 +63,13 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); if (success!=0) return success; - + ucl_device=device->gpu; atom=&device->atom; @@ -139,7 +139,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist, double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; - + return ilist; } @@ -188,7 +188,7 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -217,7 +217,7 @@ template int ** BaseAtomicT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -230,12 +230,12 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -255,7 +255,7 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h index eaf55f46e2..e3e9829abc 100644 --- a/lib/gpu/lal_base_atomic.h +++ b/lib/gpu/lal_base_atomic.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -41,7 +41,7 @@ class BaseAtomic { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -49,8 +49,8 @@ class BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init_atomic(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, const void *pair_program, const char *k_name); /// Estimate the overhead for GPU context changes and CPU driver @@ -80,7 +80,7 @@ class BaseAtomic { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + 
inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -119,7 +119,7 @@ class BaseAtomic { /// Build neighbor list on device void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success); /// Pair loop with host neighboring @@ -133,19 +133,19 @@ class BaseAtomic { int * compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); /// Pair loop with device neighboring int ** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index e7fe2b62f4..c6341f7d57 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -64,7 +64,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); @@ -153,7 +153,7 @@ template inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -192,7 +192,7 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -226,7 +226,7 @@ template int** BaseChargeT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -240,12 +240,12 @@ int** BaseChargeT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); 
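The compute() bookkeeping seen throughout these hunks (inum from the balancer, host_start=inum) implements a host/device split: the device handles the first inum atoms and the host picks up the remainder. A self-contained sketch of that partition arithmetic, not the LAMMPS API; the 0.90 figure is the _desired_split default visible in Balance above:

    // Hypothetical sketch: how a host/device split like the one hd_balancer
    // maintains partitions the work; host_start==inum as in compute() above.
    #include <cstdio>

    int gpu_count(int inum_full, double split) {
      return static_cast<int>(split * inum_full); // device share of atoms
    }

    int main() {
      const int inum_full = 1000;
      const double split = 0.90;     // _desired_split default in Balance
      const int inum = gpu_count(inum_full, split);
      std::printf("device: [0,%d)  host: [%d,%d)\n", inum, inum, inum_full);
      return 0;
    }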
host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -271,7 +271,7 @@ int** BaseChargeT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index e791507432..64c19554b9 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -42,7 +42,7 @@ class BaseCharge { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -83,7 +83,7 @@ class BaseCharge { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -137,12 +137,12 @@ class BaseCharge { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 12e3b20d96..478f0092c7 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -65,7 +65,7 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); @@ -155,7 +155,7 @@ template inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -194,7 +194,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -230,12 +230,12 @@ template int** BaseDipoleT::compute(const int ago, const int 
inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double *host_q, double **host_mu, + double *host_q, double **host_mu, double *boxlo, double *prd) { acc_timers(); if (inum_full==0) { @@ -245,12 +245,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -279,7 +279,7 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h index 2e495c8747..b51c4303cf 100644 --- a/lib/gpu/lal_base_dipole.h +++ b/lib/gpu/lal_base_dipole.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -40,7 +40,7 @@ class BaseDipole { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -82,7 +82,7 @@ class BaseDipole { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -136,12 +136,12 @@ class BaseDipole { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double **mu, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index 0efb68a9fb..941f463b14 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -64,7 +64,7 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom,true); @@ -153,7 +153,7 @@ template inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double 
*subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -182,7 +182,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, tagint *tag, double **host_v, + bool &success, tagint *tag, double **host_v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { acc_timers(); @@ -193,7 +193,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -228,12 +228,12 @@ template int** BaseDPDT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd) { acc_timers(); @@ -244,12 +244,12 @@ int** BaseDPDT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -276,7 +276,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 97640ed40e..7a75282d0a 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -40,7 +40,7 @@ class BaseDPD { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -81,7 +81,7 @@ class BaseDPD { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -129,20 +129,20 @@ class BaseDPD { int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, tagint *tag, - double **v, const double dtinvsqrt, const int seed, + double **v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd); /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, 
bool &success, double **v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 4200c02e1c..8918a3140c 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -70,7 +70,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - + int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,true, 1); @@ -113,7 +113,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, return -8; if (_multiple_forms && gpu_nbor!=0) return -9; - + if (_multiple_forms) ans->force.zero(); @@ -142,7 +142,7 @@ void BaseEllipsoidT::clear_base() { // Output any timing information output_times(); host_olist.clear(); - + if (_compiled) { k_nbor_fast.clear(); k_nbor.clear(); @@ -156,7 +156,7 @@ void BaseEllipsoidT::clear_base() { delete lj_program; _compiled=false; } - + time_nbor1.clear(); time_ellipsoid.clear(); time_nbor2.clear(); @@ -230,7 +230,7 @@ void BaseEllipsoidT::output_times() { if (times[6]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); @@ -241,10 +241,10 @@ void BaseEllipsoidT::output_times() { } // --------------------------------------------------------------------------- -// Pack neighbors to limit thread divergence for lj-lj and ellipse +// Pack neighbors to limit thread divergence for lj-lj and ellipse // --------------------------------------------------------------------------- template -void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, +void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, const int inum, const int form_low, const int form_high, const bool shared_types, int ntypes) { @@ -264,18 +264,18 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, // Copy neighbor list from host // --------------------------------------------------------------------------- template -void BaseEllipsoidT::reset_nbors(const int nall, const int inum, +void BaseEllipsoidT::reset_nbors(const int nall, const int inum, const int osize, int *ilist, int *numj, int *type, int **firstneigh, bool &success) { success=true; - + int mn=nbor->max_nbor_loop(osize,numj,ilist); resize_atom(nall,success); resize_local(inum,0,mn,osize,success); if (!success) return; - + if (_multiple_forms) { int p=0; for (int i=0; i inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -354,7 +354,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, zero_timers(); return NULL; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); 
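Throughout these compute() entry points, hd_balancer implements the host/device work split that gpu_split configures: of the inum_full local particles, the first inum are assigned to the accelerator, and every path sets host_start=inum so the caller knows where the host's share begins. When the split is dynamic, the fraction is retuned from the measured host and device times. The following standalone C++ sketch shows the feedback idea only; the Balancer class, its damping constant, and its timing arguments are illustrative assumptions, not the library's actual hd_balancer interface.

#include <algorithm>
#include <cmath>

// Hypothetical stand-in for a dynamic host/device load balancer.
// On each retune it nudges the fraction of particles sent to the
// device so that measured host and device times converge.
class Balancer {
 public:
  explicit Balancer(double split = 0.5) : _split(split) {}

  // Decide how many of inum_full particles the device handles.
  int balance(int inum_full, double cpu_time, double gpu_time) {
    if (cpu_time > 0.0 && gpu_time > 0.0) {
      // Estimate the split that equalizes per-particle host and
      // device rates; damp the update to avoid oscillation.
      double target = _split * cpu_time /
                      (_split * cpu_time + (1.0 - _split) * gpu_time);
      _split = 0.75 * _split + 0.25 * target;
    }
    _split = std::min(1.0, std::max(0.05, _split));
    return static_cast<int>(std::ceil(_split * inum_full));
  }

 private:
  double _split;  // fraction of particles computed on the device
};

// Usage mirroring the pattern in the patch:
//   int inum = balancer.balance(inum_full, cpu_time, gpu_time);
//   host_start = inum;  // particles [inum, inum_full) stay on the host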
ans->inum(inum); @@ -394,7 +394,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { @@ -410,7 +410,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall ans->inum(inum); _last_ellipse=std::min(inum,_max_last_ellipse); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -419,7 +419,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall return NULL; atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); - } else { + } else { atom->cast_x_data(host_x,host_type); atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); @@ -444,9 +444,9 @@ double BaseEllipsoidT::host_memory_usage_base() const { } template -void BaseEllipsoidT::compile_kernels(UCL_Device &dev, +void BaseEllipsoidT::compile_kernels(UCL_Device &dev, const void *ellipsoid_string, - const void *lj_string, + const void *lj_string, const char *kname, const bool e_s) { if (_compiled) return; diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index e289430f43..7deeccbf44 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -42,7 +42,7 @@ class BaseEllipsoid { * \param gpu_split fraction of particles handled by device * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -68,7 +68,7 @@ class BaseEllipsoid { quat_tex.bind_float(atom->quat,4); lj_pos_tex.bind_float(atom->x,4); lj_quat_tex.bind_float(atom->quat,4); - } + } } /// Check if there is enough storage for neighbors and realloc if not @@ -78,7 +78,7 @@ class BaseEllipsoid { * \param olist_size size of list of particles from CPU neighboring * \note host_inum is 0 if the host is performing neighboring * \note if GPU is neighboring nlocal+host_inum=total number local particles - * \note if CPU is neighboring olist_size=total number of local particles + * \note if CPU is neighboring olist_size=total number of local particles * \note if GPU is neighboring olist_size=0 **/ inline void resize_local(const int nlocal, const int host_inum, const int max_nbors, const int olist_size, @@ -101,7 +101,7 @@ class BaseEllipsoid { /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear_base(); - + /// Output any timing information void output_times(); @@ -130,7 +130,7 @@ class BaseEllipsoid { ans->acc_timers(); } } - + /// Zero timers inline void zero_timers() { time_nbor1.zero(); @@ -148,9 +148,9 @@ class BaseEllipsoid { ans->zero_timers(); } - /// Pack neighbors to limit thread divergence for lj-lj and ellipse + /// Pack neighbors to limit thread divergence for lj-lj and ellipse void pack_nbors(const int GX, const int BX, const int start, const int inum, - const int form_low, const int form_high, + const int form_low, const int form_high, const bool shared_types, int ntypes); /// Copy neighbor list from host @@ -174,17 +174,17 @@ class BaseEllipsoid { int** compute(const int ago, const int inum_full, const int nall, double 
**host_x, int *host_type, double *sublo,
                 double *subhi, tagint *tag, int **nspecial,
-                tagint **special, const bool eflag, const bool vflag, 
-                const bool eatom, const bool vatom, int &host_start, 
+                tagint **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
                 int **ilist, int **numj, const double cpu_time,
                 bool &success, double **host_quat);
 
   /// Build neighbor list on accelerator
-  void build_nbor_list(const int inum, const int host_inum, const int nall, 
+  void build_nbor_list(const int inum, const int host_inum, const int nall,
                        double **host_x, int *host_type, double *sublo,
                        double *subhi, bool &success);
- 
-  // -------------------------- DEVICE DATA ------------------------- 
+
+  // -------------------------- DEVICE DATA -------------------------
 
   /// Device Properties and Atom and Neighbor storage
   Device<numtyp,acctyp> *device;
 
@@ -207,7 +207,7 @@ class BaseEllipsoid {
   /// Atom Data
   Atom<numtyp,acctyp> *atom;
 
-  // --------------------------- TYPE DATA -------------------------- 
+  // --------------------------- TYPE DATA --------------------------
 
   /// cut_form.x = cutsq, cut_form.y = form
   UCL_D_Vec<numtyp2> cut_form;
@@ -240,7 +240,7 @@ class BaseEllipsoid {
   double _gpu_overhead, _driver_overhead;
   UCL_D_Vec<int> *_nbor_data;
 
-  // True if we want to use fast GB-sphere or sphere-sphere calculations 
+  // True if we want to use fast GB-sphere or sphere-sphere calculations
   bool _multiple_forms;
   int **_host_form;
   int _last_ellipse, _max_last_ellipse;
diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp
index c41aad7b58..14f642e55b 100644
--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@@ -12,7 +12,7 @@
     begin : Tue April 2, 2013
     email : brownw@ornl.gov
  ***************************************************************************/
- 
+
 #include "lal_base_three.h"
 
 using namespace LAMMPS_AL;
 #define BaseThreeT BaseThree<numtyp, acctyp>
@@ -45,7 +45,7 @@ int BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const {
 #ifdef THREE_CONCURRENT
   b+=ans2->bytes_per_atom();
 #endif
-  return b; 
+  return b;
 }
 
 template <class numtyp, class acctyp>
@@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
     gpu_nbor=1;
   else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
     gpu_nbor=2;
+  _gpu_nbor=gpu_nbor;
 
   int _gpu_host=0;
   int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
@@ -76,7 +77,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
     _nbor_data=&(nbor->dev_nbor);
   if (_threads_per_atom*_threads_per_atom>device->warp_size())
     return -10;
- 
+
   int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
                            maxspecial,_gpu_host,max_nbors,cell_size,false,
                            _threads_per_atom);
@@ -93,7 +94,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
     return -3;
   ans2->cq(_end_command_queue);
 #endif
- 
+
   _block_pair=device->pair_block_size();
   _block_size=device->block_ellipse();
   compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
@@ -111,7 +112,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
 #ifdef THREE_CONCURRENT
   _max_an_bytes+=ans2->gpu_bytes();
 #endif
- 
+
   return 0;
 }
 
@@ -158,7 +159,7 @@ void BaseThreeT::clear_atomic() {
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
-                              int *ilist, int *numj, int **firstneigh, 
+                              int *ilist, int *numj, int **firstneigh,
                               bool &success) {
   success=true;
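The -10 return path above rejects threads_per_atom settings whose square exceeds the device warp size: in the three-body kernels, t_per_atom threads cooperate on one atom and iterate over pairs of its neighbors, so t_per_atom*t_per_atom lanes need to fit in a warp for the reductions to stay safe (that rationale is an inference from the check, not stated in the source). The same value also sets the kernel launch grid in the loop() methods elsewhere in this patch, via GX = ceil(inum/(BX/t_per_atom)). A minimal sketch with invented helper names:

#include <cmath>

// Grid sizing used by the loop() methods: each block of block_size
// threads covers block_size/t_per_atom atoms, since t_per_atom
// threads cooperate on a single atom's neighbor list.
inline int grid_size(int inum, int block_size, int t_per_atom) {
  const int atoms_per_block = block_size / t_per_atom;
  return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                    atoms_per_block));
}

// Mirror of the -10 validity check in init_three() above.
inline bool valid_three_body_tpa(int t_per_atom, int warp_size) {
  return t_per_atom * t_per_atom <= warp_size;
}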
@@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
   if (!success)
     return NULL;
 
-  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
+  // originally, the requirement that nall == nlist was enforced
+  // to allow direct indexing of neighbors of neighbors after re-arrangement
+//  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
+
+  // the requirement is now removed, allowing this to work within pair hybrid
+  nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
 
   double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 #ifdef THREE_CONCURRENT
@@ -176,7 +182,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
 #endif
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
- 
+
   return ilist;
 }
 
@@ -185,11 +191,11 @@
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
-                                       const int nall, double **host_x, 
-                                       int *host_type, double *sublo, 
-                                       double *subhi, tagint *tag, 
-                                       int **nspecial, tagint **special, 
-                                       bool &success) {
+                                       const int nall, double **host_x,
+                                       int *host_type, double *sublo,
+                                       double *subhi, tagint *tag,
+                                       int **nspecial, tagint **special,
+                                       bool &success) {
   success=true;
   resize_atom(inum,nall,success);
   resize_local(nall,host_inum,nbor->max_nbors(),success);
@@ -214,11 +220,11 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
+void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
                          const int nlist, double **host_x, int *host_type,
-                         int *ilist, int *numj, int **firstneigh, 
+                         int *ilist, int *numj, int **firstneigh,
                          const bool eflag, const bool vflag, const bool eatom,
-                         const bool vatom, int &host_start, 
+                         const bool vatom, int &host_start,
                          const double cpu_time, bool &success) {
   acc_timers();
   if (nlist==0) {
@@ -228,9 +234,9 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
     zero_timers();
     return;
   }
- 
+
   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(ago,nlocal,cpu_time);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
   ans->inum(inum);
 #ifdef THREE_CONCURRENT
   ans2->inum(inum);
 #endif
@@ -270,7 +276,7 @@ template <class numtyp, class acctyp>
 int ** BaseThreeT::compute(const int ago, const int inum_full,
                            const int nall, double **host_x, int *host_type,
                            double *sublo, double *subhi, tagint *tag,
-                           int **nspecial, tagint **special, const bool eflag, 
+                           int **nspecial, tagint **special, const bool eflag,
                            const bool vflag, const bool eatom,
                            const bool vatom, int &host_start,
                            int **ilist, int **jnum,
@@ -283,7 +289,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
     zero_timers();
     return NULL;
   }
- 
+
   hd_balancer.balance(cpu_time);
   int inum=hd_balancer.get_gpu_count(ago,inum_full);
   ans->inum(inum);
@@ -291,7 +297,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
   ans2->inum(inum);
 #endif
   host_start=inum;
- 
+
   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@@ -321,7 +327,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
   device->add_ans_object(ans2);
 #endif
   hd_balancer.stop_timer();
- 
+
   return nbor->host_jlist.begin()-host_start;
 }
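The functional change in this patch is the reset_nbors() body above: get_host3() required nlist == nall so that a neighbor's own neighbors could be reached by direct index after re-arrangement, while the generic get_host() repacks only the atoms actually present in this sub-style's list. That is what lets a three-body style run under pair_style hybrid, where each sub-style sees just a subset of atoms; the cost is that neighbors of neighbors must be located through the packed list rather than by direct nall-based indexing. A rough C++ sketch of such host-side repacking, with invented names in place of the real Neighbor internals:

#include <vector>

// Gather a host neighbor list (LAMMPS-style ilist/numj/firstneigh)
// into flat arrays suitable for a single upload to the device.
struct PackedNbors {
  std::vector<int> jlist;   // concatenated neighbor indices
  std::vector<int> offset;  // start of atom ii's neighbors in jlist
  std::vector<int> numj;    // neighbor count per listed atom
};

inline PackedNbors pack_host_nbors(int nlist, const int *ilist,
                                   const int *numj, int **firstneigh) {
  PackedNbors p;
  p.offset.reserve(nlist);
  p.numj.reserve(nlist);
  for (int ii = 0; ii < nlist; ++ii) {
    const int i = ilist[ii];            // only atoms in this sub-list
    p.offset.push_back(static_cast<int>(p.jlist.size()));
    p.numj.push_back(numj[i]);
    p.jlist.insert(p.jlist.end(), firstneigh[i], firstneigh[i] + numj[i]);
  }
  return p;
}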
@@ -352,7 +358,7 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
   k_three_end.cq(ucl_device->cq(_end_command_queue));
   k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
 #endif
- 
+
   _compiled=true;
 }
 
diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h
index 0af290469a..4f27ecdf92 100644
--- a/lib/gpu/lal_base_three.h
+++ b/lib/gpu/lal_base_three.h
@@ -44,7 +44,7 @@ class BaseThree {
    * \param gpu_split fraction of particles handled by device
    * \param k_two name for the kernel for 2-body force calculation
    * \param k_three name for the kernel for 3-body force calculation
-   * 
+   *
    * Returns:
    * - 0 if successfull
    * - -1 if fix gpu not found
@@ -53,8 +53,8 @@ class BaseThree {
    * - -5 Double precision is not supported on card
    * - -10 if invalid thread_per_atom setting **/
   int init_three(const int nlocal, const int nall, const int max_nbors,
-                 const int maxspecial, const double cell_size, 
-                 const double gpu_split, FILE *screen, 
+                 const int maxspecial, const double cell_size,
+                 const double gpu_split, FILE *screen,
                  const void *pair_program, const char *k_two,
                  const char *k_three_center, const char *k_three_end);
 
@@ -88,7 +88,7 @@ class BaseThree {
    * \note host_inum is 0 if the host is performing neighboring
    * \note nlocal+host_inum=total number local particles
    * \note olist_size=0 **/
-  inline void resize_local(const int inum, const int host_inum, 
+  inline void resize_local(const int inum, const int host_inum,
                            const int max_nbors, bool &success) {
     nbor->resize(inum,host_inum,max_nbors,success);
   }
@@ -133,33 +133,33 @@ class BaseThree {
   /// Build neighbor list on device
   int build_nbor_list(const int inum, const int host_inum,
                       const int nall, double **host_x, int *host_type,
-                      double *sublo, double *subhi, tagint *tag, int **nspecial, 
+                      double *sublo, double *subhi, tagint *tag, int **nspecial,
                       tagint **special, bool &success);
 
   /// Pair loop with host neighboring
-  void compute(const int f_ago, const int inum_full, const int nall, 
+  void compute(const int f_ago, const int inum_full, const int nall,
                const int nlist, double **host_x, int *host_type,
                int *ilist, int *numj, int **firstneigh, const bool eflag,
                const bool vflag, const bool eatom, const bool vatom,
                int &host_start, const double cpu_time, bool &success);
 
   /// Pair loop with device neighboring
-  int * compute(const int ago, const int inum_full, const int nall, 
+  int * compute(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, double *sublo,
                 double *subhi, tagint *tag, int **nspecial,
-                tagint **special, const bool eflag, const bool vflag, 
-                const bool eatom, const bool vatom, int &host_start, 
+                tagint **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
                 const double cpu_time, bool &success);
 
   /// Pair loop with device neighboring
   int ** compute(const int ago, const int inum_full, const int nall,
                  double **host_x, int *host_type, double *sublo,
                  double *subhi, tagint *tag, int **nspecial,
-                 tagint **special, const bool eflag, const bool vflag, 
-                 const bool eatom, const bool vatom, int &host_start, 
+                 tagint **special, const bool eflag, const bool vflag,
+                 const bool eatom, const bool vatom, int &host_start,
                  int **ilist, int **numj, const double cpu_time,
                  bool &success);
 
-  // -------------------------- DEVICE DATA ------------------------- 
+  // -------------------------- DEVICE DATA -------------------------
 
   /// Device Properties and Atom and Neighbor storage
   Device<numtyp,acctyp> *device;
@@ -186,7 +186,7 @@ class BaseThree {
   Answer<numtyp,acctyp> *ans;
 #ifdef THREE_CONCURRENT
   Answer<numtyp,acctyp> *ans2;
-  #endif 
+  #endif
 
   //
--------------------------- NBOR DATA ---------------------------- @@ -205,15 +205,16 @@ class BaseThree { protected: bool _compiled; int _block_pair, _block_size, _threads_per_atom, _end_command_queue; + int _gpu_nbor; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k_two, const char *k_three_center, const char *k_three_end); - virtual void loop(const bool _eflag, const bool _vflag, + virtual void loop(const bool _eflag, const bool _vflag, const int evatom) = 0; }; diff --git a/lib/gpu/lal_beck.cpp b/lib/gpu/lal_beck.cpp index 062c095957..165a02b71a 100644 --- a/lib/gpu/lal_beck.cpp +++ b/lib/gpu/lal_beck.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,17 +33,17 @@ BeckT::Beck() : BaseAtomic(), _allocated(false) { } template -BeckT::~Beck() { +BeckT::~Beck() { clear(); } - + template int BeckT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BeckT::init(const int ntypes, +int BeckT::init(const int ntypes, double **host_cutsq, double **host_aa, double **host_alpha, double **host_beta, double **host_AA, double **host_BB, @@ -126,7 +126,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index 7ccefd8859..7d72128b5f 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,7 +24,7 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_beck(const __global numtyp4 *restrict x_, +__kernel void k_beck(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict beck1, const __global numtyp4 *restrict beck2, const int lj_types, @@ -50,20 +50,20 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -133,7 +133,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -143,7 +143,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, beck1[tid]=beck1_in[tid]; beck2[tid]=beck2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -152,7 +152,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_beck.h b/lib/gpu/lal_beck.h index fa56db2402..db26bebeb0 100644 --- a/lib/gpu/lal_beck.h +++ b/lib/gpu/lal_beck.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library 
(LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Beck : public BaseAtomic { public: Beck(); - ~Beck(); + ~Beck(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class Beck : public BaseAtomic { double **host_aa, double **host_alpha, double **host_beta, double **host_AA, double **host_BB, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -67,7 +67,7 @@ class Beck : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp index 28ca0df346..226c2d477b 100644 --- a/lib/gpu/lal_beck_ext.cpp +++ b/lib/gpu/lal_beck_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, cell_size, gpu_split, screen); BLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ int ** beck_gpu_compute_n(const int ago, const int inum_full, return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void beck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 55cb24d3b0..7c1ed944d3 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BornT::Born() : BaseAtomic(), _allocated(false) { } template -BornT::~Born() { +BornT::~Born() { clear(); } - + template int BornT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,12 +44,12 @@ int BornT::bytes_per_atom(const int max_nbors) const { template int BornT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_born1, double **host_born2, + double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + 
const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -80,7 +80,7 @@ int BornT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_d,host_offset); + host_d,host_offset); cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, @@ -102,18 +102,18 @@ void BornT::reinit(const int ntypes, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_offset) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, host_born1,host_born2,host_born3); this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, - host_d,host_offset); + host_d,host_offset); } template @@ -151,7 +151,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -169,7 +169,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, - &cutsq_sigma, &_lj_types, &sp_lj, + &cutsq_sigma, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu index 5f917be846..0ca7fea5fe 100644 --- a/lib/gpu/lal_born.cu +++ b/lib/gpu/lal_born.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,16 +24,16 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_born(const __global numtyp4 *restrict x_, +__kernel void k_born(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp2 *restrict cutsq_sigma, - const int lj_types, - const __global numtyp *restrict sp_lj_in, + const int lj_types, + const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -51,20 +51,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); + energy+=factor_lj*(e-coeff2[mtype].w); } if (vflag>0) { virial[0] += delx*delx*force; @@ -113,20 +113,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_, } // if ii } 
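Every pair style in this library ships the plain kernel above plus a *_fast variant (k_born_fast follows) that is selected when the number of atom types fits in MAX_SHARED_TYPES: the block cooperatively stages the whole type-pair coefficient table in __local/shared memory, synchronizes, and then serves every coefficient lookup in the neighbor loop from on-chip storage; the energy-only table (coeff2 here) is staged only when eflag is set. A trimmed CUDA sketch of that staging pattern, with illustrative names in place of the Geryon portability macros:

#define MAX_SHARED_TYPES 8

__global__ void k_pair_fast_sketch(const float4 *coeff_in, int inum) {
  // Stage the per-type-pair coefficient table once per block.
  __shared__ float4 coeff[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
  const int tid = threadIdx.x;
  if (tid < MAX_SHARED_TYPES * MAX_SHARED_TYPES)
    coeff[tid] = coeff_in[tid];
  __syncthreads();  // table complete before any thread reads it

  const int ii = blockIdx.x * blockDim.x + tid;
  if (ii < inum) {
    // A real kernel would loop over neighbors here, reading
    // coeff[itype * MAX_SHARED_TYPES + jtype] from shared memory
    // instead of re-fetching global memory for every pair.
  }
}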
-__kernel void k_born_fast(const __global numtyp4 *restrict x_, +__kernel void k_born_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, + const __global numtyp4 *restrict coeff2_in, const __global numtyp2 *restrict cutsq_sigma, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -137,7 +137,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -146,7 +146,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); + energy+=factor_lj*(e-coeff2[mtype].w); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h index 6fed6461d2..685f4d87a9 100644 --- a/lib/gpu/lal_born.h +++ b/lib/gpu/lal_born.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Born : public BaseAtomic { public: Born(); - ~Born(); + ~Born(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,20 +38,20 @@ class Born : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_born1, double **host_born2, + double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, - double **host_d, double **host_sigma, + double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -77,7 
+77,7 @@ class Born : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp index 94becf8c69..68695c4938 100644 --- a/lib/gpu/lal_born_coul_long.cpp +++ b/lib/gpu/lal_born_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,17 +37,17 @@ template BornCoulLongT::~BornCoulLongT() { clear(); } - + template int BornCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **host_sigma, double **host_offset, +int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -84,12 +84,12 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_d,host_offset); - + host_d,host_offset); + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, host_cut_ljsq,host_sigma); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -142,7 +142,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,15 +157,15 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->q, + &nbor_pitch, &this->atom->q, &cutsq_sigma, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu index 3d74f2087a..4cb4ea448f 100644 --- a/lib/gpu/lal_born_coul_long.cu +++ b/lib/gpu/lal_born_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ 
texture q_tex; #define q_tex q_ #endif -__kernel void k_born_long(const __global numtyp4 *restrict x_, +__kernel void k_born_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - if (rsq < cut_coulsq) - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < coeff1[mtype].w) { - numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv - + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); - } // if ii -} - -__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const int t_per_atom) { - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); - - __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[8]; - if (tid<8) - sp_lj[tid]=sp_lj_in[tid]; - if (tid0) - coeff2[tid]=coeff2_in[tid]; - } - - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); - - if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1_in, + const __global numtyp4 
*restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp4 *restrict cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -59,12 +59,12 @@ class BornCoulLong : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.w = born3 UCL_D_Vec coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset UCL_D_Vec coeff2; - /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.z = sigma UCL_D_Vec cutsq_sigma; /// Special LJ values [0-3] and Special Coul values [4-7] @@ -73,7 +73,7 @@ class BornCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp index 382e9a2b2c..feb7472e74 100644 --- a/lib/gpu/lal_born_coul_long_ext.cpp +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,9 +30,9 @@ static BornCoulLong BORNCLMF; int 
borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, - double **sigma, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, + double **sigma, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { @@ -58,10 +58,10 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->world_barrier(); @@ -78,14 +78,14 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ void borncl_gpu_clear() { int** borncl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -112,8 +112,8 @@ int** borncl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void borncl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index 7615c1dd53..7ebd7b744f 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,17 +37,17 @@ template BornCoulWolfT::~BornCoulWolfT() { clear(); } - + template int BornCoulWolfT::bytes_per_atom(const int max_nbors) const { 
return this->bytes_per_atom_atomic(max_nbors); } template -int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **host_sigma, double **host_offset, +int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -84,12 +84,12 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_d,host_offset); - + host_d,host_offset); + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, host_cut_ljsq,host_sigma); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -144,7 +144,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,17 +157,17 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq_sigma, &_cut_coulsq, &_qqrd2e, - &_alf, &_e_shift, &_f_shift, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &_alf, &_e_shift, &_f_shift, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq_sigma, &_cut_coulsq, - &_qqrd2e, &_alf, &_e_shift, &_f_shift, + &_qqrd2e, &_alf, &_e_shift, &_f_shift, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu index e7706b408a..0dc7d08c63 100644 --- a/lib/gpu/lal_born_coul_wolf.cu +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -31,21 +31,21 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_born_wolf(const __global numtyp4 *restrict x_, +__kernel void k_born_wolf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + 
__global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp alf, const numtyp e_shift, + const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -67,20 +67,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -108,12 +108,12 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp v_sh = (numtyp)0.0; numtyp rexp = (numtyp)0.0; - + if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; - forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; @@ -147,7 +147,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -165,20 +165,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, +__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, + const __global numtyp4 *restrict coeff2_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp alf, const numtyp e_shift, + const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -193,7 +193,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -201,23 +201,23 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -244,12 +244,12 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp v_sh = (numtyp)0.0; numtyp rexp = (numtyp)0.0; - + if (rsq < 
cutsq_sigma[mtype].y) { numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; - forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; diff --git a/lib/gpu/lal_born_coul_wolf.h b/lib/gpu/lal_born_coul_wolf.h index 9e02d23233..4b2406b989 100644 --- a/lib/gpu/lal_born_coul_wolf.h +++ b/lib/gpu/lal_born_coul_wolf.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,19 +30,19 @@ class BornCoulWolf : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double alf, const double e_shift, @@ -60,12 +60,12 @@ class BornCoulWolf : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.w = born3 UCL_D_Vec coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset UCL_D_Vec coeff2; - /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.z = sigma UCL_D_Vec cutsq_sigma; /// Special LJ values [0-3] and Special Coul values [4-7] @@ -74,7 +74,7 @@ class BornCoulWolf : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift; diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index b56c526119..254b1c905b 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,7 +28,7 @@ static BornCoulWolf BORNCWMF; // Allocate memory on host and device and copy constants to device // 
--------------------------------------------------------------------------- int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, + double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -60,9 +60,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, - host_cut_coulsq, host_special_coul, qqrd2e, + host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->world_barrier(); @@ -79,15 +79,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, - host_cut_coulsq, host_special_coul, qqrd2e, + host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -104,7 +104,7 @@ void borncw_gpu_clear() { int** borncw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -114,8 +114,8 @@ int** borncw_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void borncw_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 6bd51e6d68..b1ebf5804c 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Born BORNMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, double **host_c, - double **host_d, double **sigma, + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **sigma, double **offset, double 
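The staggered initialization in borncw_gpu_init is worth a gloss: the first world rank initializes the style alone (compiling kernels, allocating tables), then the remaining ranks that share each accelerator take turns between barriers. A minimal MPI sketch of the same turn-taking idea, with printf standing in for BORNCWMF.init and the library's world/gpu barriers replaced by plain MPI_Barrier:

    #include <mpi.h>
    #include <cstdio>

    // Turn-taking device initialization: rank 0 goes first, then the other
    // ranks sharing the card go one at a time between barriers.
    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      int rank, size;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &size);
      if (rank == 0)
        std::printf("rank 0: init\n");      // the world_me==0 branch above
      MPI_Barrier(MPI_COMM_WORLD);          // world_barrier()
      for (int i = 1; i < size; i++) {
        if (rank == i)
          std::printf("rank %d: init\n", rank);
        MPI_Barrier(MPI_COMM_WORLD);        // gpu_barrier()
      }
      MPI_Finalize();
      return 0;
    }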
*special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -56,7 +56,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -75,13 +75,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BORNMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,24 +102,24 @@ void born_gpu_reinit(const int ntypes, double **host_rhoinv, int world_me=BORNMF.device->world_me(); int gpu_rank=BORNMF.device->gpu_rank(); int procs_per_gpu=BORNMF.device->procs_per_gpu(); - + if (world_me==0) BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, offset); - + BORNMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } void born_gpu_clear() { - BORNMF.clear(); + BORNMF.clear(); } int ** born_gpu_compute_n(const int ago, const int inum_full, @@ -132,8 +132,8 @@ int ** born_gpu_compute_n(const int ago, const int inum_full, return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void born_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index f66759ee3a..0da4068d51 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BuckT::Buck() : BaseAtomic(), _allocated(false) { } template -BuckT::~Buck() { +BuckT::~Buck() { clear(); } - + template int BuckT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,11 +44,11 @@ int BuckT::bytes_per_atom(const int max_nbors) const { template int BuckT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -79,7 +79,7 @@ int BuckT::init(const int ntypes, double 
**host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -95,14 +95,14 @@ template void BuckT::reinit(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, host_buck1,host_buck2,host_cutsq); this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, @@ -143,7 +143,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -154,13 +154,13 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index 955547e598..c1e1c7d7e2 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_buck(const __global numtyp4 *restrict x_, +__kernel void k_buck(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, + const __global numtyp4 *restrict coeff2, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); + energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,19 +111,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_buck_fast(const __global numtyp4 *restrict x_, +__kernel void k_buck_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 
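The grid sizing visible in BuckT::loop above follows one rule used throughout the library: a block of BX threads covers BX/t_per_atom atoms, so the grid needs ceil(inum/(BX/t_per_atom)) blocks. As a self-contained sketch (assuming t_per_atom evenly divides the block size, as the power-of-two thread counts here do):

    #include <cmath>

    // Blocks needed when t_per_atom threads cooperate on each atom.
    int grid_blocks(int inum, int block_size, int t_per_atom) {
      return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                        (block_size / t_per_atom)));
    }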
*restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -134,7 +134,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -143,7 +143,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); + energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h index ebcd72d990..3b84066355 100644 --- a/lib/gpu/lal_buck.h +++ b/lib/gpu/lal_buck.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Buck : public BaseAtomic { public: Buck(); - ~Buck(); + ~Buck(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,18 +38,18 @@ class Buck : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -72,7 +72,7 @@ class Buck : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp index bec640e7a6..e4f829fc5c 100644 --- a/lib/gpu/lal_buck_coul.cpp +++ b/lib/gpu/lal_buck_coul.cpp @@ -9,7 +9,7 @@ This 
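k_buck_fast illustrates the shared-memory variant used whenever the type count fits in MAX_SHARED_TYPES: each block stages the coefficient tables into local/shared memory once, and coeff2 (needed only for energies) is skipped when eflag is zero. A CUDA sketch of just that staging step, with illustrative names and a locally defined MAX_SHARED_TYPES:

    #define MAX_SHARED_TYPES 8

    struct numtyp4 { float x, y, z, w; };

    __global__ void stage_coeffs(const numtyp4 *coeff1_in,
                                 const numtyp4 *coeff2_in, int eflag) {
      __shared__ numtyp4 coeff1[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
      __shared__ numtyp4 coeff2[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
      int tid = threadIdx.x;
      if (tid < MAX_SHARED_TYPES * MAX_SHARED_TYPES) {
        coeff1[tid] = coeff1_in[tid];
        if (eflag > 0) coeff2[tid] = coeff2_in[tid];  // energy-only table
      }
      __syncthreads();  // tables now visible to the whole block
      // ... force loop would read coeff1[itype*MAX_SHARED_TYPES + jtype] ...
    }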
file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BuckCoulT::BuckCoul() : BaseCharge(), _allocated(false) { } template -BuckCoulT::~BuckCoul() { +BuckCoulT::~BuckCoul() { clear(); } - + template int BuckCoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,11 +44,11 @@ int BuckCoulT::bytes_per_atom(const int max_nbors) const { template int BuckCoulT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { @@ -81,21 +81,21 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_offset); - + host_offset); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq, host_cut_ljsq, host_cut_coulsq); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; host_write[i+4]=host_special_coul[i]; } ucl_copy(sp_lj,host_write,8,false); - + _qqrd2e = qqrd2e; - + _allocated=true; this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes(); return 0; @@ -135,7 +135,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,12 +147,12 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, &this->atom->q, + &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index 87604a02ea..6f0d414825 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul(const __global numtyp4 *restrict x_, +__kernel void 
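Note how BuckCoulT::init ships the special-bond factors: the four LJ and four Coulomb scale factors travel to the device as one 8-entry array, which the kernels unpack as sp_lj[0..3] and sp_lj[4..7]. A host-side sketch of that packing:

    #include <array>

    // Pack special_lj and special_coul into the single 8-slot array the
    // kernels index as sp_lj[0..3] (LJ) and sp_lj[4..7] (Coulomb).
    std::array<double, 8> pack_special(const double *special_lj,
                                       const double *special_coul) {
      std::array<double, 8> sp{};
      for (int i = 0; i < 4; i++) {
        sp[i] = special_lj[i];
        sp[i + 4] = special_coul[i];
      }
      return sp;
    }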
k_buck_coul(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp4 *restrict cutsq, + const __global numtyp4 *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -63,21 +63,21 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -180,7 +180,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_buck_coul.h b/lib/gpu/lal_buck_coul.h index e4bf59107c..3f8428bfe1 100644 --- a/lib/gpu/lal_buck_coul.h +++ b/lib/gpu/lal_buck_coul.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class BuckCoul : public BaseCharge { public: BuckCoul(); - ~BuckCoul(); + ~BuckCoul(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,11 +38,11 @@ class BuckCoul : public BaseCharge { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); @@ -71,11 +71,11 @@ class BuckCoul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + numtyp _qqrd2e; - + private: bool _allocated; void loop(const bool _eflag, const bool _vflag); diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index dd696fc6bb..e5a5e1315b 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
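The cutsq table packed in BuckCoulT::init carries three squared radii per type pair ({cutsq, cut_ljsq, cut_coulsq}), and k_buck_coul gates each contribution on its own cutoff. A minimal host-side sketch of that gating, with illustrative names (the kernel of course computes the Buckingham and Coulomb terms inline):

    struct PairCut { double both, lj, coul; };  // squared cutoffs

    // Accumulate only the contributions whose cutoff contains rsq.
    double gated_force(const PairCut &c, double rsq,
                       double flj, double fcoul) {
      if (rsq >= c.both) return 0.0;   // pair entirely out of range
      double f = 0.0;
      if (rsq < c.lj)   f += flj;      // Buckingham part
      if (rsq < c.coul) f += fcoul;    // Coulomb part
      return f;
    }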
__________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,8 +28,8 @@ static BuckCoul BUCKCMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, @@ -57,9 +57,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -77,14 +77,14 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,12 +96,12 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } void buckc_gpu_clear() { - BUCKCMF.clear(); + BUCKCMF.clear(); } int ** buckc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -111,8 +111,8 @@ int ** buckc_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void buckc_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp index 4aa720132a..81faada116 100644 --- a/lib/gpu/lal_buck_coul_long.cpp +++ b/lib/gpu/lal_buck_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template BuckCoulLongT::~BuckCoulLongT() { clear(); } - + template int BuckCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int BuckCoulLongT::bytes_per_atom(const int max_nbors) const 
{ template int BuckCoulLongT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, double **host_offset, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -83,11 +83,11 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_offset); - + host_offset); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -139,7 +139,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -150,16 +150,16 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq, &_cut_coulsq, &_qqrd2e, + &cutsq, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index fc68d12471..da3237a31f 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, +__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const 
numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - if (rsq < cut_coulsq) - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < coeff1[mtype].w) { - numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); - } // if ii -} - -__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, - const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald, - const int t_per_atom) { - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); - - __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[8]; - if (tid<8) - sp_lj[tid]=sp_lj_in[tid]; - if (tid0) - coeff2[tid]=coeff2_in[tid]; - } - - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); - - if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; + energy+=factor_lj*(e-coeff2[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, + const numtyp cut_coulsq, + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; 
i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,11 +38,11 @@ class BuckCoulLong : public BaseCharge { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -71,7 +71,7 @@ class BuckCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index 9c0c331ee1..28a89746b3 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,7 +28,7 @@ static BuckCoulLong BUCKCLMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, + double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -58,8 +58,8 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,13 +77,13 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -100,7 +100,7 @@ void buckcl_gpu_clear() { int** 
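The k_buck_coul_long kernels reindented above evaluate the real-space Ewald Coulomb term inside cut_coulsq; the _erfc quantity is computed device-side from the usual polynomial fit. A host-side sketch of the same force factor using std::erfc instead, with EWALD_F = 2/sqrt(pi) as defined in the library headers (special-bond scaling via factor_coul omitted for brevity):

    #include <cmath>

    // The quantity the kernels call forcecoul, i.e. -dE/dr times r; it is
    // multiplied by r2inv before scaling the displacement vector.
    double coul_long_force(double qqrd2e, double qi, double qj,
                           double r, double g_ewald) {
      const double EWALD_F = 1.12837917;   // 2/sqrt(pi)
      double grij = g_ewald * r;
      double expm2 = std::exp(-grij * grij);
      double prefactor = qqrd2e * qi * qj / r;
      return prefactor * (std::erfc(grij) + EWALD_F * grij * expm2);
    }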
buckcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -110,8 +110,8 @@ int** buckcl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void buckcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index 75c88e8dbe..336aab6d4c 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,8 +28,8 @@ static Buck BUCKMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -55,7 +55,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -73,12 +73,12 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,24 +98,24 @@ void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv, int world_me=BUCKMF.device->world_me(); int gpu_rank=BUCKMF.device->gpu_rank(); int procs_per_gpu=BUCKMF.device->procs_per_gpu(); - + if (world_me==0) BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset); - + BUCKMF.device->world_barrier(); for (int i=0; igpu_barrier(); } } void buck_gpu_clear() { - BUCKMF.clear(); + BUCKMF.clear(); } int ** buck_gpu_compute_n(const int ago, const int inum_full, @@ -128,8 +128,8 @@ int ** buck_gpu_compute_n(const int ago, const int inum_full, return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void buck_gpu_compute(const int ago, const int inum_full, const int nall, double 
**host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_cg_cmm.cpp b/lib/gpu/lal_cg_cmm.cpp index 96455888f0..d361e32b09 100644 --- a/lib/gpu/lal_cg_cmm.cpp +++ b/lib/gpu/lal_cg_cmm.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,23 +33,23 @@ CGCMMT::CGCMM() : BaseAtomic(), _allocated(false) { } template -CGCMMT::~CGCMM() { +CGCMMT::~CGCMM() { clear(); } - + template int CGCMMT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CGCMMT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int CGCMMT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -75,12 +75,12 @@ int CGCMMT::init(const int ntypes, double **host_cutsq, host_write[i]=0.0; lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); - this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, + this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, host_cg_type,host_lj1,host_lj2); lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -126,7 +126,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,7 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_cg_cmm.cu index 8f89f74d22..70d2ab6092 100644 --- a/lib/gpu/lal_cg_cmm.cu +++ b/lib/gpu/lal_cg_cmm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, 
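One detail of the CGCMMT::init packing below: host_cg_type is an integer table, stored in the .y slot of the same numtyp4 as the floating-point coefficients. That is safe because the small form indices are exactly representable in single precision, so device code can recover them with a cast. Illustrative sketch:

    struct numtyp4f { float x, y, z, w; };

    // Recover the integer form flag packed alongside float coefficients.
    int packed_form(const numtyp4f &lj1) {
      return static_cast<int>(lj1.y);   // exact for small integer values
    }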
- __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii class CGCMM : public BaseAtomic { public: CGCMM(); - ~CGCMM(); + ~CGCMM(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class CGCMM : public BaseAtomic { int init(const int ntypes, double **host_cutsq, int **host_cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -66,7 +66,7 @@ class CGCMM : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _cmm_types; private: diff --git a/lib/gpu/lal_cg_cmm_ext.cpp b/lib/gpu/lal_cg_cmm_ext.cpp index 0d2c3d8fbf..b6fc110b15 100644 --- a/lib/gpu/lal_cg_cmm_ext.cpp +++ b/lib/gpu/lal_cg_cmm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static CGCMM CMMMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { CMMMF.clear(); @@ -55,7 +55,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, int init_ok=0; if (world_me==0) - init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,7 +78,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, maxspecial, cell_size, gpu_split, screen); CMMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -103,8 +103,8 @@ int** cmm_gpu_compute_n(const int ago, const int inum_full, return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void cmm_gpu_compute(const int ago, const int inum_full, const int nall, 
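The CG-CMM forms differ only in their exponent pair (9-6, 12-4, or 12-6 in the SDK coarse-grain model), so the kernels build the needed inverse powers from r2inv products rather than calling pow(). A plausible host-side arrangement, with illustrative names (the actual kernels fold the numeric prefactors into the packed lj1/lj3 tables):

    #include <cmath>

    enum CGForm { CG_LJ9_6, CG_LJ12_4, CG_LJ12_6 };   // illustrative names

    // Build r^-m (inv1) and r^-n (inv2) for the form's exponent pair n-m.
    void cg_inv_powers(CGForm form, double r2inv,
                       double &inv1, double &inv2) {
      if (form == CG_LJ9_6) {
        inv1 = r2inv * r2inv * r2inv;      // r^-6
        inv2 = inv1 * std::sqrt(inv1);     // r^-9
      } else if (form == CG_LJ12_4) {
        inv1 = r2inv * r2inv;              // r^-4
        inv2 = inv1 * inv1 * inv1;         // r^-12
      } else {                             // 12-6
        inv1 = r2inv * r2inv * r2inv;      // r^-6
        inv2 = inv1 * inv1;                // r^-12
      }
    }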
double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_cg_cmm_long.cpp b/lib/gpu/lal_cg_cmm_long.cpp index 92e6bd04b5..14b5b7622c 100644 --- a/lib/gpu/lal_cg_cmm_long.cpp +++ b/lib/gpu/lal_cg_cmm_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,22 +37,22 @@ template CGCMMLongT::~CGCMMLong() { clear(); } - + template int CGCMMLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CGCMMLongT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int CGCMMLongT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, - double **host_cut_ljsq, + double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { @@ -137,7 +137,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,13 +149,13 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, &this->atom->q, - &_cut_coulsq, &_qqrd2e, &_g_ewald, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_cg_cmm_long.cu index ae8b6cda47..f6942d1809 100644 --- a/lib/gpu/lal_cg_cmm_long.cu +++ b/lib/gpu/lal_cg_cmm_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -29,12 +29,12 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -70,7 +70,7 @@ 
__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; @@ -136,7 +136,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].y) { energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- lj3[mtype].w; - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -154,17 +154,17 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, + const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -179,7 +179,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, lj1[tid]=lj1_in[tid]; lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -187,16 +187,16 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_cg_cmm_long.h b/lib/gpu/lal_cg_cmm_long.h index bde5c79c74..aa0cbfbaf0 100644 --- a/lib/gpu/lal_cg_cmm_long.h +++ b/lib/gpu/lal_cg_cmm_long.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class CGCMMLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class CGCMMLong : public BaseCharge { int init(const int ntypes, double **host_cutsq, int ** cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -58,7 +58,7 @@ class CGCMMLong : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, + /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, UCL_D_Vec lj1; /// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, 
lj3.w = offset UCL_D_Vec lj3; @@ -68,7 +68,7 @@ class CGCMMLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_cg_cmm_long_ext.cpp b/lib/gpu/lal_cg_cmm_long_ext.cpp index 966588bf9b..ee0a0269e5 100644 --- a/lib/gpu/lal_cg_cmm_long_ext.cpp +++ b/lib/gpu/lal_cg_cmm_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static CGCMMLong CMMLMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, @@ -58,7 +58,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, int init_ok=0; if (world_me==0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); @@ -82,7 +82,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CMMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -99,7 +99,7 @@ void cmml_gpu_clear() { int** cmml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -109,8 +109,8 @@ int** cmml_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q,boxlo,prd); -} - +} + void cmml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index 157072dc22..9cd032b3c6 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template CHARMMLongT::~CHARMMLong() { clear(); } - + template int CHARMMLongT::bytes_per_atom(const int max_nbors) const { return 
this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const { template int CHARMMLongT::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -144,7 +144,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -153,17 +153,17 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, - &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index dde50da300..244131f833 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -31,14 +31,14 @@ texture q_tex; __kernel void k_charmm_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_lj, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, @@ -61,7 +61,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; @@ -93,7 +93,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); if (rsq > cut_lj_innersq) { switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ denom_lj; switch1 *= switch1; switch1 *= 
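The switching math that recurs in both k_charmm_long kernels deserves a gloss: between cut_lj_inner and cut_lj the LJ energy is scaled by a smooth factor switch1 and the force picks up the derivative term switch2, with denom_lj = (cut_ljsq - cut_lj_innersq)^3 precomputed on the host. A host-side sketch of those lines:

    // CHARMM switching region, all in squared distances as in the kernels.
    // Applied as: force_lj = force_lj*switch1 + e_lj*switch2; e_lj *= switch1.
    void charmm_switch(double rsq, double cut_ljsq, double cut_lj_innersq,
                       double denom_lj, double &switch1, double &switch2) {
      double d = cut_ljsq - rsq;
      switch2 = 12.0 * rsq * d * (rsq - cut_lj_innersq) / denom_lj;
      switch1 = d * d * (cut_ljsq + 2.0 * rsq - 3.0 * cut_lj_innersq)
                / denom_lj;
    }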
(cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ @@ -130,7 +130,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, if (rsq > cut_lj_innersq) e *= switch1; energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -148,19 +148,19 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, const __global numtyp2 *restrict ljd_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, - const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_bothsq, const numtyp cut_ljsq, const numtyp cut_lj_innersq, const int t_per_atom) { int tid, ii, offset; @@ -174,7 +174,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, ljd[tid]=ljd_in[tid]; if (tid+BLOCK_BIO_PAIR cut_lj_innersq) { switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ denom_lj; switch1 *= switch1; switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ diff --git a/lib/gpu/lal_charmm_long.h b/lib/gpu/lal_charmm_long.h index 201a5c3694..011083db13 100644 --- a/lib/gpu/lal_charmm_long.h +++ b/lib/gpu/lal_charmm_long.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class CHARMMLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,12 +40,12 @@ class CHARMMLong : public BaseCharge { int init(const int ntypes, double host_cut_bothsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald, - const double cut_lj_innersq, const double denom_lj, + const double cut_lj_innersq, const double denom_lj, double **epsilon, double **sigma, const bool mix_arithmetic); /// Clear all host and device data @@ -70,7 +70,7 @@ class CHARMMLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e, _g_ewald, _denom_lj; diff --git a/lib/gpu/lal_charmm_long_ext.cpp 
b/lib/gpu/lal_charmm_long_ext.cpp index 807988a3e8..e24c650be4 100644 --- a/lib/gpu/lal_charmm_long_ext.cpp +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -87,7 +87,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, sigma, mix_arithmetic); CRMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -104,7 +104,7 @@ void crml_gpu_clear() { int** crml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -114,14 +114,14 @@ int** crml_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void crml_gpu_compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, + bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, diff --git a/lib/gpu/lal_colloid.cpp b/lib/gpu/lal_colloid.cpp index 28045217d3..fb2b643e5e 100644 --- a/lib/gpu/lal_colloid.cpp +++ b/lib/gpu/lal_colloid.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,23 +33,23 @@ ColloidT::Colloid() : BaseAtomic(), _allocated(false) { } template -ColloidT::~Colloid() { +ColloidT::~Colloid() { clear(); } - + template int ColloidT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int ColloidT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, double **host_a12, - double **host_a1, double **host_a2, - double **host_d1, double **host_d2, +int ColloidT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, double **host_a12, + double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int nlocal, const int nall, const int max_nbors, @@ -97,7 +97,7 @@ int ColloidT::init(const int ntypes, UCL_H_Vec dview_form(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); for (int i=0; iucl_device),UCL_READ_ONLY); for (int i=0; i(ceil(static_cast(this->ans->inum())/ 
(BX/this->_threads_per_atom))); @@ -170,9 +170,9 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &colloid1, &colloid2, &form, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, + &colloid1, &colloid2, &form, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index a4d6c8bf33..28a9809b19 100644 --- a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,18 +24,18 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_colloid(const __global numtyp4 *restrict x_, +__kernel void k_colloid(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global numtyp4 *restrict colloid1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global numtyp4 *restrict colloid1, const __global numtyp4 *restrict colloid2, - const __global int *form, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *form, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -53,20 +53,20 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -176,22 +176,22 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, +__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global numtyp4 *restrict colloid1_in, + const __global numtyp4 *restrict colloid1_in, const __global numtyp4 *restrict colloid2_in, - const __global int *form_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global int *form_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; @@ -208,7 +208,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, if (eflag>0) 
lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -217,7 +217,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_colloid.h b/lib/gpu/lal_colloid.h index 416beabcdf..dfbd4dbadd 100644 --- a/lib/gpu/lal_colloid.h +++ b/lib/gpu/lal_colloid.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Colloid : public BaseAtomic { public: Colloid(); - ~Colloid(); + ~Colloid(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class Colloid : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - double **host_a12, double **host_a1, double **host_a2, - double **host_d1, double **host_d2, double **host_sigma3, - double **host_sigma6, int **host_form, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -65,7 +65,7 @@ class Colloid : public BaseAtomic { UCL_D_Vec lj3; /// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2 UCL_D_Vec colloid1; - /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, + /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, /// colloid2.w = sigma6 UCL_D_Vec colloid2; /// form @@ -76,7 +76,7 @@ class Colloid : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp index ea83cb6417..8e1b18e72f 100644 --- a/lib/gpu/lal_colloid_ext.cpp +++ b/lib/gpu/lal_colloid_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -29,9 +29,9 @@ static Colloid COLLMF; // --------------------------------------------------------------------------- int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, - double **host_a12, double **host_a1, double **host_a2, - double **host_d1, double **host_d2, double **host_sigma3, + double **offset, double *special_lj, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int inum, const int 
nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -57,9 +57,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) - init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, - host_a2, host_d1, host_d2, host_sigma3, + host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,13 +78,13 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, host_a12, host_a1, host_a2, - host_d1, host_d2, host_sigma3, host_sigma6, host_form, + offset, special_lj, host_a12, host_a1, host_a2, + host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); COLLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -109,8 +109,8 @@ int ** colloid_gpu_compute_n(const int ago, const int inum_full, return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void colloid_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_coul.cpp b/lib/gpu/lal_coul.cpp index 53fb3dae82..a06a29e610 100644 --- a/lib/gpu/lal_coul.cpp +++ b/lib/gpu/lal_coul.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -37,7 +37,7 @@ template CoulT::~Coul() { clear(); } - + template int CoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -75,7 +75,7 @@ int CoulT::init(const int ntypes, double **host_scale, double **host_cutsq, scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); - + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); @@ -97,10 +97,10 @@ void CoulT::reinit(const int ntypes, double **host_scale) { // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); } @@ -138,7 +138,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,14 +149,14 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { 
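A note on the launch geometry shared by the loop() methods in these hunks: every pair style sizes its grid as GX = ceil(inum/(BX/t_per_atom)), so a block of BX threads covers BX/t_per_atom atoms with t_per_atom cooperating lanes per atom, then dispatches either the shared-memory k_pair_fast kernel (when all type constants fit) or the general k_pair kernel. A minimal standalone sketch of that sizing arithmetic, with inum, BX and t_per_atom as illustrative values rather than the library's Answer/Device state:

    #include <cmath>
    #include <cstdio>

    int main() {
      const int inum = 1000;     // local atoms to process (illustrative)
      const int BX = 128;        // threads per block (illustrative)
      const int t_per_atom = 4;  // cooperating lanes per atom (illustrative)
      // Each block handles BX/t_per_atom atoms, so round the grid size up:
      const int GX = static_cast<int>(
          std::ceil(static_cast<double>(inum) / (BX / t_per_atom)));
      std::printf("launch %d blocks of %d threads for %d atoms\n", GX, BX, inum);
      return 0;
    }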
this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu index e955922a7c..503e674c81 100644 --- a/lib/gpu/lal_coul.cu +++ b/lib/gpu/lal_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndtrung@umich.edu // ***************************************************************************/ @@ -33,14 +33,14 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -50,7 +50,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -58,13 +58,13 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,13 +39,13 @@ class Coul : public BaseCharge { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_scale, double **host_cutsq, double *host_special_coul, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double qqrd2e); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Coul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_coul_debye.cpp b/lib/gpu/lal_coul_debye.cpp index 990dff6db9..9098aeacb1 100644 --- a/lib/gpu/lal_coul_debye.cpp +++ b/lib/gpu/lal_coul_debye.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -37,7 +37,7 @@ template 
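The coul/debye hunks that follow implement a screened Coulomb interaction, E = qqrd2e*qi*qj*exp(-kappa*r)/r, where kappa is the inverse Debye length passed into init(). A scalar sketch of that energy together with the force factor the kernels apply to the displacement components (all inputs here are illustrative assumptions):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double qqrd2e = 332.06371;  // illustrative conversion constant
      const double kappa = 0.5, qi = 1.0, qj = -1.0, r = 3.0;
      const double screening = std::exp(-kappa * r);
      const double e_coul = qqrd2e * qi * qj * screening / r;
      // -dE/dr = qqrd2e*qi*qj*exp(-kappa*r)*(kappa + 1/r)/r; dividing by r
      // once more gives the factor multiplied onto delx, dely, delz:
      const double force_over_r =
          qqrd2e * qi * qj * screening * (kappa + 1.0 / r) / (r * r);
      std::printf("E = %g  F/r = %g\n", e_coul, force_over_r);
      return 0;
    }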
CoulDebyeT::~CoulDebye() { clear(); } - + template int CoulDebyeT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -87,7 +87,7 @@ int CoulDebyeT::init(const int ntypes, double **host_scale, _qqrd2e=qqrd2e; _kappa=kappa; - + _allocated=true; this->_max_bytes=cutsq.row_bytes()+scale.row_bytes()+sp_cl.row_bytes(); return 0; @@ -98,10 +98,10 @@ void CoulDebyeT::reinit(const int ntypes, double **host_scale) { // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); } @@ -139,7 +139,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -156,9 +156,9 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu index 0e4c0ea2d0..464a1b18de 100644 --- a/lib/gpu/lal_coul_debye.cu +++ b/lib/gpu/lal_coul_debye.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndtrung@umich.edu // ***************************************************************************/ @@ -31,16 +31,16 @@ texture q_tex; __kernel void k_coul_debye(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const numtyp kappa, const int t_per_atom) { int tid, ii, offset; @@ -59,27 +59,27 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,14 +39,14 @@ class CoulDebye : public BaseCharge { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_scale, double **host_cutsq, double *host_special_coul, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double qqrd2e, const double kappa); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double 
**host_scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -69,7 +69,7 @@ class CoulDebye : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e,_kappa; diff --git a/lib/gpu/lal_coul_debye_ext.cpp b/lib/gpu/lal_coul_debye_ext.cpp index ced08b63e4..af9156c24c 100644 --- a/lib/gpu/lal_coul_debye_ext.cpp +++ b/lib/gpu/lal_coul_debye_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -75,7 +75,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -93,16 +93,16 @@ void cdebye_gpu_reinit(const int ntypes, double **host_scale) { int world_me=CDEMF.device->world_me(); int gpu_rank=CDEMF.device->gpu_rank(); int procs_per_gpu=CDEMF.device->procs_per_gpu(); - + if (world_me==0) CDEMF.reinit(ntypes, host_scale); - + CDEMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -123,8 +123,8 @@ int** cdebye_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void cdebye_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp index ca81d32b2d..32c4342fbe 100644 --- a/lib/gpu/lal_coul_dsf.cpp +++ b/lib/gpu/lal_coul_dsf.cpp @@ -37,18 +37,18 @@ template CoulDSFT::~CoulDSF() { clear(); } - + template int CoulDSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, - const int max_nbors, const int maxspecial, +int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -123,7 +123,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,15 +134,15 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, 
&this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu index fc5bf5f138..82c44cd382 100644 --- a/lib/gpu/lal_coul_dsf.cu +++ b/lib/gpu/lal_coul_dsf.cu @@ -31,18 +31,18 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, +__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -60,19 +60,19 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -102,9 +102,9 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, numtyp erfcd = ucl_exp(-alpha*alpha*rsq); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r); erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd; - forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + + forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + rsq*f_shift-factor_coul); - + force = forcecoul * r2inv; f.x+=delx*force; @@ -131,17 +131,17 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, +__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -149,7 +149,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; if (tid<4) sp_lj[tid]=sp_lj_in[tid]; - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -157,25 +157,25 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + 
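The DSF kernels above add a per-atom self energy, e_self = -(e_shift/2 + alpha/sqrt(pi))*q*q*qqrd2e, and evaluate erfc(alpha*r) with the rational polynomial seen in the erfcc/erfcd lines. EWALD_P and A1..A5 are defined elsewhere in the library; the constants below are assumptions (the standard Abramowitz–Stegun 7.1.26 values) so the sketch is self-contained and can be checked against std::erfc:

    #include <cmath>
    #include <cstdio>

    // Rational approximation of erfc(x); constants assumed to match the
    // usual Abramowitz & Stegun 7.1.26 table.
    double erfc_approx(double x) {
      const double P = 0.3275911;
      const double A1 = 0.254829592, A2 = -0.284496736, A3 = 1.421413741,
                   A4 = -1.453152027, A5 = 1.061405429;
      const double t = 1.0 / (1.0 + P * x);
      return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * std::exp(-x * x);
    }

    int main() {
      for (double x : {0.1, 0.5, 1.0, 2.0})
        std::printf("erfc(%.1f): approx %.7f  exact %.7f\n",
                    x, erfc_approx(x), std::erfc(x));
      return 0;
    }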
__syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } - + for ( ; nbor { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, - const int max_nbors, const int maxspecial, + int init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha); /// Clear all host and device data @@ -62,7 +62,7 @@ class CoulDSF : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_coul_dsf_ext.cpp b/lib/gpu/lal_coul_dsf_ext.cpp index e65a090a16..026dd924c9 100644 --- a/lib/gpu/lal_coul_dsf_ext.cpp +++ b/lib/gpu/lal_coul_dsf_ext.cpp @@ -27,11 +27,11 @@ static CoulDSF CDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int cdsf_gpu_init(const int ntypes, const int inum, const int nall, +int cdsf_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { CDMF.clear(); gpu_mode=CDMF.device->gpu_mode(); @@ -55,8 +55,8 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_coulsq, host_special_coul, + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->world_barrier(); @@ -73,12 +73,12 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_coulsq, host_special_coul, + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -95,7 +95,7 @@ void cdsf_gpu_clear() { int** cdsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double 
*subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -105,8 +105,8 @@ int** cdsf_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void cdsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_coul_ext.cpp b/lib/gpu/lal_coul_ext.cpp index 291546d5b1..f03d8fcdfc 100644 --- a/lib/gpu/lal_coul_ext.cpp +++ b/lib/gpu/lal_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -75,7 +75,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -93,16 +93,16 @@ void coul_gpu_reinit(const int ntypes, double **host_scale) { int world_me=COULMF.device->world_me(); int gpu_rank=COULMF.device->gpu_rank(); int procs_per_gpu=COULMF.device->procs_per_gpu(); - + if (world_me==0) COULMF.reinit(ntypes, host_scale); - + COULMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -113,7 +113,7 @@ void coul_gpu_clear() { int** coul_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -123,8 +123,8 @@ int** coul_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void coul_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp index d6e16a9668..b4c6a44d2f 100644 --- a/lib/gpu/lal_coul_long.cpp +++ b/lib/gpu/lal_coul_long.cpp @@ -36,7 +36,7 @@ template CoulLongT::~CoulLong() { clear(); } - + template int CoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -51,7 +51,7 @@ int CoulLongT::init(const int ntypes, double **host_scale, const double qqrd2e, const double g_ewald) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, - gpu_split,_screen,coul_long,"k_coul_long"); + gpu_split,_screen,coul_long,"k_coul_long"); if (success!=0) return success; @@ -67,13 +67,13 @@ int CoulLongT::init(const int ntypes, double **host_scale, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; iucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); - + sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { 
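The loop here stages the four special-bond Coulomb scale factors into sp_cl; the pair kernels later recover which slot applies to a neighbor from the top two bits of its packed index. A sketch of that tag/untag convention, assuming the bit layout LAMMPS itself uses (SBBITS = 30, a 30-bit neighbor mask) and illustrative scale factors:

    #include <cstdio>

    const unsigned SBBITS = 30;              // assumed position of the 2-bit tag
    const unsigned NEIGHMASK = 0x3FFFFFFFu;  // low 30 bits hold the neighbor index

    inline unsigned sbmask(unsigned j) { return (j >> SBBITS) & 3u; }

    int main() {
      // slot 0 = plain neighbor; slots 1..3 = 1-2, 1-3, 1-4 special bonds
      const double sp_cl[4] = {1.0, 0.0, 0.0, 0.5};  // illustrative factors
      const unsigned j = (3u << SBBITS) | 12345u;    // neighbor 12345 tagged as 1-4
      std::printf("neighbor %u, coulomb scale %g\n",
                  j & NEIGHMASK, sp_cl[sbmask(j)]);
      return 0;
    }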
host_write[i]=host_special_coul[i]; @@ -129,7 +129,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -141,13 +141,13 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 12bbbee7d2..365195e00c 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -123,16 +123,16 @@ texture q_tex; #endif -__kernel void k_coul_long(const __global numtyp4 *restrict x_, +__kernel void k_coul_long(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, - const __global numtyp *restrict sp_cl_in, + const __global numtyp *restrict sp_cl_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { @@ -216,15 +216,15 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale_in, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h index 52ed60111b..d12198fccc 100644 --- a/lib/gpu/lal_coul_long.h +++ b/lib/gpu/lal_coul_long.h @@ -30,7 +30,7 @@ class CoulLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,13 +40,13 @@ class CoulLong : public BaseCharge { int init(const int ntypes, double **scale, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); - + const double gpu_split, FILE *screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + 
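The g_ewald argument of init() above is the Ewald splitting parameter: these kernels compute only the short-range piece qqrd2e*qi*qj*erfc(g_ewald*r)/r, leaving the smooth complement to the k-space solver. A scalar sketch of that energy and its force factor, with illustrative inputs (EWALD_F is 2/sqrt(pi)):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double qqrd2e = 332.06371;  // illustrative conversion constant
      const double g_ewald = 0.3, qi = 1.0, qj = -1.0, r = 3.0;
      const double EWALD_F = 1.12837916709551;  // 2/sqrt(pi)
      const double grij = g_ewald * r;
      const double expm2 = std::exp(-grij * grij);
      const double prefactor = qqrd2e * qi * qj / r;
      const double e_coul = prefactor * std::erfc(grij);
      // -dE/dr divided by r, i.e. the factor applied to delx, dely, delz:
      const double force_over_r =
          prefactor * (std::erfc(grij) + EWALD_F * grij * expm2) / (r * r);
      std::printf("E = %g  F/r = %g\n", e_coul, force_over_r);
      return 0;
    }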
/// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index 5552dc2437..06c102b2d1 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -95,16 +95,16 @@ void cl_gpu_reinit(const int ntypes, double **host_scale) { int world_me=CLMF.device->world_me(); int gpu_rank=CLMF.device->gpu_rank(); int procs_per_gpu=CLMF.device->procs_per_gpu(); - + if (world_me==0) CLMF.reinit(ntypes, host_scale); - + CLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -114,28 +114,28 @@ void cl_gpu_clear() { } int** cl_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, - subhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, - host_q, boxlo, prd); + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } void cl_gpu_compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd) { + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q,nlocal,boxlo,prd); + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); } double cl_gpu_bytes() { diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index f326657e31..e95f2b30ef 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -45,8 +45,8 @@ DeviceT::~Device() { template int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double p_split, const int nthreads, + const int last_gpu, const int gpu_mode, + const double p_split, const int nthreads, const int t_per_atom, const double cell_size, char 
*ocl_vendor, const int block_pair) { _nthreads=nthreads; @@ -83,8 +83,8 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); std::string node_string=std::string(node_name); - - // Get the number of procs per node + + // Get the number of procs per node std::map name_map; std::map::iterator np; for (int i=0; i<_world_size; i++) { @@ -104,12 +104,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, split_id=split_num; split_num++; } - + // Set up a per node communicator and find rank within MPI_Comm node_comm; - MPI_Comm_split(_comm_world, split_id, 0, &node_comm); + MPI_Comm_split(_comm_world, split_id, 0, &node_comm); int node_rank; - MPI_Comm_rank(node_comm,&node_rank); + MPI_Comm_rank(node_comm,&node_rank); // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ @@ -120,7 +120,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, _time_device=true; if (_procs_per_gpu>1) _time_device=false; - + // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); @@ -128,12 +128,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) return -2; - + #ifndef CUDA_PROXY if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; #endif - + if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; @@ -144,7 +144,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, if (set_ocl_params(ocl_vendor)!=0) return -11; - + int flag=0; for (int i=0; i<_procs_per_gpu; i++) { if (_gpu_rank==i) @@ -162,7 +162,7 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { s_vendor=ocl_vendor; if (s_vendor=="none") s_vendor="generic"; - + if (s_vendor=="kepler") { _ocl_vendor_name="NVIDIA Kepler"; #if defined (__APPLE__) || defined(MACOSX) @@ -170,19 +170,19 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { #else _ocl_vendor_string="-DKEPLER_OCL"; #endif - } else if (s_vendor=="fermi") { + } else if (s_vendor=="fermi") { _ocl_vendor_name="NVIDIA Fermi"; _ocl_vendor_string="-DFERMI_OCL"; - } else if (s_vendor=="cypress") { + } else if (s_vendor=="cypress") { _ocl_vendor_name="AMD Cypress"; _ocl_vendor_string="-DCYPRESS_OCL"; - } else if (s_vendor=="phi") { + } else if (s_vendor=="phi") { _ocl_vendor_name="Intel Phi"; _ocl_vendor_string="-DPHI_OCL"; - } else if (s_vendor=="intel") { + } else if (s_vendor=="intel") { _ocl_vendor_name="Intel CPU"; _ocl_vendor_string="-DINTEL_OCL"; - } else if (s_vendor=="generic") { + } else if (s_vendor=="generic") { _ocl_vendor_name="GENERIC"; _ocl_vendor_string="-DGENERIC_OCL"; } else { @@ -220,10 +220,10 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { template int DeviceT::init(Answer &ans, const bool charge, - const bool rot, const int nlocal, + const bool rot, const int nlocal, const int host_nlocal, const int nall, Neighbor *nbor, const int maxspecial, - const int gpu_host, const int max_nbors, + const int gpu_host, const int max_nbors, const double cell_size, const bool pre_cut, const int threads_per_atom, const bool vel) { if (!_device_init) @@ -254,7 +254,7 @@ int DeviceT::init(Answer &ans, const bool charge, // Initialize atom and nbor data if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) return -3; - + _data_in_estimate++; if (charge) 
_data_in_estimate++; @@ -272,12 +272,12 @@ int DeviceT::init(Answer &ans, const bool charge, if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) return -3; } - + if (!ans.init(ef_nlocal,charge,rot,*gpu)) return -3; if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, - *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, + *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, _warp_size, _time_device, compile_string())) return -3; @@ -294,7 +294,7 @@ template int DeviceT::init(Answer &ans, const int nlocal, const int nall) { if (!_device_init) - return -1; + return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) return -5; @@ -361,7 +361,7 @@ void DeviceT::init_message(FILE *screen, const char *name, if (i==first_gpu) sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ ("; - else + else sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+ toa(gpu->clock_rate(i))+" GHZ ("; if (sizeof(PRECISION)==4) { @@ -381,7 +381,7 @@ void DeviceT::init_message(FILE *screen, const char *name, } template -void DeviceT::estimate_gpu_overhead(const int kernel_calls, +void DeviceT::estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, double &gpu_driver_overhead) { UCL_H_Vec *host_data_in=NULL, *host_data_out=NULL; @@ -394,38 +394,38 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, dev_data_in=new UCL_D_Vec[_data_in_estimate]; timers_in=new UCL_Timer[_data_in_estimate]; } - + if (_data_out_estimate>0) { host_data_out=new UCL_H_Vec[_data_out_estimate]; dev_data_out=new UCL_D_Vec[_data_out_estimate]; timers_out=new UCL_Timer[_data_out_estimate]; } - + if (kernel_calls>0) { kernel_data=new UCL_D_Vec[kernel_calls]; timers_kernel=new UCL_Timer[kernel_calls]; } - + for (int i=0; i<_data_in_estimate; i++) { host_data_in[i].alloc(1,*gpu); dev_data_in[i].alloc(1,*gpu); timers_in[i].init(*gpu); - } - + } + for (int i=0; i<_data_out_estimate; i++) { host_data_out[i].alloc(1,*gpu); dev_data_out[i].alloc(1,*gpu); timers_out[i].init(*gpu); - } - + } + for (int i=0; isync(); gpu_barrier(); @@ -439,7 +439,7 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, ucl_copy(dev_data_in[i],host_data_in[i],true); timers_in[i].stop(); } - + for (int i=0; i0) { delete [] host_data_out; delete [] dev_data_out; delete [] timers_out; } - + if (kernel_calls>0) { delete [] kernel_data; delete [] timers_kernel; } -} +} template -void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, - Neighbor &nbor, const double avg_split, +void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, + Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, + const double driver_overhead, const int threads_per_atom, FILE *screen) { double single[9], times[9]; int post_final=0; @@ -557,14 +557,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, } template -void DeviceT::output_kspace_times(UCL_Timer &time_in, +void DeviceT::output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer &time_map, UCL_Timer &time_rho, UCL_Timer &time_interp, - Answer &ans, - const double max_bytes, - const double cpu_time, + Answer &ans, + const double max_bytes, + const double cpu_time, const double idle_time, FILE *screen) { double single[8], times[8]; @@ -650,8 +650,8 @@ int DeviceT::compile_kernels() { int flag=0; if (_compiled) - return flag; - + return flag; + 
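The estimate_gpu_overhead() hunks above time one-element transfers and trivial kernel launches over repeated iterations and then average, which is how the library separates fixed per-step launch and driver costs from real work. A stripped-down sketch of that measure-a-tiny-op-and-average pattern, using std::chrono in place of UCL_Timer (names and the trial count are illustrative, not the library API):

    #include <chrono>
    #include <cstdio>

    // Stand-in for a one-element copy or an empty kernel launch.
    void tiny_operation() {
      volatile int sink = 0;
      (void)sink;
    }

    int main() {
      const int trials = 10;  // assumed iteration count
      double total = 0.0;
      for (int i = 0; i < trials; ++i) {
        const auto t0 = std::chrono::steady_clock::now();
        tiny_operation();
        const auto t1 = std::chrono::steady_clock::now();
        total += std::chrono::duration<double>(t1 - t0).count();
      }
      std::printf("average overhead: %.3g s per call\n", total / trials);
      return 0;
    }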
dev_program=new UCL_Program(*gpu); int success=dev_program->load_string(device,compile_string().c_str()); if (success!=UCL_SUCCESS) @@ -664,7 +664,7 @@ int DeviceT::compile_kernels() { k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); - + _ptx_arch=static_cast(gpu_lib_data[0])/100.0; #ifndef USE_OPENCL if (_ptx_arch>gpu->arch() || floor(_ptx_arch)arch())) @@ -705,7 +705,7 @@ int DeviceT::compile_kernels() { if (_threads_per_charge & (_threads_per_charge - 1)) _threads_per_charge=1; - return flag; + return flag; } template @@ -718,12 +718,12 @@ template class Device; Device global_device; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, + const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, + const int t_per_atom, const double cell_size, char *opencl_vendor, const int block_pair) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, - particle_split,nthreads,t_per_atom, + particle_split,nthreads,t_per_atom, cell_size,opencl_vendor,block_pair); } diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu index 28b58f7760..6761b23fbb 100644 --- a/lib/gpu/lal_device.cu +++ b/lib/gpu/lal_device.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,10 +17,10 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_zero(__global int *restrict mem, +__kernel void kernel_zero(__global int *restrict mem, int numel) { int ii=GLOBAL_ID_X; - + if (ii class PPPM; template class Device { public: Device(); - ~Device(); - + ~Device(); + /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes - * the device (>=first_gpu and <=last_gpu) that this proc will be using + * the device (>=first_gpu and <=last_gpu) that this proc will be using * Returns: * - 0 if successfull * - -2 if GPU not found * - -4 if GPU library not compiled for GPU * - -6 if GPU could not be initialized for use - * - -7 if accelerator sharing is not currently allowed on system + * - -7 if accelerator sharing is not currently allowed on system * - -11 if vendor_string has the wrong number of parameters **/ - int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, + int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, + const int t_per_atom, const double cell_size, char *vendor_string, const int block_pair); /// Initialize the device for Atom and Neighbor storage @@ -62,9 +62,9 @@ class Device { * 1 if gpu_nbor is true, and host needs a half nbor list, * 2 if gpu_nbor is true, and host needs a full nbor list * \param max_nbors Initial number of rows in the neighbor matrix - * \param cell_size cutoff+skin + * \param cell_size cutoff+skin * \param pre_cut True if cutoff test will be performed in separate kernel - * than the force kernel + * than the force kernel * \param threads_per_atom value to be used by the neighbor list only * * Returns: @@ -113,25 +113,25 @@ class Device { /// Returns true if double 
precision is supported on card inline bool double_precision() { return gpu->double_precision(); } - + /// Output a message with timing information - void output_times(UCL_Timer &time_pair, Answer &ans, - Neighbor &nbor, const double avg_split, + void output_times(UCL_Timer &time_pair, Answer &ans, + Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, + const double driver_overhead, const int threads_per_atom, FILE *screen); /// Output a message with timing information void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer & time_map, UCL_Timer & time_rho, - UCL_Timer &time_interp, - Answer &ans, + UCL_Timer &time_interp, + Answer &ans, const double max_bytes, const double cpu_time, const double cpu_idle_time, FILE *screen); /// Clear all memory on host and device associated with atom and nbor data void clear(); - + /// Clear all memory on host and device void clear_device(); @@ -149,24 +149,24 @@ class Device { while (ans_queue.empty()==false) { evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul); ans_queue.pop(); - } + } return evdw; } return 0.0; } /// Start timer on host - inline void start_host_timer() + inline void start_host_timer() { _cpu_full=MPI_Wtime(); _host_timer_started=true; } - + /// Stop timer on host - inline void stop_host_timer() { + inline void stop_host_timer() { if (_host_timer_started) { - _cpu_full=MPI_Wtime()-_cpu_full; + _cpu_full=MPI_Wtime()-_cpu_full; _host_timer_started=false; } } - + /// Return host time inline double host_time() { return _cpu_full; } @@ -239,8 +239,8 @@ class Device { /// Number of threads executing concurrently on same multiproc inline int warp_size() const { return _warp_size; } - // -------------------- SHARED DEVICE ROUTINES -------------------- - // Perform asynchronous zero of integer array + // -------------------- SHARED DEVICE ROUTINES -------------------- + // Perform asynchronous zero of integer array void zero(UCL_D_Vec &mem, const int numel) { int num_blocks=static_cast(ceil(static_cast(numel)/ _block_pair)); @@ -248,25 +248,25 @@ class Device { k_zero.run(&mem,&numel); } - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Geryon Device UCL_Device *gpu; enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH}; - // --------------------------- ATOM DATA -------------------------- + // --------------------------- ATOM DATA -------------------------- /// Atom Data Atom atom; // --------------------------- NBOR DATA ---------------------------- - + /// Neighbor Data NeighborShared _neighbor_shared; // ------------------------ LONG RANGE DATA ------------------------- - + // Long Range Data int _long_range_precompute; PPPM *pppm_single; @@ -282,7 +282,7 @@ class Device { pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, boxlo,prd); } - + inline std::string compile_string() { return _ocl_compile_string; } private: @@ -290,7 +290,7 @@ class Device { int _init_count; bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; - int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, + int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, _replica_size; int _gpu_mode, _first_device, _last_device, _nthreads; double _particle_split; @@ -310,10 +310,10 @@ class Device { int compile_kernels(); int _data_in_estimate, _data_out_estimate; - + std::string _ocl_vendor_name, 
_ocl_vendor_string, _ocl_compile_string; int set_ocl_params(char *); - + template inline std::string toa(const t& in) { std::ostringstream o; diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp index e96e15eaf9..c97b76c820 100644 --- a/lib/gpu/lal_dipole_lj.cpp +++ b/lib/gpu/lal_dipole_lj.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template DipoleLJT::~DipoleLJ() { clear(); } - + template int DipoleLJT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int DipoleLJT::bytes_per_atom(const int max_nbors) const { template int DipoleLJT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, @@ -138,7 +138,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,7 +151,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, + &ainum, &nbor_pitch, &this->atom->q, &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { @@ -160,8 +160,8 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->q, - &this->atom->quat, &cutsq, + &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu index b6483d1ef8..745bdb7f27 100644 --- a/lib/gpu/lal_dipole_lj.cu +++ b/lib/gpu/lal_dipole_lj.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -132,17 +132,17 @@ texture mu_tex; #endif -__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, const __global numtyp *restrict cutsq, @@ -171,14 +171,14 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; 
i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - r7inv = r5inv*r2inv; + r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; @@ -251,7 +251,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; - + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); @@ -263,12 +263,12 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; - + forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; @@ -276,7 +276,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -284,7 +284,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -306,12 +306,12 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; @@ -322,7 +322,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); - } + } } if (vflag>0) { virial[0] += delx*force.x; @@ -340,19 +340,19 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -369,7 
+369,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -381,16 +381,16 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; @@ -463,7 +463,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; - + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); @@ -474,13 +474,13 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, } // dipole-charge - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; - + forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; @@ -488,7 +488,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -496,7 +496,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -519,12 +519,12 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0; + acctyp e = (acctyp)0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; diff --git a/lib/gpu/lal_dipole_lj.h b/lib/gpu/lal_dipole_lj.h index b08b7a8669..615784ee8b 100644 --- a/lib/gpu/lal_dipole_lj.h +++ b/lib/gpu/lal_dipole_lj.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class DipoleLJ : public BaseDipole { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class DipoleLJ : public BaseDipole { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const 
int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class DipoleLJ : public BaseDipole { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp index 55bbe0b804..76722a20b4 100644 --- a/lib/gpu/lal_dipole_lj_ext.cpp +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); DPLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,18 +98,18 @@ void dpl_gpu_clear() { int** dpl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double **host_mu, + bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); -} - +} + void dpl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_dipole_lj_sf.cpp b/lib/gpu/lal_dipole_lj_sf.cpp index 5a145dc762..a33f38084f 100644 --- a/lib/gpu/lal_dipole_lj_sf.cpp +++ b/lib/gpu/lal_dipole_lj_sf.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template DipoleLJSFT::~DipoleLJSF() { clear(); } - + template int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { template int DipoleLJSFT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, @@ -138,7 +138,7 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,17 +151,17 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, + &ainum, &nbor_pitch, &this->atom->q, 
&this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, - &this->atom->quat, &cutsq, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu index 8469ed9ac9..9847e84823 100644 --- a/lib/gpu/lal_dipole_lj_sf.cu +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -133,20 +133,20 @@ texture mu_tex; #endif -__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const __global numtyp4 *restrict mu_, - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -172,14 +172,14 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; - + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); aforcecoul.x = pre1*delx; aforcecoul.y = pre1*dely; aforcecoul.z = pre1*delz; - + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; presf = (numtyp)2.0*r2inv*pidotr*pjdotr; bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); - + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); - + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; pre4 = -bfac*r3inv; numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); - + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); 
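// (editor's note, not in the original patch: gathering the pre2/pre4 terms
// above and below, each accumulated Coulombic torque component on dipole i is
//   tau_i = -bfac*r3inv*(mu_i x mu_j) + 3*bfac*r5inv*(mu_j . r)*(mu_i x r),
// the shifted-force analogue of the point-dipole torque.)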
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); @@ -285,12 +285,12 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; rcutcoul2inv=ucl_recip(lj1[mtype].w); pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); - pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qj*r3inv * pqfac; @@ -301,7 +301,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -309,10 +309,10 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; rcutcoul2inv=ucl_recip(lj1[mtype].w); pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); - qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qtmp*r3inv * qpfac; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -334,13 +334,13 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); e = qtmp*qj*rinv*fac*fac; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr * pqfac; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr * qpfac; @@ -350,12 +350,12 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + - rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + - rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + (numtyp)4.0*lj3[mtype].y); energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*force.x; @@ -372,19 +372,19 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp 
qqrd2e, const int t_per_atom) { int tid, ii, offset; @@ -402,7 +402,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -414,16 +414,16 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; - + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); aforcecoul.x = pre1*delx; aforcecoul.y = pre1*dely; aforcecoul.z = pre1*delz; - + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; presf = (numtyp)2.0*r2inv*pidotr*pjdotr; bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); - + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); - + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; pre4 = -bfac*r3inv; @@ -529,11 +529,11 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); - pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qj*r3inv * pqfac; @@ -544,7 +544,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -552,10 +552,10 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); - qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qtmp*r3inv * qpfac; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -577,13 +577,13 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); e = qtmp*qj*rinv*fac*fac; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr * pqfac; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr * qpfac; @@ -593,12 +593,12 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, if 
(rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + - rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + - rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + (numtyp)4.0*lj3[mtype].y); energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*force.x; diff --git a/lib/gpu/lal_dipole_lj_sf.h b/lib/gpu/lal_dipole_lj_sf.h index 83cea4c2a4..20357385a2 100644 --- a/lib/gpu/lal_dipole_lj_sf.h +++ b/lib/gpu/lal_dipole_lj_sf.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class DipoleLJSF : public BaseDipole { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class DipoleLJSF : public BaseDipole { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class DipoleLJSF : public BaseDipole { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp index 8abf78c903..68b935ff38 100644 --- a/lib/gpu/lal_dipole_lj_sf_ext.cpp +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); DPLSFMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,18 +98,18 @@ void dplsf_gpu_clear() { int** dplsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double **host_mu, + bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); -} - +} + void dplsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git 
a/lib/gpu/lal_dpd.cpp b/lib/gpu/lal_dpd.cpp index 3736f89323..4f6f2d641f 100644 --- a/lib/gpu/lal_dpd.cpp +++ b/lib/gpu/lal_dpd.cpp @@ -33,23 +33,23 @@ DPDT::DPD() : BaseDPD(), _allocated(false) { } template -DPDT::~DPD() { +DPDT::~DPD() { clear(); } - + template int DPDT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int DPDT::init(const int ntypes, - double **host_cutsq, double **host_a0, - double **host_gamma, double **host_sigma, +int DPDT::init(const int ntypes, + double **host_cutsq, double **host_a0, + double **host_gamma, double **host_sigma, double **host_cut, double *host_special_lj, - const bool tstat_only, - const int nlocal, const int nall, - const int max_nbors, const int maxspecial, + const bool tstat_only, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; @@ -76,7 +76,7 @@ int DPDT::init(const int ntypes, coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a0,host_gamma, - host_sigma,host_cut); + host_sigma,host_cut); UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -90,7 +90,7 @@ int DPDT::init(const int ntypes, _tstat_only = 0; if (tstat_only) _tstat_only=1; - + _allocated=true; this->_max_bytes=coeff.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes(); return 0; @@ -130,7 +130,7 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +147,8 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { &this->_tstat_only, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, &this->_timestep, &this->_tstat_only, @@ -164,7 +164,7 @@ void DPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma, UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma, - host_sigma,host_cut); + host_sigma,host_cut); } - + template class DPD; diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu index 209bc0233e..e32404ff5c 100644 --- a/lib/gpu/lal_dpd.cu +++ b/lib/gpu/lal_dpd.cu @@ -37,7 +37,7 @@ texture vel_tex; #define _USE_UNIFORM_SARU_LCG #endif -// References: +// References: // 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. // 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. // PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 @@ -49,9 +49,9 @@ texture vel_tex; #define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns uniformly distributed random numbers u in [-1.0;1.0] -// using the inherent LCG, then multiply u with sqrt(3) to "match" -// with a normal random distribution. +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. // Afshar et al. 
mutlplies u in [-0.5;0.5] with sqrt(12) // Curly brackets to make variables local to the scope. #ifdef _USE_UNIFORM_SARU_LCG @@ -80,8 +80,8 @@ texture vel_tex; #endif // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 -// then multiply u with sqrt(3) to "match" with a normal random distribution +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution // Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) #ifdef _USE_UNIFORM_SARU_TEA8 #define SQRT3 (numtyp)1.7320508075688772935274463 @@ -119,7 +119,7 @@ texture vel_tex; #endif // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], // and uses the polar method (Marsaglia's) to transform to a normal random value // This is used to compared with CPU DPD using RandMars::gaussian() #ifdef _USE_GAUSSIAN_SARU_LCG @@ -160,20 +160,20 @@ texture vel_tex; randnum = r2*fac; \ } #endif - -__kernel void k_dpd(const __global numtyp4 *restrict x_, + +__kernel void k_dpd(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp4 *restrict v_, const __global numtyp *restrict cutsq, - const numtyp dtinvsqrt, const int seed, + const numtyp dtinvsqrt, const int seed, const int timestep, const int tstat_only, const int t_per_atom) { int tid, ii, offset; @@ -185,13 +185,13 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii tag2) { tag1 = jtag; tag2 = itag; } - - numtyp randnum = (numtyp)0.0; + + numtyp randnum = (numtyp)0.0; saru(tag1, tag2, seed, timestep, randnum); // conservative force = a0 * wd, or 0 if tstat only @@ -244,7 +244,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, force -= coeff[mtype].y*wd*wd*dot*rinv; force += coeff[mtype].z*wd*randnum*dtinvsqrt; force*=factor_dpd*rinv; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; @@ -254,7 +254,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; - energy+=factor_dpd*e; + energy+=factor_dpd*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -272,23 +272,23 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dpd_fast(const __global numtyp4 *restrict x_, +__kernel void k_dpd_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp *restrict sp_lj_in, 
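// (editor's sketch, not in the original patch, illustrating the sqrt(3) scaling
// described in the saru comments above: a uniform u on [-1,1] has mean 0 and
// variance 1/3, so sqrt(3)*u matches the first two moments of a standard
// normal, which is all the DPD random force requires:
//   numtyp u  = (numtyp)2.0*u01 - (numtyp)1.0;  // u01: hypothetical uniform in [0,1)
//   numtyp xi = SQRT3*u;                        // E[xi] = 0, Var[xi] = 1
// equivalently sqrt(12) times a uniform on [-0.5,0.5], as in Afshar et al.)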
+ const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp4 *restrict v_, const __global numtyp *restrict cutsq, - const numtyp dtinvsqrt, const int seed, + const numtyp dtinvsqrt, const int seed, const int timestep, const int tstat_only, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -296,7 +296,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, if (tid tag2) { tag1 = jtag; tag2 = itag; } - - numtyp randnum = (numtyp)0.0; + + numtyp randnum = (numtyp)0.0; saru(tag1, tag2, seed, timestep, randnum); // conservative force = a0 * wd, or 0 if tstat only @@ -364,7 +364,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, force -= coeff[mtype].y*wd*wd*dot*rinv; force += coeff[mtype].z*wd*randnum*dtinvsqrt; force*=factor_dpd*rinv; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; @@ -374,7 +374,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; - energy+=factor_dpd*e; + energy+=factor_dpd*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_dpd.h b/lib/gpu/lal_dpd.h index 449d7b1d8c..42ef854522 100644 --- a/lib/gpu/lal_dpd.h +++ b/lib/gpu/lal_dpd.h @@ -24,23 +24,23 @@ template class DPD : public BaseDPD { public: DPD(); - ~DPD(); + ~DPD(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_a0, + int init(const int ntypes, double **host_cutsq, double **host_a0, double **host_gamma, double **host_sigma, double **host_cut, double *host_special_lj, bool tstat_only, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, + const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -52,11 +52,11 @@ class DPD : public BaseDPD { /// Total host memory used by library for pair style double host_memory_usage() const; - + /// Update coeff if needed (tstat only) void update_coeff(int ntypes, double **host_a0, double **host_gamma, double **host_sigma, double **host_cut); - + // --------------------------- TYPE DATA -------------------------- /// coeff.x = a0, coeff.y = gamma, coeff.z = sigma, coeff.w = cut @@ -70,12 +70,12 @@ class DPD : public BaseDPD { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// Only used for thermostat int _tstat_only; - + private: bool _allocated; void loop(const bool _eflag, const bool _vflag); diff --git a/lib/gpu/lal_dpd_ext.cpp b/lib/gpu/lal_dpd_ext.cpp index 327074d087..26bbb660b8 100644 --- a/lib/gpu/lal_dpd_ext.cpp +++ b/lib/gpu/lal_dpd_ext.cpp @@ -54,7 +54,7 @@ int 
dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, int init_ok=0; if (world_me==0) - init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, + init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, host_cut, special_lj, tstat_only, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -72,12 +72,12 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, + init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, host_cut, special_lj, tstat_only, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); DPDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -95,25 +95,25 @@ void dpd_gpu_clear() { int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd) { return DPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, + vatom, host_start, ilist, jnum, cpu_time, success, host_v, dtinvsqrt, seed, timestep, boxlo, prd); -} - +} + void dpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, tagint *tag, - double **host_v, const double dtinvsqrt, - const int seed, const int timestep, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { DPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index c856a8e667..b83972f4db 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -9,10 +9,10 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ - + #if defined(USE_OPENCL) #include "eam_cl.h" #elif defined(USE_CUDART) @@ -33,7 +33,7 @@ using namespace LAMMPS_AL; extern Device device; template -EAMT::EAM() : BaseAtomic(), +EAMT::EAM() : BaseAtomic(), _compiled_energy(false), _allocated(false) { } @@ -41,46 +41,46 @@ template EAMT::~EAM() { clear(); } - + template int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, - double ***host_frho_spline, double rdr, double rdrho, + double ***host_frho_spline, double rdr, double rdrho, double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, - const int 
maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,eam,"k_eam"); - + if (success!=0) return success; - + // allocate fp - + int ef_nall=nall; if (ef_nall==0) ef_nall=2000; _max_fp_size=static_cast(static_cast(ef_nall)*1.10); _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - + k_energy.set_function(*(this->pair_program),"k_energy"); k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); fp_tex.bind_float(_fp,1); _compiled_energy = true; - + // Initialize timers for selected GPU time_pair2.init(*(this->ucl_device)); time_pair2.zero(); - + time_fp1.init(*(this->ucl_device)); time_fp1.zero(); - + time_fp2.init(*(this->ucl_device)); time_fp2.zero(); @@ -93,7 +93,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, lj_types=max_shared_types; shared_types=true; } - + _ntypes=lj_types; _cutforcesq=host_cutforcesq; _rdr=rdr; @@ -104,26 +104,26 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, _nz2r=nz2r; _nfrho=nfrho; _nr=nr; - + UCL_H_Vec dview_type(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; iucl_device),UCL_READ_ONLY); - + for (int i=0; i dview_type2frho(lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -136,7 +136,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, // pack frho_spline UCL_H_Vec dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ix dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ix dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline1,dview_z2r_spline,false); z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex"); z2r_spline1_tex.bind_float(z2r_spline1,4); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline2,dview_z2r_spline,false); z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex"); @@ -241,7 +241,7 @@ void EAMT::clear() { if (!_allocated) return; _allocated=false; - + type2rhor_z2r.clear(); type2frho.clear(); rhor_spline1.clear(); @@ -250,13 +250,13 @@ void EAMT::clear() { frho_spline2.clear(); z2r_spline1.clear(); z2r_spline2.clear(); - + _fp.clear(); - + time_pair2.clear(); time_fp1.clear(); time_fp2.clear(); - + if (_compiled_energy) { k_energy_fast.clear(); k_energy.clear(); @@ -283,20 +283,20 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); - + if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); - + // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); - + // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } - + // ------------------- Resize FP Array for EAM -------------------- - + if (nall>_max_fp_size) { _max_fp_size=static_cast(static_cast(nall)*1.10); _fp.resize(_max_fp_size); @@ -313,7 +313,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, this->zero_timers(); return; } - + int 
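// (editor's note, not in the original patch: hd_balancer splits the inum_full
// local atoms between device and host; balance() returns the device share
// inum, and atoms from host_start=inum onward are handled on the CPU.)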
ago=this->hd_balancer.ago_first(f_ago); int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); this->ans->inum(inum); @@ -326,7 +326,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, if (!success) return; } - + this->atom->cast_x_data(host_x,host_type); this->atom->add_x_data(host_x,host_type); @@ -345,36 +345,36 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, // --------------------------------------------------------------------------- template int** EAMT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, + double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, int &inum, + const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); - + if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); - + // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); - + // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } // ------------------- Resize FP Array for EAM -------------------- - + if (nall>_max_fp_size) { _max_fp_size=static_cast(static_cast(nall)*1.10); _fp.resize(_max_fp_size); fp_tex.bind_float(_fp,1); - } - *fp_ptr=_fp.host.begin(); + } + *fp_ptr=_fp.host.begin(); // ----------------------------------------------------------------- - + if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -382,14 +382,14 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, this->zero_timers(); return NULL; } - + // load balance, returning the atom count on the device (inum) this->hd_balancer.balance(cpu_time); inum=this->hd_balancer.get_gpu_count(ago,inum_full); this->ans->inum(inum); host_start=inum; - - // Build neighbor list on GPU if necessary + + // Build neighbor list on GPU if necessary if (ago==0) { this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); @@ -403,14 +403,14 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, *jnum=this->nbor->host_acc.begin(); loop(eflag,vflag); - + // copy fp from device to host for comm _nlocal=inum_full; time_fp1.start(); _fp.update_host(inum_full,true); time_fp1.stop(); time_fp1.sync_stop(); - + return this->nbor->host_jlist.begin()-host_start; } @@ -420,20 +420,20 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, template void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { - if (this->ans->inum()==0) + if (this->ans->inum()==0) return; - + this->hd_balancer.start_timer(); time_fp2.start(); this->add_fp_data(); time_fp2.stop(); - + loop2(eflag,vflag); if (ilist == NULL) this->ans->copy_answers(eflag,vflag,eatom,vatom); else this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist); - + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -455,27 +455,27 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int 
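// (editor's note, not in the original patch: GX above is the launch grid,
// ceil(inum/(BX/t_per_atom)) blocks of BX threads, so every atom gets
// t_per_atom cooperating threads whose partial sums are reduced before
// the results are stored.)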
nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); - + if (shared_types) { this->k_energy_fast.set_size(GX,BX); this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1,&frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &rhor_spline2, &frho_spline1,&frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv, &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1, &frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, + &rhor_spline2, &frho_spline1, &frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv,&eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); @@ -501,25 +501,25 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair2.start(); - + if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r, - &rhor_spline1, &z2r_spline1, &z2r_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &rhor_spline1, &z2r_spline1, &z2r_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, + this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, &z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index 054b3ca6db..13440b7d45 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ @@ -82,7 +82,7 @@ texture z2r_sp2_tex; engv[ii]=energy; \ } \ } - + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, elag, vflag, ans, engv) \ if (t_per_atom>1) { \ @@ -188,37 +188,37 @@ texture z2r_sp2_tex; #endif -__kernel void k_energy(const __global numtyp4 *restrict x_, +__kernel void k_energy(const __global numtyp4 *restrict x_, const __global int2 *restrict type2rhor_z2r, - const __global int *restrict type2frho, - const __global numtyp4 *restrict rhor_spline2, + const __global int *restrict type2frho, + const __global numtyp4 *restrict rhor_spline2, const __global numtyp4 *restrict frho_spline1, const __global numtyp4 *restrict frho_spline2, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global numtyp *restrict fp_, - __global acctyp *restrict engv, + __global numtyp *restrict fp_, + __global acctyp *restrict engv, const int eflag, const int inum, const int nbor_pitch, - const int ntypes, const numtyp cutforcesq, - const numtyp rdr, const 
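// (editor's note, not in the original patch: rdr and rdrho are presumably the
// reciprocal table spacings 1/dr and 1/drho used to map r and rho onto the
// nr- and nrho-point spline tables during interpolation.)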
numtyp rdrho, + const int ntypes, const numtyp cutforcesq, + const numtyp rdr, const numtyp rdrho, const numtyp rhomax, const int nrho, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + acctyp rho = (acctyp)0; acctyp energy = (acctyp)0; - + if (ii { public: EAM(); ~EAM(); - + /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,11 +41,11 @@ class EAM : public BaseAtomic { int init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, - double rdrho, double rhomax, int nrhor, int nrho, int nz2r, - int nfrho, int nr, const int nlocal, const int nall, + double rdrho, double rhomax, int nrhor, int nrho, int nz2r, + int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen); - + // Copy charges to device asynchronously inline void add_fp_data() { int nghost=this->atom->nall()-_nlocal; @@ -57,7 +57,7 @@ class EAM : public BaseAtomic { ucl_copy(dev_view,host_view,nghost,true); } } - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -67,7 +67,7 @@ class EAM : public BaseAtomic { /// Total host memory used by library for pair style double host_memory_usage() const; - + /// Pair loop with host neighboring void compute(const int f_ago, const int inum_full, const int, const int nall, double **host_x, int *host_type, int *ilist, int *numj, @@ -75,23 +75,23 @@ class EAM : public BaseAtomic { const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr); - + /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, int &inum, void **fp_ptr); /// Pair loop with host neighboring - void compute2(int *ilist, const bool eflag, const bool vflag, + void compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom); - + // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_energy, k_energy_fast; - + // --------------------------- TEXTURES ----------------------------- UCL_Texture fp_tex; UCL_Texture rhor_spline1_tex, rhor_spline2_tex; @@ -99,37 +99,37 @@ class EAM : public BaseAtomic { UCL_Texture z2r_spline1_tex, z2r_spline2_tex; // --------------------------- DEVICE DATA -------------------------- - + /// Device Timers UCL_Timer time_pair2, time_fp1, time_fp2; - + // --------------------------- TYPE DATA -------------------------- - + UCL_D_Vec type2rhor_z2r; UCL_D_Vec type2frho; - + UCL_D_Vec z2r_spline1, z2r_spline2; UCL_D_Vec frho_spline1, frho_spline2; UCL_D_Vec rhor_spline1, rhor_spline2; - + numtyp _cutforcesq,_rdr,_rdrho, _rhomax; - + int _nfrho,_nrhor,_nrho,_nz2r,_nr; - + /// If atom 
type constants fit in shared memory, use fast kernels bool shared_types; - - /// Number of atom types + + /// Number of atom types int _ntypes; - + int _max_fp_size; - + /// True of energy kernels are compiled bool _compiled_energy; - + /// Per-atom arrays UCL_Vector _fp; - + protected: bool _allocated; int _nlocal; diff --git a/lib/gpu/lal_eam_alloy_ext.cpp b/lib/gpu/lal_eam_alloy_ext.cpp index 282f93afeb..9209ed5c26 100644 --- a/lib/gpu/lal_eam_alloy_ext.cpp +++ b/lib/gpu/lal_eam_alloy_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMALMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, +int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMALMF.clear(); gpu_mode=EAMALMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMALMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMALMF.device->init_message(screen,"eam/alloy",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMALMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_alloy_gpu_clear() { int ** eam_alloy_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, 
int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_alloy_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index d56f750e2f..1b5602f808 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_gpu_init(const int ntypes, double host_cutforcesq, +int eam_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMMF.clear(); gpu_mode=EAMMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_gpu_clear() { int ** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, 
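/* -----------------------------------------------------------------------
   Note on the *_ext.cpp layout: each file owns one static, precision-
   specialized instance (EAMALMF/EAMMF/EAMFSMF) and exposes plain
   functions that forward to it, so the LAMMPS pair style never sees the
   template types.  A stripped-down sketch of the pattern (PairGPU and
   the pair_gpu_* names are illustrative only):
------------------------------------------------------------------------- */
template <class numtyp, class acctyp>
class PairGPU {
 public:
  int init() { return 0; }   /* allocate + copy constants in the real code */
  void clear() {}            /* release host and device data */
};

static PairGPU<float, double> PairGPUMF;  /* one instance per pair style */

int pair_gpu_init() { return PairGPUMF.init(); }
void pair_gpu_clear() { PairGPUMF.clear(); }
/* --------------------------- end sketch ------------------------------- */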
double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_eam_fs_ext.cpp b/lib/gpu/lal_eam_fs_ext.cpp index 4992f3ab98..b9e25466aa 100644 --- a/lib/gpu/lal_eam_fs_ext.cpp +++ b/lib/gpu/lal_eam_fs_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMFSMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, +int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMFSMF.clear(); gpu_mode=EAMFSMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMFSMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMFSMF.device->init_message(screen,"eam/fs",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, 
nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_fs_gpu_clear() { int ** eam_fs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_fs_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index b33f087212..71668f5e02 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -245,8 +245,8 @@ ucl_inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans) ucl_inline numtyp gpu_det3(const numtyp m[9]) { - numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - - m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + + numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - + m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; return ans; }; @@ -255,7 +255,7 @@ ucl_inline numtyp gpu_det3(const numtyp m[9]) diagonal matrix times a full matrix ------------------------------------------------------------------------- */ -ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], +ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], numtyp ans[9]) { ans[0] = shape.x*m[0]; @@ -421,7 +421,7 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, t = aug[9]/aug[5]; aug[10]-=t*aug[6]; aug[11]-=t*aug[7]; - + if (aug[10] == (numtyp)0.0) *error_flag=2; @@ -440,11 +440,11 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, quat = [w i j k] ------------------------------------------------------------------------- */ -ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, +ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, numtyp mat[9]) { numtyp4 q; fetch4(q,qi,quat_tex); - + numtyp w2 = q.x*q.x; numtyp i2 = q.y*q.y; numtyp j2 = q.z*q.z; @@ -463,7 +463,7 @@ ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, mat[1] = twoij+twokw; mat[4] = w2-i2+j2-k2; mat[7] = twojk-twoiw; - + mat[2] = twoik-twojw; mat[5] = twojk+twoiw; mat[8] = w2-i2-j2+k2; @@ -561,7 +561,7 @@ ucl_inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9]) ------------------------------------------------------------------------- */ ucl_inline void gpu_times_column3(const numtyp m[9], const numtyp v[3], - numtyp ans[3]) + numtyp ans[3]) { ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2]; ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2]; diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 30d864aecc..cac77f5dd3 
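/* -----------------------------------------------------------------------
   Note on gpu_quat_to_mat_trans() above: it expands a unit quaternion
   [w i j k] into the transpose of its rotation matrix.  The same algebra
   as a self-contained host function (the first column is inferred from
   the standard rotation-matrix form; the remaining entries mirror the
   hunk):
------------------------------------------------------------------------- */
void quat_to_mat_trans(const double q[4], double mat[9]) {
  double w2 = q[0]*q[0], i2 = q[1]*q[1], j2 = q[2]*q[2], k2 = q[3]*q[3];
  double twoij = 2.0*q[1]*q[2], twoik = 2.0*q[1]*q[3], twojk = 2.0*q[2]*q[3];
  double twoiw = 2.0*q[1]*q[0], twojw = 2.0*q[2]*q[0], twokw = 2.0*q[3]*q[0];

  mat[0] = w2+i2-j2-k2;  mat[3] = twoij-twokw;  mat[6] = twoik+twojw;
  mat[1] = twoij+twokw;  mat[4] = w2-i2+j2-k2;  mat[7] = twojk-twoiw;
  mat[2] = twoik-twojw;  mat[5] = twojk+twoiw;  mat[8] = w2-i2-j2+k2;
}
/* --------------------------- end sketch ------------------------------- */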
100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -29,14 +29,14 @@ texture pos_tex; // -- Only unpack neighbors matching the specified inclusive range of forms // -- Only unpack neighbors within cutoff // --------------------------------------------------------------------------- -__kernel void kernel_nbor(const __global numtyp4 *restrict x_, - const __global numtyp2 *restrict cut_form, - const int ntypes, +__kernel void kernel_nbor(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict cut_form, + const int ntypes, __global int *dev_nbor, - const int nbor_pitch, const int start, const int inum, - const __global int *dev_ij, + const int nbor_pitch, const int start, const int inum, + const __global int *dev_ij, const int form_low, const int form_high) { - + // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X+start; @@ -47,11 +47,11 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, nbor+=nbor_pitch; int nbor_end=nbor+fast_mul(numj,nbor_pitch); int packed=ii+nbor_pitch+nbor_pitch; - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul(iw,ntypes); - int newj=0; + int newj=0; for ( ; nbor=form_low && form[mtype]<=form_high) { // Compute r12; numtyp rsq=jx.x-ix.x; diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp index 342ec4ecda..1ef215d7ff 100644 --- a/lib/gpu/lal_gauss.cpp +++ b/lib/gpu/lal_gauss.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,19 +33,19 @@ GaussT::Gauss() : BaseAtomic(), _allocated(false) { } template -GaussT::~Gauss() { +GaussT::~Gauss() { clear(); } - + template int GaussT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int GaussT::init(const int ntypes, - double **host_cutsq, double **host_a, - double **host_b, double **host_offset, +int GaussT::init(const int ntypes, + double **host_cutsq, double **host_a, + double **host_b, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -75,7 +75,7 @@ int GaussT::init(const int ntypes, gauss1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b, - host_cutsq,host_offset); + host_cutsq,host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -94,12 +94,12 @@ void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b, - host_cutsq,host_offset); + host_cutsq,host_offset); } template @@ -135,7 +135,7 @@ void GaussT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu index 6accf36a06..98e71ea413 100644 --- a/lib/gpu/lal_gauss.cu +++ b/lib/gpu/lal_gauss.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // 
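/* -----------------------------------------------------------------------
   Note on the type_pack4() calls in Gauss::init() above: four per-type-
   pair host tables (a, b, cutsq, offset) are interleaved into one flat
   array of 4-vectors, so a kernel fetches all coefficients for a type
   pair with a single indexed load.  A sketch of that packing idea (not
   the library routine; float4 stands in for numtyp4, types are 1-based
   as in LAMMPS, and out must hold lj_types*lj_types zero-initialized
   entries):
------------------------------------------------------------------------- */
#include <vector_types.h>  /* CUDA float4; any struct with x,y,z,w works */

void pack4(int ntypes, int lj_types, float4 *out,
           double **a, double **b, double **c, double **d) {
  for (int i = 1; i <= ntypes; i++)
    for (int j = 1; j <= ntypes; j++) {
      float4 v;
      v.x = (float)a[i][j];  v.y = (float)b[i][j];
      v.z = (float)c[i][j];  v.w = (float)d[i][j];
      out[i*lj_types + j] = v;   /* flat row-major (itype,jtype) index */
    }
}
/* --------------------------- end sketch ------------------------------- */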
__________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,14 +24,14 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_gauss(const __global numtyp4 *restrict x_, +__kernel void k_gauss(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -49,20 +49,20 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -108,18 +108,18 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, +__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1_in, - const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -127,7 +127,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, if (tid0) { - numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_gauss.h b/lib/gpu/lal_gauss.h index 1fd58adae5..d023310c6d 100644 --- a/lib/gpu/lal_gauss.h +++ b/lib/gpu/lal_gauss.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Gauss : public BaseAtomic { public: Gauss(); - ~Gauss(); + ~Gauss(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,16 +38,16 @@ class Gauss : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_a, double **host_b, 
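/* -----------------------------------------------------------------------
   Note on the k_gauss kernels above: the accumulated pair energy is
   e = -(A*exp(-B*r^2) - offset).  Differentiating that energy gives the
   radial force factor such kernels fold into (delx,dely,delz); a scalar
   reference version, with sign conventions following the energy
   expression in the hunk:
------------------------------------------------------------------------- */
#include <math.h>

/* E(r) = -A*exp(-B*rsq); returns F/r, the factor multiplied into the
   displacement vector (before any special-bond scaling by factor_lj). */
double gauss_force_over_r(double A, double B, double rsq) {
  /* dE/dr = 2*A*B*r*exp(-B*r^2)  =>  F/r = (-dE/dr)/r */
  return -2.0 * A * B * exp(-B * rsq);
}
/* --------------------------- end sketch ------------------------------- */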
double **host_offset, + double **host_a, double **host_b, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_a, double **host_b, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Gauss : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index 7c15a12591..7fa4b68870 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -27,9 +27,9 @@ static Gauss GLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **host_b, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, +int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, + double **host_b, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GLMF.clear(); @@ -54,7 +54,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) - init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, + init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -77,7 +77,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, cell_size, gpu_split, screen); GLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,16 +96,16 @@ void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a, int world_me=GLMF.device->world_me(); int gpu_rank=GLMF.device->gpu_rank(); int procs_per_gpu=GLMF.device->procs_per_gpu(); - + if (world_me==0) GLMF.reinit(ntypes, cutsq, host_a, host_b, offset); - + GLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -124,8 +124,8 @@ int ** gauss_gpu_compute_n(const int ago, const int inum_full, return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void gauss_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_gayberne.cpp b/lib/gpu/lal_gayberne.cpp index 1d38810ae8..ba15af672e 100644 --- a/lib/gpu/lal_gayberne.cpp +++ b/lib/gpu/lal_gayberne.cpp @@ -37,21 +37,21 
@@ GayBerneT::GayBerne() : BaseEllipsoid(), } template -GayBerneT::~GayBerne() { +GayBerneT::~GayBerne() { clear(); } - + template int GayBerneT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom(max_nbors); } template -int GayBerneT::init(const int ntypes, const double gamma, - const double upsilon, const double mu, - double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, +int GayBerneT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, @@ -84,27 +84,27 @@ int GayBerneT::init(const int ntypes, const double gamma, sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write, - host_sigma,host_epsilon); + host_sigma,host_epsilon); this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write, - host_cutsq,h_form); + host_cutsq,h_form); lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq,h_form); + host_cutsq,h_form); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); - + // Allocate, cast and asynchronous memcpy of constant data // Copy data for bonded interactions gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY); - host_write[0]=static_cast(gamma); + host_write[0]=static_cast(gamma); host_write[1]=static_cast(upsilon); host_write[2]=static_cast(mu); host_write[3]=static_cast(host_special_lj[0]); @@ -117,7 +117,7 @@ int GayBerneT::init(const int ntypes, const double gamma, UCL_H_Vec d_view; d_view.view(host_lshape,lshape.numel(),*(this->ucl_device)); ucl_copy(lshape,d_view,false); - + // Copy shape, well, sigma, epsilon, and cutsq onto GPU // - cast if necessary shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -138,7 +138,7 @@ int GayBerneT::init(const int ntypes, const double gamma, } view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); ucl_copy(well,view4,false); - + _allocated=true; this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+ @@ -155,7 +155,7 @@ void GayBerneT::clear() { UCL_H_Vec err_flag(1,*(this->ucl_device)); ucl_copy(err_flag,dev_error,false); if (err_flag[0] == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; err_flag.clear(); _allocated=false; @@ -170,7 +170,7 @@ void GayBerneT::clear() { well.clear(); lshape.clear(); gamma_upsilon_mu.clear(); - + this->clear_base(); } @@ -196,7 +196,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -209,17 +209,17 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { (BX/this->_threads_per_atom))); NGX=static_cast(ceil(static_cast(this->_last_ellipse)/BX)); 
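/* -----------------------------------------------------------------------
   Note on the "cast if necessary" copies in GayBerne::init() above:
   double-precision host tables are narrowed into a scratch host buffer
   (host_write) and then copied to the device through a typed view.  The
   equivalent idea in plain CUDA, assuming numtyp = float:
------------------------------------------------------------------------- */
#include <cuda_runtime.h>

void cast_and_copy(const double *host, float *scratch, float *dev, int n) {
  for (int i = 0; i < n; i++)
    scratch[i] = (float)host[i];              /* cast if necessary */
  cudaMemcpy(dev, scratch, n * sizeof(float),
             cudaMemcpyHostToDevice);         /* ~ ucl_copy(dst, view, false) */
}
/* --------------------------- end sketch ------------------------------- */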
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE, - ELLIPSE_ELLIPSE,_shared_types,_lj_types); + ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, - &this->sigma_epsilon, &this->_lj_types, - &this->lshape, &this->nbor->dev_nbor, &stride, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, &ainum, &this->ans->engv, - &this->dev_error, &eflag, &vflag, + &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); @@ -242,18 +242,18 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { NGX=static_cast(ceil(static_cast(this->ans->inum()- this->_last_ellipse)/BX)); this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(), - SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); + SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); this->time_nbor2.stop(); this->time_ellipsoid2.start(); this->k_sphere_ellipsoid.set_size(GX,BX); this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, - &this->gamma_upsilon_mu, - &this->sigma_epsilon, &this->_lj_types, - &this->lshape, &this->nbor->dev_nbor, - &stride, &this->ans->force, - &this->ans->engv, &this->dev_error, + &this->shape, &this->well, + &this->gamma_upsilon_mu, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, + &stride, &this->ans->force, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); this->time_ellipsoid2.stop(); @@ -264,28 +264,28 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->time_ellipsoid.stop(); this->time_nbor2.start(); this->time_nbor2.stop(); this->time_ellipsoid2.start(); this->time_ellipsoid2.stop(); } - + // ------------ LJ --------------- this->time_lj.start(); if (this->_last_ellipseans->inum()) { if (this->_shared_types) { this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, - &this->gamma_upsilon_mu, &stride, + this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, - &this->ans->engv, &this->dev_error, &eflag, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); } else { this->k_lj.set_size(GX,BX); - this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, + this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, &this->_lj_types, &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, @@ -300,12 +300,12 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { NGX=static_cast(ceil(static_cast(this->ans->inum())/BX)); this->time_nbor1.start(); this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, - ELLIPSE_ELLIPSE,_shared_types,_lj_types); + ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, 
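/* -----------------------------------------------------------------------
   Note on GayBerne::loop() above: the atom range is partitioned at
   _last_ellipse, and up to three kernels run per step -- ellipsoid-
   ellipsoid, sphere-ellipsoid, and a plain LJ kernel for the spherical
   tail.  A schematic of that dispatch with the launches reduced to
   hypothetical stubs (the real code also packs form-filtered neighbor
   lists and drives per-phase timers around each launch):
------------------------------------------------------------------------- */
void run_ellipse_ellipse(int first, int last);   /* ~ k_ellipsoid        */
void run_sphere_ellipse(int first, int last);    /* ~ k_sphere_ellipsoid */
void run_lj(int first, int last);                /* ~ k_lj / k_lj_fast   */

void loop_dispatch(int inum, int last_ellipse) {
  if (last_ellipse > 0) {
    run_ellipse_ellipse(0, last_ellipse);        /* ELLIPSE_* forms      */
    if (last_ellipse < inum) {
      run_sphere_ellipse(last_ellipse, inum);    /* SPHERE_ELLIPSE forms */
      run_lj(last_ellipse, inum);                /* sphere-sphere tail   */
    }
  } else {
    run_ellipse_ellipse(0, inum);                /* single-pass fallback */
  }
}
/* --------------------------- end sketch ------------------------------- */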
&this->gamma_upsilon_mu, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, &ainum, &this->ans->engv, &this->dev_error, diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index 1a7e69eeba..dc6e00ec82 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -17,93 +17,93 @@ #include "lal_ellipsoid_extra.h" #endif -ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, +ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, numtyp ans[9]) { numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; den = ucl_recip(den); - + ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- - m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ - m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- - m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ - m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; - + m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ + m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- + m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ + m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; + ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ - (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- - (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- - m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ - m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; - + (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- + (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- + m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ + m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; + ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- - m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- - m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ - (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ - m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; - + m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- + m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ + (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ + m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; + ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ - m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ - m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- - m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- - m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; - + m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ + m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- + m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- + m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; + ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ - (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- - (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ - m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- - m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; - + (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- + (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ + m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- + m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; + ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- - m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ - (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ - m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- - (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; - + m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ + (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ + m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- + (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; + ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ - (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ - m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- - m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- - m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; - + (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ + m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- + m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- + m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; + 
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- - (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ - (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- - m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ - m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; - + (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ + (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- + m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ + m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; + ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- - m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- - m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ - (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ - m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; + m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- + m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ + (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ + m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; } __kernel void k_gayberne(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict gum, - const __global numtyp2 *restrict sig_eps, - const int ntypes, - const __global numtyp *restrict lshape, - const __global int *dev_nbor, - const int stride, - __global acctyp4 *restrict ans, - const int astride, - __global acctyp *restrict engv, - __global int *restrict err_flag, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict gum, + const __global numtyp2 *restrict sig_eps, + const int ntypes, + const __global numtyp *restrict lshape, + const __global int *dev_nbor, + const int stride, + __global acctyp4 *restrict ans, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, const int eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=gum[3]; - sp_lj[1]=gum[4]; - sp_lj[2]=gum[5]; - sp_lj[3]=gum[6]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; acctyp energy=(acctyp)0; acctyp4 f; @@ -124,7 +124,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp a1[9], b1[9], g1[9]; @@ -159,7 +159,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, numtyp a2[9]; gpu_quat_to_mat_trans(q,j,a2); - + numtyp u_r, dUr[3], tUr[3], eta, teta[3]; { // Compute U_r, dUr, eta, and teta // Compute g12 @@ -173,7 +173,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, } { // Compute U_r and dUr - + // Compute kappa numtyp kappa[3]; gpu_mldivide3(g12,r12,kappa,err_flag); @@ -189,7 +189,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, kappa[2]*=ir; // energy - + // compute u_r and dUr numtyp uslj_rsq; { @@ -203,7 +203,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, kappa[0]*=r; kappa[1]*=r; kappa[2]*=r; - + int mtype=fast_mul(ntypes,itype)+jtype; numtyp sigma = sig_eps[mtype].x; numtyp epsilon = sig_eps[mtype].y; @@ -235,14 +235,14 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, } } } - + // Compute eta { eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; numtyp det_g12 = gpu_det3(g12); eta = ucl_powr(eta/det_g12,gum[1]); } - + // Compute teta numtyp temp[9], tempv[3], tempv2[3]; compute_eta_torque(g12,a1,ishape,temp); @@ -255,7 +255,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, teta[0] = tempv2[0]; teta[1] = tempv2[1]; 
teta[2] = tempv2[2]; - + tempv[0] = temp1*temp[3]; tempv[1] = temp1*temp[4]; tempv[2] = temp1*temp[5]; @@ -272,7 +272,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, teta[1] += tempv2[1]; teta[2] += tempv2[2]; } - + numtyp chi, dchi[3], tchi[3]; { // Compute chi and dchi @@ -355,7 +355,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - + } // for nbor store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, vflag,ans,engv); diff --git a/lib/gpu/lal_gayberne.h b/lib/gpu/lal_gayberne.h index dacaf74282..8792f1f1db 100644 --- a/lib/gpu/lal_gayberne.h +++ b/lib/gpu/lal_gayberne.h @@ -25,14 +25,14 @@ template class GayBerne : public BaseEllipsoid { public: GayBerne(); - ~GayBerne(); + ~GayBerne(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device + * \param gpu_split fraction of particles handled by device * \return false if there is not sufficient memory or device init prob - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,18 +41,18 @@ class GayBerne : public BaseEllipsoid { * - -5 Double precision is not supported on card **/ int init(const int ntypes, const double gamma, const double upsilon, const double mu, double **host_shape, - double **host_well, double **host_cutsq, double **host_sigma, + double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, double *host_lshape, int **h_form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - const double *host_special_lj, const int nlocal, const int nall, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); - + /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; @@ -61,8 +61,8 @@ class GayBerne : public BaseEllipsoid { /// Device Error Flag - Set if a bad matrix inversion occurs UCL_D_Vec dev_error; - - // --------------------------- TYPE DATA -------------------------- + + // --------------------------- TYPE DATA -------------------------- /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form UCL_D_Vec lj1; @@ -72,12 +72,12 @@ class GayBerne : public BaseEllipsoid { UCL_D_Vec sigma_epsilon; // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ... 
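/* -----------------------------------------------------------------------
   Note on k_gayberne above: the three quantities accumulated per
   neighbor (u_r/dUr, eta/teta, chi/dchi) are the factors of the
   multiplicative Gay-Berne form.  In the code's notation (a hedged
   summary, in LaTeX):

     U_{12} = U_r(r_{12})\,\eta_{12}\,\chi_{12}, \qquad
     \eta_{12} = \left(\frac{2\,l_i\,l_j}{\det \mathbf{G}_{12}}\right)^{\upsilon}

   where lshape supplies l_i and l_j, gum[1] holds \upsilon, and the
   gpu_mldivide3() call solves \mathbf{G}_{12}\,\kappa = r_{12} for the
   \kappa vector feeding U_r and its derivative.
------------------------------------------------------------------------- */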
UCL_D_Vec gamma_upsilon_mu; - + /// If atom type constants fit in shared memory, use fast kernels bool _shared_types; int _lj_types; - - // --------------------------- ATOM DATA -------------------------- + + // --------------------------- ATOM DATA -------------------------- /// Aspherical Const Data for Atoms UCL_D_Vec shape, well; diff --git a/lib/gpu/lal_gayberne_ext.cpp b/lib/gpu/lal_gayberne_ext.cpp index e674fb376b..451550e7ef 100644 --- a/lib/gpu/lal_gayberne_ext.cpp +++ b/lib/gpu/lal_gayberne_ext.cpp @@ -33,7 +33,7 @@ int gb_gpu_init(const int ntypes, const double gamma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); @@ -58,16 +58,16 @@ int gb_gpu_init(const int ntypes, const double gamma, int init_ok=0; if (world_me==0) - init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -105,8 +105,8 @@ void gb_gpu_clear() { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); @@ -117,8 +117,8 @@ int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { - return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, - tag, nspecial, special, eflag, vflag, eatom, vatom, + return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index 9b33b5f7f3..7925b72784 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -18,30 +18,30 @@ #endif __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict q, + const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict gum, + const __global numtyp4 *restrict well, + const __global numtyp *restrict gum, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global numtyp *restrict lshape, - const __global int *dev_nbor, + const __global int *dev_nbor, const int stride, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, - __global int 
*restrict err_flag, + __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=gum[3]; - sp_lj[1]=gum[4]; - sp_lj[2]=gum[5]; - sp_lj[3]=gum[6]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; acctyp energy=(acctyp)0; acctyp4 f; @@ -58,16 +58,16 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; - + numtyp oner=shape[itype].x; numtyp one_well=well[itype].x; - + numtyp factor_lj; for ( ; nbor0) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); - energy+=factor_lj*(e-lj3[ii].z); + energy+=factor_lj*(e-lj3[ii].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -332,33 +332,33 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict gum, - const int stride, +__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict gum, + const int stride, const __global int *dev_ij, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, - __global int *restrict err_flag, - const int eflag, const int vflag, - const int start, const int inum, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; - __local numtyp sp_lj[4]; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (tid<4) - sp_lj[tid]=gum[tid+3]; + sp_lj[tid]=gum[tid+3]; if (tid0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; @@ -367,9 +367,9 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp index 6c6e145319..978b33e5d7 100644 --- a/lib/gpu/lal_lj.cpp +++ b/lib/gpu/lal_lj.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,20 +33,20 @@ LJT::LJ() : BaseAtomic(), _allocated(false) { } template -LJT::~LJ() { +LJT::~LJ() { clear(); } - + template int LJT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int LJT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int LJT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double 
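/* -----------------------------------------------------------------------
   Note on the sp_lj staging visible in the kernels above: the fast
   variants let the first four threads copy special_lj from gum[3..6]
   into shared/__local memory, then barrier before the pair loop.  The
   same pattern in plain CUDA:
------------------------------------------------------------------------- */
__global__ void pair_kernel(const float *gum /*, ... */) {
  __shared__ float sp_lj[4];
  int tid = threadIdx.x;
  if (tid < 4)
    sp_lj[tid] = gum[tid + 3];   /* special_lj[0..3] sit at gum[3..6] */
  __syncthreads();               /* publish sp_lj to the whole block  */
  /* ... neighbor loop scales 1-2/1-3/1-4 pair terms by sp_lj[...] ... */
}
/* --------------------------- end sketch ------------------------------- */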
**host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -76,11 +76,11 @@ int LJT::init(const int ntypes, lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq); + host_cutsq); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -99,10 +99,10 @@ void LJT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq); this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, @@ -143,7 +143,7 @@ void LJT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -155,12 +155,12 @@ void LJT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu index 9569cb0fd7..5838ac95cf 100644 --- a/lib/gpu/lal_lj.cu +++ b/lib/gpu/lal_lj.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_lj(const __global numtyp4 *restrict x_, +__kernel void k_lj(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -44,19 +44,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += 
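/* -----------------------------------------------------------------------
   Note on the GX computation in LJ::loop() above: with BX threads per
   block and t_per_atom threads cooperating on each atom, a block covers
   BX/t_per_atom atoms, so the grid is the ceiling of inum over that.
   An integer version of the same sizing:
------------------------------------------------------------------------- */
static inline int grid_size(int inum, int BX, int t_per_atom) {
  int atoms_per_block = BX / t_per_atom;   /* t_per_atom divides BX */
  return (inum + atoms_per_block - 1) / atoms_per_block;
}
/* --------------------------- end sketch ------------------------------- */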
delx*delx*force; @@ -101,19 +101,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -124,7 +124,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -133,7 +133,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h index 63a3e8a6c9..01ce85c8ea 100644 --- a/lib/gpu/lal_lj.h +++ b/lib/gpu/lal_lj.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class LJ : public BaseAtomic { public: LJ(); - ~LJ(); + ~LJ(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,15 +40,15 @@ class LJ : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -71,7 +71,7 @@ class LJ : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp index 70e46b9fe1..191f211ae4 100644 --- a/lib/gpu/lal_lj96.cpp +++ b/lib/gpu/lal_lj96.cpp @@ -9,7 
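/* -----------------------------------------------------------------------
   Note on the k_lj inner loop above: force = r2inv*r6inv*(lj1.x*r6inv -
   lj1.y) and e = r6inv*(lj3.x*r6inv - lj3.y) are the 12-6 LJ terms with
   precomputed prefactors.  In the usual LAMMPS lj/cut convention (stated
   as background, not read from this diff): lj1 = 48*eps*sigma^12,
   lj2 = 24*eps*sigma^6, lj3 = 4*eps*sigma^12, lj4 = 4*eps*sigma^6.
   A scalar reference:
------------------------------------------------------------------------- */
void lj_pair(double rsq, double lj1, double lj2, double lj3, double lj4,
             double *f_over_r, double *energy) {
  double r2inv = 1.0 / rsq;
  double r6inv = r2inv * r2inv * r2inv;
  *f_over_r = r2inv * r6inv * (lj1 * r6inv - lj2);  /* F/r */
  *energy   = r6inv * (lj3 * r6inv - lj4);          /* offset shift is
                                                       applied by caller */
}
/* --------------------------- end sketch ------------------------------- */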
+9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -36,7 +36,7 @@ template LJ96T::~LJ96() { clear(); } - + template int LJ96T::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,9 +44,9 @@ int LJ96T::bytes_per_atom(const int max_nbors) const { template int LJ96T::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -76,11 +76,11 @@ int LJ96T::init(const int ntypes, lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq); + host_cutsq); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -126,7 +126,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,7 +138,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu index b219b8bf0d..3bb7750022 100644 --- a/lib/gpu/lal_lj96.cu +++ b/lib/gpu/lal_lj96.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -26,13 +26,13 @@ texture pos_tex; __kernel void k_lj96(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -109,15 +109,15 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj96_fast(const __global numtyp4 *restrict 
x_, +__kernel void k_lj96_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -132,30 +132,30 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj96.h b/lib/gpu/lal_lj96.h index 7d51e287d3..3fdea5265e 100644 --- a/lib/gpu/lal_lj96.h +++ b/lib/gpu/lal_lj96.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class LJ96 : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class LJ96 : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -66,7 +66,7 @@ class LJ96 : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp index 14c32ef95e..5c4a58c5e8 100644 --- a/lib/gpu/lal_lj96_ext.cpp +++ b/lib/gpu/lal_lj96_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, cell_size, gpu_split, screen); LJ96MF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ int** lj96_gpu_compute_n(const int ago, const int inum_full, return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void lj96_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, 
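/* -----------------------------------------------------------------------
   Note on the k_lj96 kernels above: they differ from 12-6 LJ only in the
   extra r3inv factor, i.e. a 9-6 potential E = 4*eps*((sigma/r)^9 -
   (sigma/r)^6).  The matching prefactors (36/24 in the force, 4/4 in the
   energy) are an assumption consistent with the kernel algebra, not read
   from this diff.  A scalar reference:
------------------------------------------------------------------------- */
#include <math.h>

void lj96_pair(double rsq, double eps, double sigma,
               double *f_over_r, double *energy) {
  double r2inv = 1.0 / rsq;
  double r6inv = r2inv * r2inv * r2inv;
  double r3inv = sqrt(r6inv);
  double lj1 = 36.0*eps*pow(sigma,9), lj2 = 24.0*eps*pow(sigma,6);
  double lj3 =  4.0*eps*pow(sigma,9), lj4 =  4.0*eps*pow(sigma,6);
  *f_over_r = r2inv * r6inv * (lj1 * r3inv - lj2);
  *energy   = r6inv * (lj3 * r3inv - lj4);
}
/* --------------------------- end sketch ------------------------------- */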
                      int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp
index ef59843c4a..497e5989ad 100644
--- a/lib/gpu/lal_lj_class2_long.cpp
+++ b/lib/gpu/lal_lj_class2_long.cpp
@@ -38,7 +38,7 @@ template <class numtyp, class acctyp>
 LJClass2LongT::~LJClass2Long() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJClass2LongT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -46,8 +46,8 @@ int LJClass2LongT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJClass2LongT::init(const int ntypes, double **host_cutsq,
-                        double **host_lj1, double **host_lj2, double **host_lj3,
-                        double **host_lj4, double **host_offset,
+                        double **host_lj1, double **host_lj2, double **host_lj3,
+                        double **host_lj4, double **host_offset,
                         double *host_special_lj, const int nlocal,
                         const int nall, const int max_nbors,
                         const int maxspecial, const double cell_size,
@@ -80,11 +80,11 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq, host_cut_ljsq);
+                         host_cutsq, host_cut_ljsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
@@ -136,7 +136,7 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -145,11 +145,11 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
-                          &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                          &_cut_coulsq, &_qqrd2e, &_g_ewald,
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu
index e16de3a327..41ceca35d7 100644
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@@ -32,15 +32,15 @@ texture q_tex;
 __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
                                const __global numtyp4 *restrict lj1,
                                const __global numtyp4 *restrict lj3,
-                               const int lj_types,
+                               const int lj_types,
                                const __global numtyp *restrict sp_lj_in,
-                               const __global int *dev_nbor,
+                               const __global int *dev_nbor,
                                const __global int *dev_packed,
                                __global acctyp4 *restrict ans,
                                __global acctyp *restrict engv,
-                               const int eflag, const int vflag,
-                               const int inum, const int nbor_pitch,
-                               const __global numtyp *restrict q_,
+                               const int eflag, const int vflag,
+                               const int inum, const int nbor_pitch,
+                               const __global numtyp *restrict q_,
                                const numtyp cut_coulsq, const numtyp qqrd2e,
                                const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
@@ -63,14 +63,14 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -147,20 +147,20 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
                                     const __global numtyp4 *restrict lj1_in,
-                                    const __global numtyp4 *restrict lj3_in,
+                                    const __global numtyp4 *restrict lj3_in,
                                     const __global numtyp *restrict sp_lj_in,
-                                    const __global int *dev_nbor,
+                                    const __global int *dev_nbor,
                                     const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
-                                    __global acctyp *restrict engv,
-                                    const int eflag, const int vflag,
-                                    const int inum, const int nbor_pitch,
+                                    __global acctyp4 *restrict ans,
+                                    __global acctyp *restrict engv,
+                                    const int eflag, const int vflag,
+                                    const int inum, const int nbor_pitch,
                                     const __global numtyp *restrict q_,
-                                    const numtyp cut_coulsq,
+                                    const numtyp cut_coulsq,
                                     const numtyp qqrd2e,
-                                    const numtyp g_ewald,
+                                    const numtyp g_ewald,
                                     const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -175,7 +175,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -183,16 +183,16 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_class2_long.h b/lib/gpu/lal_lj_class2_long.h
 class LJClass2Long : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,8 +40,8 @@ class LJClass2Long : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
-           const int maxspecial, const double cell_size,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const double qqrd2e, const double g_ewald);
@@ -68,7 +68,7 @@ class LJClass2Long : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _cut_coulsq, _qqrd2e, _g_ewald;
diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp
index 4bb3aad7ad..6ed15126d9 100644
--- a/lib/gpu/lal_lj_class2_long_ext.cpp
+++ b/lib/gpu/lal_lj_class2_long_ext.cpp
@@ -82,7 +82,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                      host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
 
     C2CLMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -99,7 +99,7 @@ void c2cl_gpu_clear() {
 
 int** c2cl_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, tagint *tag, int **nspecial,
+                         double *sublo, double *subhi, tagint *tag, int **nspecial,
                          tagint **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum, const double cpu_time,
@@ -109,8 +109,8 @@ int** c2cl_gpu_compute_n(const int ago, const int inum_full,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
                         host_q, boxlo, prd);
-}
-
+}
+
 void c2cl_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_coul.cpp b/lib/gpu/lal_lj_coul.cpp
index 8030f3cfc2..a8255318bd 100644
--- a/lib/gpu/lal_lj_coul.cpp
+++ b/lib/gpu/lal_lj_coul.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
 LJCoulT::~LJCoul() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCoulT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -45,9 +45,9 @@ int LJCoulT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJCoulT::init(const int ntypes,
-                  double **host_cutsq, double **host_lj1,
-                  double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **host_offset,
+                  double **host_cutsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3,
+                  double **host_lj4, double **host_offset,
                   double *host_special_lj, const int nlocal,
                   const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
@@ -79,11 +79,11 @@ int LJCoulT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cut_ljsq, host_cut_coulsq);
+                         host_cut_ljsq, host_cut_coulsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
@@ -138,7 +138,7 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -149,14 +149,14 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->ans->force, &this->ans->engv, &eflag,
+                          &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
                           &cutsq, &_qqrd2e, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                     &this->ans->force, &this->ans->engv,
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv,
                      &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
                      &cutsq, &_qqrd2e, &this->_threads_per_atom);
   }
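[Editor's sketch.] Every `*_fast` kernel in this patch follows one pattern: when the number of atom types is small enough that the `ntypes*ntypes` coefficient tables fit in `MAX_SHARED_TYPES`-sized shared memory, the block cooperatively stages `lj1`/`lj3`/`sp_lj` into `__local` storage once, synchronizes, and then indexes the cached table inside the neighbor loop. A minimal CUDA sketch of that staging pattern, with hypothetical names (`k_pair_fast_demo`, `MAX_SHARED_TYPES_DEMO`) rather than the actual Geryon kernels:

```cuda
#include <cuda_runtime.h>

#define MAX_SHARED_TYPES_DEMO 8   // assumed; real value is build-configured

// Assumes blockDim.x >= MAX_SHARED_TYPES_DEMO^2 so one pass loads the table.
__global__ void k_pair_fast_demo(const float4 *coeff_in, const int *pair_type,
                                 const float *rsq, float *force, int n) {
  __shared__ float4 coeff[MAX_SHARED_TYPES_DEMO * MAX_SHARED_TYPES_DEMO];
  int tid = threadIdx.x;
  // Cooperative load of the whole type-pair table, as in k_lj_coul_fast.
  if (tid < MAX_SHARED_TYPES_DEMO * MAX_SHARED_TYPES_DEMO)
    coeff[tid] = coeff_in[tid];
  __syncthreads();

  int i = blockIdx.x * blockDim.x + tid;
  if (i < n) {
    float4 c = coeff[pair_type[i]];          // shared-memory lookup per pair
    float r2inv = 1.0f / rsq[i];
    float r6inv = r2inv * r2inv * r2inv;
    force[i] = r6inv * (c.x * r6inv - c.y) * r2inv;  // 12-6 LJ force over r
  }
}
```

The `__syncthreads()` before the `if (ii<inum)` guard in the real kernels exists for the same reason as here: every thread must see the completed table before any thread reads it.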
diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu
index 364203db22..5c7f0da46f 100644
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : brownw@ornl.gov
 // ***************************************************************************/
@@ -29,19 +29,19 @@ texture q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_coul(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul(const __global numtyp4 *restrict x_,
                         const __global numtyp4 *restrict lj1,
-                        const __global numtyp4 *restrict lj3,
-                        const int lj_types,
+                        const __global numtyp4 *restrict lj3,
+                        const int lj_types,
                         const __global numtyp *restrict sp_lj_in,
-                        const __global int *dev_nbor,
-                        const __global int *dev_packed,
+                        const __global int *dev_nbor,
+                        const __global int *dev_packed,
                         __global acctyp4 *restrict ans,
-                        __global acctyp *restrict engv,
+                        __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
-                        const int nbor_pitch,
-                        const __global numtyp *restrict q_,
-                        const __global numtyp *restrict cutsq,
+                        const int nbor_pitch,
+                        const __global numtyp *restrict q_,
+                        const __global numtyp *restrict cutsq,
                         const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -63,14 +63,14 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -140,16 +140,16 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
 
 __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
                              const __global numtyp4 *restrict lj1_in,
-                             const __global numtyp4 *restrict lj3_in,
+                             const __global numtyp4 *restrict lj3_in,
                              const __global numtyp *restrict sp_lj_in,
-                             const __global int *dev_nbor,
+                             const __global int *dev_nbor,
                              const __global int *dev_packed,
                              __global acctyp4 *restrict ans,
-                             __global acctyp *restrict engv,
-                             const int eflag, const int vflag, const int inum,
-                             const int nbor_pitch,
+                             __global acctyp *restrict engv,
+                             const int eflag, const int vflag, const int inum,
+                             const int nbor_pitch,
                              const __global numtyp *restrict q_,
-                             const __global numtyp *restrict _cutsq,
+                             const __global numtyp *restrict _cutsq,
                              const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -166,7 +166,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -174,16 +174,16 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul.h b/lib/gpu/lal_lj_coul.h
 class LJCoul : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,7 +40,7 @@ class LJCoul : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
+           const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            double **host_cut_coulsq, double *host_special_coul,
@@ -70,7 +70,7 @@ class LJCoul : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _qqrd2e;
diff --git a/lib/gpu/lal_lj_coul_debye.cpp b/lib/gpu/lal_lj_coul_debye.cpp
index 135a4dfd9d..92167f314f 100644
--- a/lib/gpu/lal_lj_coul_debye.cpp
+++ b/lib/gpu/lal_lj_coul_debye.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : nguyentd@ornl.gov
 ***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
 LJCoulDebyeT::~LJCoulDebye() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -45,9 +45,9 @@ int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJCoulDebyeT::init(const int ntypes,
-                       double **host_cutsq, double **host_lj1,
-                       double **host_lj2, double **host_lj3,
-                       double **host_lj4, double **host_offset,
+                       double **host_cutsq, double **host_lj1,
+                       double **host_lj2, double **host_lj3,
+                       double **host_lj4, double **host_offset,
                        double *host_special_lj, const int nlocal,
                        const int nall, const int max_nbors,
                        const int maxspecial, const double cell_size,
@@ -80,11 +80,11 @@ int LJCoulDebyeT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cut_ljsq, host_cut_coulsq);
+                         host_cut_ljsq, host_cut_coulsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
@@ -98,7 +98,7 @@ int LJCoulDebyeT::init(const int ntypes,
 
   _qqrd2e=qqrd2e;
   _kappa=kappa;
-
+
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
                    sp_lj.row_bytes();
@@ -140,7 +140,7 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -157,9 +157,9 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) {
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
-                     &ainum, &nbor_pitch, &this->atom->q, &cutsq,
+                     &ainum, &nbor_pitch, &this->atom->q, &cutsq,
                      &_qqrd2e, &_kappa, &this->_threads_per_atom);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu
index 308504c6c8..91b105b3da 100644
--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : nguyentd@ornl.gov
 // ***************************************************************************/
@@ -29,19 +29,19 @@ texture q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_debye(const __global numtyp4 *restrict x_,
+__kernel void k_lj_debye(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict lj1,
-                         const __global numtyp4 *restrict lj3,
-                         const int lj_types,
-                         const __global numtyp *restrict sp_lj_in,
-                         const __global int *dev_nbor,
-                         const __global int *dev_packed,
+                         const __global numtyp4 *restrict lj3,
+                         const int lj_types,
+                         const __global numtyp *restrict sp_lj_in,
+                         const __global int *dev_nbor,
+                         const __global int *dev_packed,
                          __global acctyp4 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
                          const __global numtyp *restrict q_ ,
-                         const __global numtyp *restrict cutsq,
+                         const __global numtyp *restrict cutsq,
                          const numtyp qqrd2e, const numtyp kappa,
                          const int t_per_atom) {
   int tid, ii, offset;
@@ -64,14 +64,14 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -147,15 +147,15 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
 
 __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp4 *restrict lj1_in,
-                              const __global numtyp4 *restrict lj3_in,
+                              const __global numtyp4 *restrict lj3_in,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
                               __global acctyp4 *restrict ans,
-                              __global acctyp *restrict engv,
-                              const int eflag, const int vflag, const int inum,
-                              const int nbor_pitch,
-                              const __global numtyp *restrict q_,
+                              __global acctyp *restrict engv,
+                              const int eflag, const int vflag, const int inum,
+                              const int nbor_pitch,
+                              const __global numtyp *restrict q_,
                               const __global numtyp *restrict _cutsq,
                               const numtyp qqrd2e, const numtyp kappa,
                               const int t_per_atom) {
@@ -174,7 +174,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -182,16 +182,16 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul_debye.h b/lib/gpu/lal_lj_coul_debye.h
 class LJCoulDebye : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,7 +40,7 @@ class LJCoulDebye : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
+           const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            double **host_cut_coulsq, double *host_special_coul,
@@ -70,7 +70,7 @@ class LJCoulDebye : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _qqrd2e,_kappa;
diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp
index 67f5a0075f..3a0a3593e7 100644
--- a/lib/gpu/lal_lj_coul_debye_ext.cpp
+++ b/lib/gpu/lal_lj_coul_debye_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : nguyentd@ornl.gov
 ***************************************************************************/
@@ -33,7 +33,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   double **host_cut_ljsq, double **host_cut_coulsq,
-                  double *host_special_coul, const double qqrd2e,
+                  double *host_special_coul, const double qqrd2e,
                   const double kappa) {
   LJCDMF.clear();
   gpu_mode=LJCDMF.device->gpu_mode();
@@ -82,7 +82,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                      host_cut_coulsq, host_special_coul, qqrd2e, kappa);
 
     LJCDMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -99,7 +99,7 @@ void ljcd_gpu_clear() {
 
 int** ljcd_gpu_compute_n(const int ago,
                         const int inum_full, const int nall,
                         double **host_x, int *host_type,
-                        double *sublo, double *subhi, tagint *tag, int **nspecial,
+                        double *sublo, double *subhi, tagint *tag, int **nspecial,
                         tagint **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
@@ -109,8 +109,8 @@ int** ljcd_gpu_compute_n(const int ago, const int inum_full,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success,
                        host_q, boxlo, prd);
-}
-
+}
+
 void ljcd_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp
index 3b5cc09805..b803101b9e 100644
--- a/lib/gpu/lal_lj_coul_ext.cpp
+++ b/lib/gpu/lal_lj_coul_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -81,7 +81,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                     host_cut_coulsq, host_special_coul, qqrd2e);
 
     LJCMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -98,7 +98,7 @@ void ljc_gpu_clear() {
 
 int** ljc_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
-                        double *sublo, double *subhi, tagint *tag, int **nspecial,
+                        double *sublo, double *subhi, tagint *tag, int **nspecial,
                         tagint **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
@@ -108,8 +108,8 @@ int** ljc_gpu_compute_n(const int ago, const int inum_full,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success,
                        host_q, boxlo, prd);
-}
-
+}
+
 void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
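[Editor's sketch.] Every `init()` in this patch funnels its host tables through `atom->type_pack4`, which interleaves four `ntypes x ntypes` host matrices into one `numtyp4` array so the kernels fetch all coefficients for a type pair with a single load. A rough host-side illustration of that packing idea, under an assumed name (`pack4_demo`) rather than the real Geryon signature; LAMMPS type indices actually start at 1, which is ignored here for brevity:

```cuda
#include <vector>
#include <vector_functions.h>   // make_float4 (host and device)

// Interleave four per-type-pair tables into one float4-per-pair table.
std::vector<float4> pack4_demo(int ntypes, double **a, double **b,
                               double **c, double **d) {
  std::vector<float4> out(ntypes * ntypes);
  for (int i = 0; i < ntypes; ++i)
    for (int j = 0; j < ntypes; ++j)
      out[i * ntypes + j] = make_float4((float)a[i][j], (float)b[i][j],
                                        (float)c[i][j], (float)d[i][j]);
  return out;   // would then be copied into a read-only device buffer
}
```

The payoff is in the kernels above: `lj1[mtype]` yields `lj1`, `lj2`, and both cutoffs in one coalesced read instead of four scattered ones.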
diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp
index 03f32a5fd0..29d648bed2 100644
--- a/lib/gpu/lal_lj_coul_long.cpp
+++ b/lib/gpu/lal_lj_coul_long.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
 LJCoulLongT::~LJCoulLong() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCoulLongT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -45,9 +45,9 @@ int LJCoulLongT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJCoulLongT::init(const int ntypes,
-                      double **host_cutsq, double **host_lj1,
-                      double **host_lj2, double **host_lj3,
-                      double **host_lj4, double **host_offset,
+                      double **host_cutsq, double **host_lj1,
+                      double **host_lj2, double **host_lj3,
+                      double **host_lj4, double **host_offset,
                       double *host_special_lj, const int nlocal,
                       const int nall, const int max_nbors,
                       const int maxspecial, const double cell_size,
@@ -80,11 +80,11 @@ int LJCoulLongT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq, host_cut_ljsq);
+                         host_cutsq, host_cut_ljsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
@@ -109,10 +109,10 @@ void LJCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
                                UCL_WRITE_ONLY);
-
+
   for (int i=0; i<_lj_types*_lj_types; i++)
     host_write[i]=0.0;
-
+
   this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
                          host_cutsq, host_cut_ljsq);
   this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
@@ -153,7 +153,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -162,7 +162,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu
index e0aa2e8a58..0e25bb2dbc 100644
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : brownw@ornl.gov
 // ***************************************************************************/
@@ -29,17 +29,17 @@ texture q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
                              const __global numtyp4 *restrict lj1,
-                             const __global numtyp4 *restrict lj3,
-                             const int lj_types,
-                             const __global numtyp *restrict sp_lj_in,
-                             const __global int *dev_nbor,
+                             const __global numtyp4 *restrict lj3,
+                             const int lj_types,
+                             const __global numtyp *restrict sp_lj_in,
+                             const __global int *dev_nbor,
                              const __global int *dev_packed,
                              __global acctyp4 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
-                             const int nbor_pitch,
+                             const int nbor_pitch,
                              const __global numtyp *restrict q_,
                              const numtyp cut_coulsq, const numtyp qqrd2e,
                              const numtyp g_ewald, const int t_per_atom) {
@@ -63,14 +63,14 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -145,14 +145,14 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
                                   const __global numtyp4 *restrict lj1_in,
-                                  const __global numtyp4 *restrict lj3_in,
+                                  const __global numtyp4 *restrict lj3_in,
                                   const __global numtyp *restrict sp_lj_in,
-                                  const __global int *dev_nbor,
+                                  const __global int *dev_nbor,
                                   const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
-                                  __global acctyp *restrict engv,
+                                  __global acctyp4 *restrict ans,
+                                  __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum, const int nbor_pitch,
                                   const __global numtyp *restrict q_,
@@ -171,7 +171,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -179,16 +179,16 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h
 class LJCoulLong : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,8 +40,8 @@ class LJCoulLong : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
-           const int maxspecial, const double cell_size,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const double qqrd2e, const double g_ewald);
@@ -73,7 +73,7 @@ class LJCoulLong : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _cut_coulsq, _qqrd2e, _g_ewald;
diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp
index dc93365f22..6f8b5c9fe1 100644
--- a/lib/gpu/lal_lj_coul_long_ext.cpp
+++ b/lib/gpu/lal_lj_coul_long_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -82,7 +82,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                      host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
 
     LJCLMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -102,15 +102,15 @@ void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
   int world_me=LJCLMF.device->world_me();
   int gpu_rank=LJCLMF.device->gpu_rank();
   int procs_per_gpu=LJCLMF.device->procs_per_gpu();
-
+
   if (world_me==0)
-    LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+    LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                   offset, host_cut_ljsq);
 
   LJCLMF.device->world_barrier();
-
+
   for (int i=0; i<procs_per_gpu; i++) {
     LJCLMF.device->gpu_barrier();
   }
@@ -122,7 +122,7 @@ void ljcl_gpu_clear() {
 
 int** ljcl_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, tagint *tag, int **nspecial,
+                         double *sublo, double *subhi, tagint *tag, int **nspecial,
                          tagint **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum, const double cpu_time,
@@ -132,8 +132,8 @@ int** ljcl_gpu_compute_n(const int ago, const int inum_full,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
                         host_q, boxlo, prd);
-}
-
+}
+
 void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_coul_msm.cpp b/lib/gpu/lal_lj_coul_msm.cpp
index dd045b7970..1358de9ee1 100644
--- a/lib/gpu/lal_lj_coul_msm.cpp
+++ b/lib/gpu/lal_lj_coul_msm.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
 LJCoulMSMT::~LJCoulMSM() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCoulMSMT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -45,8 +45,8 @@ int LJCoulMSMT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJCoulMSMT::init(const int ntypes,
-                     double **host_cutsq, double **host_lj1,
-                     double **host_lj2, double **host_lj3,
+                     double **host_cutsq, double **host_lj1,
+                     double **host_lj2, double **host_lj3,
                      double **host_lj4, double **host_gcons,
                      double **host_dgcons, double **host_offset,
                      double *host_special_lj, const int nlocal,
@@ -81,11 +81,11 @@ int LJCoulMSMT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq, host_cut_ljsq);
+                         host_cutsq, host_cut_ljsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   // pack gcons and dgcons
   int nrows, ncols;
@@ -93,11 +93,11 @@ int LJCoulMSMT::init(const int ntypes,
   ncols = 7;
   UCL_H_Vec<numtyp> dview_gcons(nrows*ncols,*(this->ucl_device),
                                 UCL_WRITE_ONLY);
-
+
   for (int ix=0; ix<nrows; ix++)
   gcons.alloc(nrows*ncols,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(gcons,dview_gcons,false);
   gcons_tex.get_texture(*(this->pair_program),"gcons_tex");
@@ -107,11 +107,11 @@ int LJCoulMSMT::init(const int ntypes,
   ncols = 6;
   UCL_H_Vec<numtyp> dview_dgcons(nrows*ncols,*(this->ucl_device),
                                  UCL_WRITE_ONLY);
-
+
   for (int ix=0; ix<nrows; ix++)
   dgcons.alloc(nrows*ncols,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(dgcons,dview_dgcons,false);
   dgcons_tex.get_texture(*(this->pair_program),"dgcons_tex");
@@ -170,7 +170,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -179,7 +179,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj,
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu
index 0c7c3cdace..3f73c6f47d 100644
--- a/lib/gpu/lal_lj_coul_msm.cu
+++ b/lib/gpu/lal_lj_coul_msm.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : nguyentd@ornl.gov
 // ***************************************************************************/
@@ -80,19 +80,19 @@ ucl_inline numtyp dgamma(const numtyp rho, const int order,
     return ((numtyp)-1.0/rho/rho);
 }
 
-__kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict lj1,
                             const __global numtyp4 *restrict lj3,
                             const __global numtyp *restrict gcons,
                             const __global numtyp *restrict dgcons,
                             const int lj_types,
-                            const __global numtyp *restrict sp_lj_in,
-                            const __global int *dev_nbor,
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
                             const __global int *dev_packed,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
-                            const int nbor_pitch,
+                            const int nbor_pitch,
                             const __global numtyp *restrict q_,
                             const numtyp cut_coulsq, const numtyp qqrd2e,
                             const int order, const int t_per_atom) {
@@ -116,20 +116,20 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -199,7 +199,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp4 *restrict lj1_in,
                                  const __global numtyp4 *restrict lj3_in,
                                  const __global numtyp *restrict gcons,
@@ -227,7 +227,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -235,16 +235,16 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul_msm.h b/lib/gpu/lal_lj_coul_msm.h
 class LJCoulMSM : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -41,8 +41,8 @@ class LJCoulMSM : public BaseCharge {
   int init(const int ntypes, double **host_cutsq,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_gcons, double **host_dgcons,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
-           const int maxspecial, const double cell_size,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const int order, const double qqrd2e);
@@ -65,14 +65,14 @@ class LJCoulMSM : public BaseCharge {
   UCL_D_Vec lj3;
   /// Special LJ values [0-3] and Special Coul values [4-7]
   UCL_D_Vec sp_lj;
-
+
   UCL_D_Vec gcons, dgcons;
   UCL_Texture gcons_tex, dgcons_tex;
-
+
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _cut_coulsq, _qqrd2e;
diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp
index ecf3254cf9..bf520e4dc5 100644
--- a/lib/gpu/lal_lj_coul_msm_ext.cpp
+++ b/lib/gpu/lal_lj_coul_msm_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -84,7 +84,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                       host_cut_coulsq, host_special_coul, order, qqrd2e);
 
     LJCMLMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -101,7 +101,7 @@ void ljcm_gpu_clear() {
 
 int** ljcm_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, tagint *tag, int **nspecial,
+                         double *sublo, double *subhi, tagint *tag, int **nspecial,
                          tagint **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum, const double cpu_time,
@@ -111,8 +111,8 @@ int** ljcm_gpu_compute_n(const int ago, const int inum_full,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
                         host_q, boxlo, prd);
-}
-
+}
+
 void ljcm_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_cubic.cpp b/lib/gpu/lal_lj_cubic.cpp
index 25f83166e1..21ea22845c 100644
--- a/lib/gpu/lal_lj_cubic.cpp
+++ b/lib/gpu/lal_lj_cubic.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -33,21 +33,21 @@ LJCubicT::LJCubic() : BaseAtomic(), _allocated(false) {
 }
 
 template <class numtyp, class acctyp>
-LJCubicT::~LJCubic() {
+LJCubicT::~LJCubic() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCubicT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }
 
 template <class numtyp, class acctyp>
-int LJCubicT::init(const int ntypes,
+int LJCubicT::init(const int ntypes,
                    double **host_cutsq, double **host_cut_inner_sq,
-                   double **host_cut_inner, double **host_sigma,
-                   double **host_epsilon, double **host_lj1,
-                   double **host_lj2, double **host_lj3, double **host_lj4,
+                   double **host_cut_inner, double **host_sigma,
+                   double **host_epsilon, double **host_lj1,
+                   double **host_lj2, double **host_lj3, double **host_lj4,
                    double *host_special_lj, const int nlocal,
                    const int nall, const int max_nbors,
                    const int maxspecial, const double cell_size,
@@ -77,11 +77,11 @@ int LJCubicT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq);
+                         host_cutsq);
 
   lj2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj2,host_write,host_cut_inner_sq,
-                         host_cut_inner,host_sigma,host_epsilon);
+                         host_cut_inner,host_sigma,host_epsilon);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack2(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4);
@@ -132,7 +132,7 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -144,12 +144,12 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
-                          &vflag, &ainum, &nbor_pitch,
+                          &vflag, &ainum, &nbor_pitch,
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &lj1, &lj2, &lj3, &_lj_types, &sp_lj,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+    this->k_pair.run(&this->atom->x, &lj1, &lj2, &lj3, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
                      &ainum,
                     &nbor_pitch, &this->_threads_per_atom);
   }
diff --git a/lib/gpu/lal_lj_cubic.cu b/lib/gpu/lal_lj_cubic.cu
index 420689383f..a4b1992f33 100644
--- a/lib/gpu/lal_lj_cubic.cu
+++ b/lib/gpu/lal_lj_cubic.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : ndactrung@gmail.com
 // ***************************************************************************/
@@ -31,16 +31,16 @@ texture pos_tex;
 #define _DPHIDS (numtyp)2.6899009 // gradient at s
 #define _A3 (numtyp)27.93357      // cubic coefficient
 
-__kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
+__kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict lj1,
                          const __global numtyp4 *restrict lj2,
-                         const __global numtyp2 *restrict lj3,
-                         const int lj_types,
-                         const __global numtyp *restrict sp_lj,
-                         const __global int * dev_nbor,
-                         const __global int * dev_packed,
-                         __global acctyp4 *restrict ans,
-                         __global acctyp *restrict engv,
+                         const __global numtyp2 *restrict lj3,
+                         const int lj_types,
+                         const __global numtyp *restrict sp_lj,
+                         const __global int * dev_nbor,
+                         const __global int * dev_packed,
+                         __global acctyp4 *restrict ans,
+                         __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -52,19 +52,19 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
       if (eflag>0) {
         numtyp e;
-        if (rsq <= lj2[mtype].x)
+        if (rsq <= lj2[mtype].x)
          e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
         else
          e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0);
-        energy+=factor_lj*e;
+        energy+=factor_lj*e;
       }
       if (vflag>0) {
         virial[0] += delx*delx*force;
@@ -122,20 +122,20 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp4 *restrict lj1_in,
                               const __global numtyp4 *restrict lj2_in,
-                              const __global numtyp2 *restrict lj3_in,
-                              const __global numtyp *restrict sp_lj_in,
-                              const __global int * dev_nbor,
-                              const __global int * dev_packed,
-                              __global acctyp4 *restrict ans,
-                              __global acctyp *restrict engv,
-                              const int eflag, const int vflag, const int inum,
+                              const __global numtyp2 *restrict lj3_in,
+                              const __global numtyp *restrict sp_lj_in,
+                              const __global int * dev_nbor,
+                              const __global int * dev_packed,
+                              __global acctyp4 *restrict ans,
+                              __global acctyp *restrict engv,
+                              const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
-
+
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp2 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -148,7 +148,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -157,7 +157,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   __syncthreads();
-
+
   if (ii<inum) {
       if (eflag>0) {
         numtyp e;
-        if (rsq <= lj2[mtype].x)
+        if (rsq <= lj2[mtype].x)
          e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
         else
          e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0);
-        energy+=factor_lj*e;
+        energy+=factor_lj*e;
       }
       if (vflag>0) {
         virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_lj_cubic.h b/lib/gpu/lal_lj_cubic.h
index 0fefc727eb..818fb3581b 100644
--- a/lib/gpu/lal_lj_cubic.h
+++ b/lib/gpu/lal_lj_cubic.h
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
 class LJCubic : public BaseAtomic {
  public:
   LJCubic();
-  ~LJCubic();
+  ~LJCubic();
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -39,11 +39,11 @@ class LJCubic : public BaseAtomic {
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq, double **host_cut_inner_sq,
            double **host_cut_inner, double **host_sigma, double **host_epsilon,
-           double **host_lj1, double **host_lj2, double **host_lj3,
-           double **host_lj4, double *host_special_lj, const int nlocal,
-           const int nall, const int max_nbors, const int maxspecial,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double *host_special_lj, const int nlocal,
+           const int nall, const int max_nbors, const int maxspecial,
            const double cell_size, const double gpu_split, FILE *screen);
-
+
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
@@ -68,7 +68,7 @@ class LJCubic : public BaseAtomic {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
  private:
diff --git a/lib/gpu/lal_lj_cubic_ext.cpp b/lib/gpu/lal_lj_cubic_ext.cpp
index 518f706781..efbcee0a9f 100644
--- a/lib/gpu/lal_lj_cubic_ext.cpp
+++ b/lib/gpu/lal_lj_cubic_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -27,11 +27,11 @@ static LJCubic LJCubicLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq,
+int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq,
                   double **cut_inner, double **sigma, double **epsilon,
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double *special_lj,
-                  const int inum, const int nall, const int max_nbors,
+                  double **host_lj1, double **host_lj2, double **host_lj3,
+                  double **host_lj4, double *special_lj,
+                  const int inum, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   int &gpu_mode, FILE *screen) {
   LJCubicLMF.clear();
@@ -81,7 +81,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq,
                          cell_size, gpu_split, screen);
 
     LJCubicLMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -106,8 +106,8 @@ int ** ljcb_gpu_compute_n(const int ago, const int inum_full,
   return LJCubicLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                             subhi, tag, nspecial, special, eflag, vflag,
                             eatom, vatom, host_start, ilist, jnum, cpu_time,
                             success);
-}
-
+}
+
 void ljcb_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_dsf.cpp b/lib/gpu/lal_lj_dsf.cpp
index 1b8fdeabb0..1efac3e821 100644
--- a/lib/gpu/lal_lj_dsf.cpp
+++ b/lib/gpu/lal_lj_dsf.cpp
@@ -37,22 +37,22 @@ template <class numtyp, class acctyp>
 LJDSFT::~LJDSF() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJDSFT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }
 
 template <class numtyp, class acctyp>
-int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
+int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
-                 double **host_offset, double *host_special_lj,
+                 double **host_offset, double *host_special_lj,
                  const int nlocal, const int nall, const int max_nbors,
-                 const int maxspecial, const double cell_size,
+                 const int maxspecial, const double cell_size,
                  const double gpu_split, FILE *_screen,
                  double **host_cut_ljsq, const double host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e,
-                 const double e_shift, const double f_shift,
+                 const double e_shift, const double f_shift,
                  const double alpha) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@@ -84,11 +84,11 @@ int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cut_ljsq, host_cutsq);
+                         host_cut_ljsq, host_cutsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
@@ -138,7 +138,7 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -149,15 +149,15 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->ans->force, &this->ans->engv, &eflag,
+                          &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
                           &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift,
                           &_alpha, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                     &this->ans->force, &this->ans->engv,
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv,
                      &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
                      &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
                      &this->_threads_per_atom);
diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu
index 5e0cd4aca9..323576fe77 100644
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@@ -31,20 +31,20 @@ texture q_tex;
 
 #define MY_PIS (acctyp)1.77245385090551602729
 
-__kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
+__kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
                        const __global numtyp4 *restrict lj1,
-                       const __global numtyp4 *restrict lj3,
-                       const int lj_types,
-                       const __global numtyp *restrict sp_lj_in,
-                       const __global int *dev_nbor,
-                       const __global int *dev_packed,
+                       const __global numtyp4 *restrict lj3,
+                       const int lj_types,
+                       const __global numtyp *restrict sp_lj_in,
+                       const __global int *dev_nbor,
+                       const __global int *dev_packed,
                        __global acctyp4 *restrict ans,
-                       __global acctyp *restrict engv,
+                       __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
-                       const int nbor_pitch,
+                       const int nbor_pitch,
                        const __global numtyp *restrict q_ ,
                        const numtyp cut_coulsq, const numtyp qqrd2e,
-                       const numtyp e_shift, const numtyp f_shift,
+                       const numtyp e_shift, const numtyp f_shift,
                        const numtyp alpha, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -66,20 +66,20 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (eflag>0) {
-      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
                       qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
       e_coul += (acctyp)2.0*e_self;
     }
@@ -119,7 +119,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
         numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
         erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
+        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
                     rsq*f_shift-factor_coul);
       } else forcecoul = (numtyp)0.0;
@@ -156,19 +156,19 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict lj1_in,
                             const __global numtyp4 *restrict lj3_in,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
                             __global acctyp4 *restrict ans,
-                            __global acctyp *restrict engv,
-                            const int eflag, const int vflag, const int inum,
+                            __global acctyp *restrict engv,
+                            const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
                             const __global numtyp *restrict q_,
                             const numtyp cut_coulsq, const numtyp qqrd2e,
-                            const numtyp e_shift, const numtyp f_shift,
+                            const numtyp e_shift, const numtyp f_shift,
                             const numtyp alpha, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -183,7 +183,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -191,23 +191,23 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
     if (eflag>0) {
-      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
                       qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
       e_coul += (acctyp)2.0*e_self;
     }
@@ -246,7 +246,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
         numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
         erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
+        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
                     rsq*f_shift-factor_coul);
       } else forcecoul = (numtyp)0.0;
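[Editor's sketch.] Both `k_lj_dsf` kernels evaluate the complementary error function inline through the Abramowitz-Stegun 7.1.26 rational polynomial (the `EWALD_P` and `A1`-`A5` terms in the hunks above) rather than calling a library `erfc()` in the inner loop. A standalone single-precision sketch of that evaluation, under the assumed demo name `erfc_as_demo`; the constants are the standard A&S values and are restated here as an assumption:

```cuda
// Polynomial approximation of erfc(alpha*r), Abramowitz & Stegun eq. 7.1.26.
__device__ float erfc_as_demo(float alpha, float r) {
  const float EWALD_P = 0.3275911f;
  const float A1 =  0.254829592f, A2 = -0.284496736f, A3 = 1.421413741f;
  const float A4 = -1.453152027f, A5 =  1.061405429f;
  float erfcd = expf(-alpha * alpha * r * r);        // exp(-(alpha*r)^2)
  float t = 1.0f / (1.0f + EWALD_P * alpha * r);     // Horner variable
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * erfcd;
}
```

Keeping `erfcd` as a separate factor matches the kernels above, which reuse the same exponential in the `forcecoul` expression.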
diff --git a/lib/gpu/lal_lj_dsf.h b/lib/gpu/lal_lj_dsf.h
index 5badf543c4..0195898ca4 100644
--- a/lib/gpu/lal_lj_dsf.h
+++ b/lib/gpu/lal_lj_dsf.h
@@ -30,7 +30,7 @@ class LJDSF : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,11 +40,11 @@ class LJDSF : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
+           const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
-           const double qqrd2e, const double e_shift, const double f_shift,
+           const double qqrd2e, const double e_shift, const double f_shift,
            const double alpha);
 
   /// Clear all host and device data
@@ -69,7 +69,7 @@ class LJDSF : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _qqrd2e;
diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp
index 719a792d7f..25802e7544 100644
--- a/lib/gpu/lal_lj_dsf_ext.cpp
+++ b/lib/gpu/lal_lj_dsf_ext.cpp
@@ -34,7 +34,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                  const double cell_size, int &gpu_mode, FILE *screen,
                  double **host_cut_ljsq, const double host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e,
-                 const double e_shift, const double f_shift,
+                 const double e_shift, const double f_shift,
                  const double alpha) {
   LJDMF.clear();
   gpu_mode=LJDMF.device->gpu_mode();
@@ -85,7 +85,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                     f_shift, alpha);
 
     LJDMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -102,7 +102,7 @@ void ljd_gpu_clear() {
 
 int** ljd_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
-                        double *sublo, double *subhi, tagint *tag, int **nspecial,
+                        double *sublo, double *subhi, tagint *tag, int **nspecial,
                         tagint **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
@@ -112,8 +112,8 @@ int** ljd_gpu_compute_n(const int ago, const int inum_full,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success,
                        host_q, boxlo, prd);
-}
-
+}
+
 void ljd_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp
index 03526bc095..34a4d71c0b 100644
--- a/lib/gpu/lal_lj_expand.cpp
+++ b/lib/gpu/lal_lj_expand.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : ibains@nvidia.com
 ***************************************************************************/
@@ -36,7 +36,7 @@ template <class numtyp, class acctyp>
 LJExpandT::~LJExpand() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJExpandT::bytes_per_atom(const int max_nbors) const {
this->bytes_per_atom_atomic(max_nbors); @@ -76,11 +76,11 @@ int LJExpandT::init(const int ntypes, double **host_cutsq, lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq, host_shift); + host_cutsq, host_shift); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -97,17 +97,17 @@ void LJExpandT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq, host_shift); - + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, host_offset); } @@ -146,7 +146,7 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -155,15 +155,15 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu index 6b79db2323..9281ad27bd 100644 --- a/lib/gpu/lal_lj_expand.cu +++ b/lib/gpu/lal_lj_expand.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ibains@nvidia.com // ***************************************************************************/ @@ -26,15 +26,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_lj_expand(const __global numtyp4 *restrict x_, +__kernel void k_lj_expand(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -52,20 +52,20 @@ __kernel void k_lj_expand(const 
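Each loop() method above sizes the launch so that _threads_per_atom threads cooperate on one atom: a block of BX threads covers BX/t_per_atom atoms, and GX such blocks cover all inum atoms. A minimal sketch of that computation (the function name is illustrative):

#include <cmath>

// Grid size used by the loop() methods: BX threads per block,
// t_per_atom cooperating threads per atom, so each block processes
// BX/t_per_atom atoms (integer division, as in the original).
int pair_grid_size(int inum, int BX, int t_per_atom) {
  return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                    (BX/t_per_atom)));
}

With BX=128, t_per_atom=4, and inum=1000, this yields GX=32 blocks of 32 atoms each.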
__global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -113,15 +113,15 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -136,30 +136,30 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(numtyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h index 0d0ae0b2e6..a732a3a686 100644 --- a/lib/gpu/lal_lj_expand.h +++ b/lib/gpu/lal_lj_expand.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -30,7 +30,7 @@ class LJExpand : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,15 +40,15 @@ class LJExpand : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -71,7 +71,7 @@ class LJExpand : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 
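k_lj_expand and k_lj_expand_fast above evaluate the 12-6 potential at the shifted separation r-delta (pair_style lj/expand). A standalone sketch of that pairwise form, written against eps/sigma/delta rather than the packed lj1/lj3 vectors used on the device (the helper name is illustrative):

#include <cmath>

// pair lj/expand: LJ 12-6 evaluated at the shifted distance rs = r - delta.
// Writes the energy to *e and the force magnitude along r to *fpair
// (F = -dE/dr; r and rs differ only by a constant).
void lj_expand_pair(double r, double eps, double sigma, double delta,
                    double *fpair, double *e) {
  const double rs = r - delta;
  const double sr6 = std::pow(sigma/rs, 6);
  const double sr12 = sr6*sr6;
  *e = 4.0*eps*(sr12 - sr6);
  *fpair = 24.0*eps*(2.0*sr12 - sr6)/rs;
}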
5303149d1f..94a57192b9 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -30,7 +30,7 @@ static LJExpand LJEMF; int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double **shift, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJEMF.clear(); @@ -78,7 +78,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, cell_size, gpu_split,screen); LJEMF.device->world_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,12 +98,12 @@ int lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJEMF.device->world_me(); int gpu_rank=LJEMF.device->gpu_rank(); int procs_per_gpu=LJEMF.device->procs_per_gpu(); - + if (world_me==0) LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, shift); LJEMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -97,11 +97,11 @@ void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJLMF.device->world_me(); int gpu_rank=LJLMF.device->gpu_rank(); int procs_per_gpu=LJLMF.device->procs_per_gpu(); - + if (world_me==0) LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset); LJLMF.device->world_barrier(); - + for (int i=0; i LJGROMACST::~LJGROMACS() { clear(); } - + template int LJGROMACST::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -47,11 +47,11 @@ template int LJGROMACST::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -134,7 +134,7 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,16 +146,16 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - 
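Every *_gpu_init()/*_gpu_reinit() wrapper in this patch follows the bring-up protocol visible in lje_gpu_init above: world rank 0 initializes first (so JIT compilation and the "Done." message happen once), then the ranks sharing each GPU take their turn between barriers. A condensed sketch of that pattern with the device queries abstracted as parameters (all names here are illustrative):

#include <functional>

// Staged initialization: rank 0 of the world first, then one rank per
// GPU at a time, with barriers serializing each stage.
int staged_init(int world_me, int gpu_rank, int procs_per_gpu,
                const std::function<int()> &do_init,
                const std::function<void()> &world_barrier,
                const std::function<void()> &gpu_barrier) {
  int init_ok = 0;
  if (world_me == 0) init_ok = do_init();
  world_barrier();                      // all ranks wait for rank 0
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0) init_ok = do_init();
    gpu_barrier();                      // serialize ranks sharing one GPU
  }
  return init_ok;
}

This keeps per-GPU allocation and kernel compilation from racing when several MPI ranks share a device.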
&eflag, &vflag, &ainum, &nbor_pitch, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu index f20d8634a5..93dc3d9456 100644 --- a/lib/gpu/lal_lj_gromacs.cu +++ b/lib/gpu/lal_lj_gromacs.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -35,8 +35,8 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -59,7 +59,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; @@ -83,7 +83,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, if (rsq lj1[mtype].w) { @@ -91,7 +91,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, t = r - lj3[mtype].z; numtyp fswitch = r*t*t*(ljsw[mtype].x + ljsw[mtype].y*t); force_lj += fswitch; - } + } force = factor_lj*force_lj * r2inv; @@ -149,22 +149,22 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, lj3[tid]=lj3_in[tid]; ljsw[tid]=ljsw_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii lj1[mtype].w) { @@ -196,7 +196,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, t = r - lj3[mtype].z; numtyp fswitch = r*t*t*(ljsw[mtype].x + ljsw[mtype].y*t); force_lj += fswitch; - } + } force = factor_lj*force_lj * r2inv; diff --git a/lib/gpu/lal_lj_gromacs.h b/lib/gpu/lal_lj_gromacs.h index dc949be4a9..1e0f72dafc 100644 --- a/lib/gpu/lal_lj_gromacs.h +++ b/lib/gpu/lal_lj_gromacs.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class LJGROMACS : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class LJGROMACS : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double 
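The force-switch branch visible in k_lj_gromacs above adds a cubic correction beyond the inner cutoff so the force falls smoothly to zero at the outer cutoff; lj3[mtype].z holds cut_inner and ljsw[mtype].x/.y the switch coefficients. A scalar sketch of just that branch (swA/swB stand in for the packed coefficients):

// GROMACS-style force switching: below cut_inner the plain LJ force is
// used; beyond it, r*t*t*(swA + swB*t) with t = r - cut_inner is added.
double switched_force(double force_lj, double r, double cut_inner,
                      double swA, double swB) {
  if (r > cut_inner) {
    const double t = r - cut_inner;
    force_lj += r*t*t*(swA + swB*t);
  }
  return force_lj;
}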
**host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq); /// Clear all host and device data @@ -71,7 +71,7 @@ class LJGROMACS : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_gromacs_ext.cpp b/lib/gpu/lal_lj_gromacs_ext.cpp index b5eb0038b7..53b93bfdff 100644 --- a/lib/gpu/lal_lj_gromacs_ext.cpp +++ b/lib/gpu/lal_lj_gromacs_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,7 +33,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq) { LJGRMMF.clear(); gpu_mode=LJGRMMF.device->gpu_mode(); @@ -59,7 +59,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (world_me==0) LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, + gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); LJGRMMF.device->world_barrier(); @@ -78,11 +78,11 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (gpu_rank==i && world_me!=0) init_ok=LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, + gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); LJGRMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -107,8 +107,8 @@ int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, return LJGRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void ljgrm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -118,7 +118,7 @@ void ljgrm_gpu_compute(const int ago, const int inum_full, const int nall, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } - + double ljgrm_gpu_bytes() { return LJGRMMF.host_memory_usage(); } diff --git a/lib/gpu/lal_mie.cpp b/lib/gpu/lal_mie.cpp index 2ab7cb8d14..1510275047 100644 --- a/lib/gpu/lal_mie.cpp +++ b/lib/gpu/lal_mie.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,17 +33,17 @@ MieT::Mie() : BaseAtomic(), _allocated(false) { } template -MieT::~Mie() { +MieT::~Mie() { clear(); } - + template int MieT::bytes_per_atom(const int max_nbors) const { return 
this->bytes_per_atom_atomic(max_nbors); } template -int MieT::init(const int ntypes, double **host_cutsq, +int MieT::init(const int ntypes, double **host_cutsq, double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, @@ -76,12 +76,12 @@ int MieT::init(const int ntypes, double **host_cutsq, mie1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,mie1,host_write,host_mie1,host_mie2, - host_gamA,host_gamR); + host_gamA,host_gamR); mie3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,mie3,host_write,host_mie3,host_mie4, - host_offset,host_cutsq); - + host_offset,host_cutsq); + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -126,7 +126,7 @@ void MieT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu index 4d718897eb..33018566eb 100644 --- a/lib/gpu/lal_mie.cu +++ b/lib/gpu/lal_mie.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_mie(const __global numtyp4 *restrict x_, +__kernel void k_mie(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mie1, const __global numtyp4 *restrict mie3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_mie(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii class Mie : public BaseAtomic { public: Mie(); - ~Mie(); + ~Mie(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class Mie : public BaseAtomic { double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -67,7 +67,7 @@ class Mie : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp index d7c4187a42..9b03903c4f 100644 --- 
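The Mie kernels above generalize LJ to arbitrary repulsive/attractive exponents (host_gamR/host_gamA, packed into mie1 on the host). For reference, the textbook form of the potential the kernels implement; this sketch works from the raw parameters rather than the device-side packed coefficients:

#include <cmath>

// Mie potential: E(r) = C*eps*[(sigma/r)^gamR - (sigma/r)^gamA], with the
// prefactor C chosen so the well depth equals eps. gamR=12, gamA=6
// recovers C=4 and the standard LJ 12-6 form.
double mie_energy(double r, double eps, double sigma,
                  double gamR, double gamA) {
  const double C = (gamR/(gamR - gamA)) *
                   std::pow(gamR/gamA, gamA/(gamR - gamA));
  return C*eps*(std::pow(sigma/r, gamR) - std::pow(sigma/r, gamA));
}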
a/lib/gpu/lal_mie_ext.cpp +++ b/lib/gpu/lal_mie_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, cell_size, gpu_split, screen); MLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -106,8 +106,8 @@ int ** mie_gpu_compute_n(const int ago, const int inum_full, return MLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void mie_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp index ddf7d843e6..cbdf928863 100644 --- a/lib/gpu/lal_morse.cpp +++ b/lib/gpu/lal_morse.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,20 +33,20 @@ MorseT::Morse() : BaseAtomic(), _allocated(false) { } template -MorseT::~Morse() { +MorseT::~Morse() { clear(); } - + template int MorseT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int MorseT::init(const int ntypes, - double **host_cutsq, double **host_morse1, - double **host_r0, double **host_alpha, - double **host_d0, double **host_offset, +int MorseT::init(const int ntypes, + double **host_cutsq, double **host_morse1, + double **host_r0, double **host_alpha, + double **host_d0, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -125,7 +125,7 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -135,14 +135,14 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu index 2015c71cb2..0a14071d19 100644 --- a/lib/gpu/lal_morse.cu +++ b/lib/gpu/lal_morse.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // 
***************************************************************************/ @@ -26,13 +26,13 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_morse(const __global numtyp4 *restrict x_, +__kernel void k_morse(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mor1, - const __global numtyp2 *restrict mor2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp2 *restrict mor2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, @@ -59,13 +59,13 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; for ( ; nbor0) { numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y; - energy+=e*factor_lj; + energy+=e*factor_lj; } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,15 +111,15 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_morse_fast(const __global numtyp4 *restrict x_, +__kernel void k_morse_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mor1_in, - const __global numtyp2 *restrict mor2_in, + const __global numtyp2 *restrict mor2_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -134,30 +134,30 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_, if (eflag>0) mor2[tid]=mor2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y; - energy+=e*factor_lj; + energy+=e*factor_lj; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_morse.h b/lib/gpu/lal_morse.h index e64852f315..ef80fb4235 100644 --- a/lib/gpu/lal_morse.h +++ b/lib/gpu/lal_morse.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Morse : public BaseAtomic { public: Morse(); - ~Morse(); + ~Morse(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class Morse : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_morse1, double **host_r0, double **host_alpha, double **host_d0, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double 
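k_morse and k_morse_fast above work from dexp = exp(-alpha*(r-r0)); the energy line e = mor2.x*(dexp*dexp - 2.0*dexp) - mor2.y is visible in the hunk. A scalar sketch of the pair term in terms of the raw Morse parameters (packing d0/alpha into mor1/mor2 is the host's job; this helper is illustrative):

#include <cmath>

// Morse pair term with dexp = exp(-alpha*(r - r0)):
//   E = d0*(dexp^2 - 2*dexp) - offset
//   F = -dE/dr = 2*d0*alpha*(dexp^2 - dexp)
void morse_pair(double r, double d0, double alpha, double r0, double offset,
                double *fpair, double *e) {
  const double dexp = std::exp(-alpha*(r - r0));
  *e = d0*(dexp*dexp - 2.0*dexp) - offset;
  *fpair = 2.0*d0*alpha*(dexp*dexp - dexp);
}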
cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -66,7 +66,7 @@ class Morse : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _types; private: diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp index 3994473fd3..0338bc07a8 100644 --- a/lib/gpu/lal_morse_ext.cpp +++ b/lib/gpu/lal_morse_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Morse MORMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int mor_gpu_init(const int ntypes, double **cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { MORMF.clear(); @@ -55,7 +55,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,7 +78,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, cell_size, gpu_split, screen); MORMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -103,8 +103,8 @@ int** mor_gpu_compute_n(const int ago, const int inum_full, return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void mor_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 074eaa842b..0a9933a6c0 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov, penwang@nvidia.com ***************************************************************************/ @@ -32,13 +32,13 @@ int Neighbor::bytes_per_atom(const int max_nbors) const { } bool Neighbor::init(NeighborShared *shared, const int inum, - const int host_inum, const int max_nbors, - const int maxspecial, UCL_Device &devi, - const int gpu_nbor, const int gpu_host, + const int host_inum, const int max_nbors, + const int maxspecial, UCL_Device &devi, + const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, const int warp_size, - const bool time_device, + const bool time_device, const std::string compile_flags) { clear(); @@ -56,10 +56,10 @@ bool 
Neighbor::init(NeighborShared *shared, const int inum, _gpu_host=false; else if (gpu_host==1) _gpu_host=true; - else + else // Not yet implemented assert(0==1); - + if (pre_cut || gpu_nbor==0) _alloc_packed=true; else @@ -71,7 +71,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum, _packed_permissions=UCL_READ_ONLY; bool success=true; - + // Initialize timers for the selected GPU _nbor_time_avail=false; time_nbor.init(*dev); @@ -88,7 +88,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum, _max_atoms=static_cast(static_cast(inum)*1.10); if (_max_atoms==0) _max_atoms=1000; - + _max_host=static_cast(static_cast(host_inum)*1.10); _max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom; @@ -102,21 +102,21 @@ bool Neighbor::init(NeighborShared *shared, const int inum, alloc(success); if (!success) return false; - + if (_use_packing==false) _shared->compile_kernels(devi,gpu_nbor,compile_flags); return success; } -void Neighbor::alloc(bool &success) { +void Neighbor::alloc(bool &success) { dev_nbor.clear(); host_acc.clear(); int nt=_max_atoms+_max_host; - if (_use_packing==false || _gpu_nbor>0) - success=success && + if (_use_packing==false || _gpu_nbor>0) + success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS); - else + else success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); success=success && (host_acc.alloc(nt*2,*dev, @@ -127,14 +127,17 @@ void Neighbor::alloc(bool &success) { dev_packed.clear(); success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev, _packed_permissions)==UCL_SUCCESS); - _c_bytes+=dev_packed.row_bytes(); - } + dev_acc.clear(); + success=success && (dev_acc.alloc(_max_atoms,*dev, + UCL_READ_WRITE)==UCL_SUCCESS); + _c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes(); + } if (_max_host>0) { nbor_host.clear(); dev_numj_host.clear(); host_ilist.clear(); host_jlist.clear(); - + success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE, UCL_READ_WRITE)==UCL_SUCCESS) && success; success=success && (dev_numj_host.alloc(_max_host,*dev, @@ -152,7 +155,7 @@ void Neighbor::alloc(bool &success) { for (int i=0; i<_max_host; i++) { host_jlist[i]=ptr; ptr+=_max_nbors; - } + } _c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes(); } else { // Some OpenCL implementations return errors for NULL pointers as args @@ -176,7 +179,7 @@ void Neighbor::alloc(bool &success) { _allocated=true; } - + void Neighbor::clear() { _gpu_bytes=0.0; _cell_bytes=0.0; @@ -194,6 +197,7 @@ void Neighbor::clear() { host_packed.clear(); host_acc.clear(); + dev_acc.clear(); dev_nbor.clear(); nbor_host.clear(); dev_packed.clear(); @@ -219,13 +223,13 @@ double Neighbor::host_memory_usage() const { host_ilist.row_bytes()+host_jlist.row_bytes(); else return 0; - } else + } else return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+ sizeof(Neighbor); } void Neighbor::get_host(const int inum, int *ilist, int *numj, - int **firstneigh, const int block_size) { + int **firstneigh, const int block_size) { _nbor_time_avail=true; time_nbor.start(); @@ -242,7 +246,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, int dev_count=0; int *h_ptr=host_packed.begin(); _nbor_pitch=inum; - + for (int ii=0; ii acc_view; acc_view.view_offset(inum,dev_nbor,inum*2); ucl_copy(acc_view,host_acc,true); + + UCL_H_Vec host_view; + host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE); + for (int ii=0; ii(ceil(static_cast(inum)*_threads_per_atom/ @@ -294,7 +307,7 @@ void Neighbor::get_host(const int inum, 
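Aside from whitespace, the substantive change in this patch is the new dev_acc vector allocated above: three-body styles need to map an atom index back to its row in the device neighbor list, so get_host now fills a host-side view and copies it up. The loop that fills host_view is truncated in the hunk; one plausible shape, assuming it simply inverts ilist (an illustration, not the patch's verbatim code):

#include <vector>

// Inverse of ilist: host_view[atom index] = neighbor-list row of that
// atom, copied into dev_acc for use by three-body kernels.
std::vector<int> build_acc_map(const int *ilist, int inum, int max_atoms) {
  std::vector<int> host_view(max_atoms, 0);
  for (int ii = 0; ii < inum; ii++)
    host_view[ilist[ii]] = ii;
  return host_view;
}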
int *ilist, int *numj, // inum=nlocal is forced to be true to allow direct indexing of neighbors of // neighbors void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, - int **firstneigh, const int block_size) { + int **firstneigh, const int block_size) { _nbor_time_avail=true; time_nbor.start(); @@ -311,7 +324,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, int dev_count=0; int *h_ptr=host_packed.begin(); _nbor_pitch=inum; - + if (nlist!=inum) host_acc.zero(inum); @@ -322,7 +335,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, host_acc[i+inum]=acc_count; acc_count+=nj; } - + for (int i=0; i(ceil(static_cast(inum)*_threads_per_atom/ @@ -366,7 +379,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, template void Neighbor::resize_max_neighbors(const int maxn, bool &success) { - if (maxn>_max_nbors) { + if (maxn>_max_nbors) { int mn=static_cast(static_cast(maxn)*1.10); mn=(mn/_threads_per_atom+1)*_threads_per_atom; success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS); @@ -377,7 +390,7 @@ void Neighbor::resize_max_neighbors(const int maxn, bool &success) { for (int i=0; i<_max_host; i++) { host_jlist[i]=ptr; ptr+=mn; - } + } _gpu_bytes+=nbor_host.row_bytes(); } else { nbor_host.device.view(dev_nbor); @@ -393,8 +406,8 @@ void Neighbor::resize_max_neighbors(const int maxn, bool &success) { template void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, - const int nall, Atom &atom, - double *sublo, double *subhi, tagint *tag, + const int nall, Atom &atom, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success, int &mn) { _nbor_time_avail=true; @@ -409,7 +422,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, ncell_3d = ncellx * ncelly * ncellz; if (ncell_3d+1>_ncells) { cell_counts.clear(); - + if (_gpu_nbor==2) { if (_ncells>0) delete [] cell_iter; @@ -419,7 +432,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, cell_counts.device.clear(); cell_counts.device.alloc(ncell_3d+1,dev_nbor); } - + _ncells=ncell_3d+1; _cell_bytes=cell_counts.device.row_bytes(); } @@ -445,17 +458,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); - _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt); + _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt); time_transpose.stop(); } - + // If binning on CPU, do this now if (_gpu_nbor==2) { double stime = MPI_Wtime(); int *cell_id=atom.host_cell_id.begin(); int *particle_id=atom.host_particle_id.begin(); - - // Build cell list on CPU + + // Build cell list on CPU cell_counts.host.zero(); double i_cell_size=1.0/_cell_size; @@ -475,12 +488,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, int iz = static_cast(pz*i_cell_size+1); iz = std::max(iz,_cells_in_cutoff); iz = std::min(iz,ncellz-offset_hi); - + int id = ix+iy*ncellx+iz*ncellx*ncelly; cell_id[i] = id; cell_counts[id+1]++; } - + for (int i=nt; i(pz*i_cell_size+1); iz = std::max(iz,0); iz = std::min(iz,ncellz-1); - + int id = ix+iy*ncellx+iz*ncellx*ncelly; cell_id[i] = id; cell_counts[id+1]++; } - + mn=0; for (int i=0; i<_ncells; i++) mn=std::max(mn,cell_counts[i]); @@ 
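When binning runs on the host (_gpu_nbor==2 in build_nbor_list above), each owned atom is mapped to an interior cell and per-cell counts are accumulated for the later prefix sum; ghost atoms get the same treatment with wider clamps. A condensed sketch of the owned-atom pass, assuming positions already shifted by sublo as in the surrounding code (offset_hi is passed through rather than derived, since its definition sits outside the hunk):

#include <algorithm>
#include <vector>

// Host-side binning from Neighbor::build_nbor_list (gpu_nbor==2): assign
// cell ids, clamped to interior cells, and count atoms per cell.
// cell_counts[id+1] is incremented so a prefix sum yields start offsets.
void bin_local_atoms(int nlocal, const double *px, const double *py,
                     const double *pz, double i_cell_size,
                     int ncellx, int ncelly, int ncellz,
                     int cells_in_cutoff, int offset_hi,
                     std::vector<int> &cell_id,
                     std::vector<int> &cell_counts) {
  for (int i = 0; i < nlocal; i++) {
    int ix = static_cast<int>(px[i]*i_cell_size + 1);
    ix = std::min(std::max(ix, cells_in_cutoff), ncellx - offset_hi);
    int iy = static_cast<int>(py[i]*i_cell_size + 1);
    iy = std::min(std::max(iy, cells_in_cutoff), ncelly - offset_hi);
    int iz = static_cast<int>(pz[i]*i_cell_size + 1);
    iz = std::min(std::max(iz, cells_in_cutoff), ncellz - offset_hi);
    const int id = ix + iy*ncellx + iz*ncellx*ncelly;
    cell_id[i] = id;
    cell_counts[id + 1]++;
  }
}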
-531,7 +544,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, ucl_copy(atom.dev_particle_id,atom.host_particle_id,true); time_hybrid2.stop(); _bin_time+=MPI_Wtime()-stime; - } + } time_kernel.start(); @@ -547,7 +560,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const numtyp sublo1=static_cast(sublo[1]); const numtyp sublo2=static_cast(sublo[2]); _shared->k_cell_id.set_size(GX,neigh_block); - _shared->k_cell_id.run(&atom.x, &atom.dev_cell_id, + _shared->k_cell_id.run(&atom.x, &atom.dev_cell_id, &atom.dev_particle_id, &sublo0, &sublo1, &sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz, &nt, &nall, &_cells_in_cutoff); @@ -556,10 +569,10 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, /* calculate cell count */ _shared->k_cell_counts.set_size(GX,neigh_block); - _shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall, + _shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall, &ncell_3d); - } - + } + /* build the neighbor list */ const int cell_block=_block_nbor_build; _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)* @@ -579,7 +592,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, host_offset.view_offset(inum,host_acc,nt-inum); ucl_copy(host_offset,dev_numj_host,nt-inum,true); } - + if (_gpu_nbor!=2) { host_acc.sync(); mn=host_acc[0]; @@ -587,7 +600,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, mn=std::max(mn,host_acc[i]); set_nbor_block_size(mn); - if (mn>_max_nbors) { + if (mn>_max_nbors) { resize_max_neighbors(mn,success); if (!success) return; @@ -599,13 +612,13 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, return; } } - + if (_maxspecial>0) { const int GX2=static_cast(ceil(static_cast (nt*_threads_per_atom)/cell_block)); _shared->k_special.set_size(GX2,cell_block); _shared->k_special.run(&dev_nbor, &nbor_host, &dev_numj_host, - &atom.dev_tag, &dev_nspecial, &dev_special, + &atom.dev_tag, &dev_nspecial, &dev_special, &inum, &nt, &_max_nbors, &_threads_per_atom); } time_kernel.stop(); diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 7653291bbb..05168834c6 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov, penwang@nvidia.com ***************************************************************************/ @@ -28,12 +28,12 @@ class Neighbor { public: Neighbor() : _allocated(false), _use_packing(false), _ncells(0) {} ~Neighbor() { clear(); } - + /// Determine whether neighbor unpacking should be used - /** If false, twice as much memory is reserved to allow unpacking neighbors by + /** If false, twice as much memory is reserved to allow unpacking neighbors by * atom for coalesced access. 
**/ void packing(const bool use_packing) { _use_packing=use_packing; } - + /// Clear any old data and setup for new LAMMPS run /** \param inum Initial number of particles whose neighbors stored on device * \param host_inum Initial number of particles whose nbors copied to host @@ -45,20 +45,20 @@ class Neighbor { * 1 if gpu_nbor is true, and host needs a half nbor list, * 2 if gpu_nbor is true, and host needs a full nbor list * \param pre_cut True if cutoff test will be performed in separate kernel - * than the force kernel + * than the force kernel * \param threads_per_atom Number of threads used per atom for force - * calculation + * calculation * \param compile_flags Flags for JIT compiling **/ bool init(NeighborShared *shared, const int inum, const int host_inum, const int max_nbors, const int maxspecial, UCL_Device &dev, const int gpu_nbor, const int gpu_host, const bool pre_cut, - const int block_cell_2d, const int block_cell_id, + const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, - const int warp_size, const bool time_device, + const int warp_size, const bool time_device, const std::string compile_flags); /// Set the size of the cutoff+skin - inline void cell_size(const double size, const double cutoff) { + inline void cell_size(const double size, const double cutoff) { _cell_size=size; _cutoff=cutoff; if (cutoff>size) @@ -66,7 +66,7 @@ class Neighbor { else _cells_in_cutoff=1; } - + /// Get the size of the cutoff+skin inline double cell_size() const { return _cell_size; } @@ -88,7 +88,7 @@ class Neighbor { * \param host_inum Number of particles whose nbors will be copied to host * \param max_nbor Current max number of neighbors for a particle * \param success False if insufficient memory **/ - inline void resize(const int inum, const int host_inum, const int max_nbor, + inline void resize(const int inum, const int host_inum, const int max_nbor, bool &success) { if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) { _max_atoms=static_cast(static_cast(inum)*1.10); @@ -124,40 +124,40 @@ class Neighbor { /// Free all memory on host and device void clear(); - + /// Bytes per atom used on device int bytes_per_atom(const int max_nbors) const; - + /// Total host memory used by class double host_memory_usage() const; - + /// Returns the type of neighboring: /** - 0 if neighboring will be performed on host * - 1 if neighboring will be performed on device * - 2 if binning on host and neighboring on device **/ inline int gpu_nbor() const { return _gpu_nbor; } - + /// Make a copy of unpacked nbor lists in the packed storage area (for gb) - inline void copy_unpacked(const int inum, const int maxj) + inline void copy_unpacked(const int inum, const int maxj) { ucl_copy(dev_packed,dev_nbor,inum*(maxj+2),true); } - /// Copy neighbor list from host (first time or from a rebuild) - void get_host(const int inum, int *ilist, int *numj, + /// Copy neighbor list from host (first time or from a rebuild) + void get_host(const int inum, int *ilist, int *numj, int **firstneigh, const int block_size); - - /// Copy neighbor list from host for 3-body (first time or from a rebuild) - void get_host3(const int inum, const int nlist, int *ilist, int *numj, + + /// Copy neighbor list from host for 3-body (first time or from a rebuild) + void get_host3(const int inum, const int nlist, int *ilist, int *numj, int **firstneigh, const int block_size); - + /// Return the stride in elements for each nbor row inline int nbor_pitch() const { return 
_nbor_pitch; } - + /// Return the maximum number of atoms that can currently be stored inline int max_atoms() const { return _max_atoms; } /// Return the maximum number of nbors for a particle based on current alloc inline int max_nbors() const { return _max_nbors; } - + /// Return the time spent binning on the CPU for hybrid neighbor builds inline double bin_time() const { return _bin_time; } @@ -171,9 +171,9 @@ class Neighbor { /// Build nbor list on the device template - void build_nbor_list(double **x, const int inum, const int host_inum, + void build_nbor_list(double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success, int &max_nbors); /// Return the number of bytes used on device @@ -184,7 +184,7 @@ class Neighbor { return res; } - + // ------------------------------- Data ------------------------------- /// Device neighbor matrix @@ -199,6 +199,8 @@ class Neighbor { UCL_H_Vec host_packed; /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2) UCL_H_Vec host_acc; + /// Device storage for accessing atom indices from the neighbor list (3-body) + UCL_D_Vec dev_acc; // ----------------- Data for GPU Neighbor Calculation --------------- @@ -219,7 +221,7 @@ class Neighbor { /// Device timers UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose; - + private: NeighborShared *_shared; UCL_Device *dev; @@ -231,14 +233,14 @@ class Neighbor { double _gpu_bytes, _c_bytes, _cell_bytes; void alloc(bool &success); - + int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build; int _ncells, _threads_per_atom, _total_atoms; int _cells_in_cutoff; template inline void resize_max_neighbors(const int maxn, bool &success); - + int _warp_size; inline void set_nbor_block_size(const int mn) { int desired=mn/(2*_warp_size); diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu index 384b88d9de..d005eb9f97 100644 --- a/lib/gpu/lal_neighbor_cpu.cu +++ b/lib/gpu/lal_neighbor_cpu.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,7 +17,7 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_unpack(__global int *dev_nbor, +__kernel void kernel_unpack(__global int *dev_nbor, const __global int *dev_ij, const int inum, const int t_per_atom) { int tid=THREAD_ID_X; @@ -33,7 +33,7 @@ __kernel void kernel_unpack(__global int *dev_nbor, list+=offset; nbor+=fast_mul(ii,t_per_atom-1)+offset; int stride=fast_mul(t_per_atom,inum); - + for ( ; list pos_tex; texture pos_tex; #endif -__kernel void calc_cell_id(const numtyp4 *restrict pos, - unsigned *restrict cell_id, +__kernel void calc_cell_id(const numtyp4 *restrict pos, + unsigned *restrict cell_id, int *restrict particle_id, - numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, - numtyp i_cell_size, int ncellx, int ncelly, - int ncellz, int inum, int nall, + numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, + numtyp i_cell_size, int ncellx, int ncelly, + int ncellz, int inum, int nall, int cells_in_cutoff) { int i = threadIdx.x + blockIdx.x*blockDim.x; @@ -48,11 +48,11 @@ __kernel void calc_cell_id(const numtyp4 *restrict pos, p.x -= boxlo0; p.y -= boxlo1; p.z -= boxlo2; - + 
int ix = int(p.x*i_cell_size+cells_in_cutoff); int iy = int(p.y*i_cell_size+cells_in_cutoff); int iz = int(p.z*i_cell_size+cells_in_cutoff); - + int offset_lo, offset_hi; if (i 0 && idx < nall) { int id_l = cell_id[idx-1]; if (id != id_l) { - for (int i = id_l+1; i <= id; i++) + for (int i = id_l+1; i <= id; i++) cell_counts[i] = idx; } } @@ -114,36 +114,36 @@ __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, #endif #endif -__kernel void transpose(__global tagint *restrict out, - const __global tagint *restrict in, +__kernel void transpose(__global tagint *restrict out, + const __global tagint *restrict in, int columns_in, int rows_in) { - __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; - - unsigned ti=THREAD_ID_X; - unsigned tj=THREAD_ID_Y; - unsigned bi=BLOCK_ID_X; - unsigned bj=BLOCK_ID_Y; - - unsigned i=bi*BLOCK_CELL_2D+ti; - unsigned j=bj*BLOCK_CELL_2D+tj; - if ((i 1e-5) { cnt++; @@ -240,11 +240,11 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, if ((cnt & (t_per_atom-1))==0) neigh_list=neigh_list+stride; } - } + } } } - __syncthreads(); - } // for (k) + __syncthreads(); + } // for (k) } } } @@ -253,11 +253,11 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, } // for (i) } -__kernel void kernel_special(__global int *dev_nbor, - __global int *host_nbor_list, - const __global int *host_numj, +__kernel void kernel_special(__global int *dev_nbor, + __global int *host_nbor_list, + const __global int *host_numj, const __global tagint *restrict tag, - const __global int *restrict nspecial, + const __global int *restrict nspecial, const __global tagint *restrict special, int inum, int nt, int max_nbors, int t_per_atom) { int tid=THREAD_ID_X; @@ -268,7 +268,7 @@ __kernel void kernel_special(__global int *dev_nbor, if (iigpu_bytes(); - + _order=order; _order_m_1=order-1; _order2=_order_m_1*_order; @@ -130,7 +130,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, view.view(rho_coeff[0]+n2lo,numel,*ucl_device); ucl_copy(d_rho_coeff,view,true); _max_bytes+=d_rho_coeff.row_bytes(); - + // Allocate storage for grid _npts_x=nxhi_out-nxlo_out+1; _npts_y=nyhi_out-nylo_out+1; @@ -165,10 +165,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, flag=-3; return 0; } - + error_flag.device.zero(); _max_bytes+=1; - + _cpu_idle_time=0.0; return brick.host.begin(); @@ -180,13 +180,13 @@ void PPPMT::clear(const double cpu_time) { return; _allocated=false; _precompute_done=false; - + brick.clear(); vd_brick.clear(); d_brick_counts.clear(); error_flag.clear(); d_brick_atoms.clear(); - + acc_timers(); device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp, *ans,_max_bytes+_max_an_bytes,cpu_time, @@ -216,7 +216,7 @@ void PPPMT::clear(const double cpu_time) { template void PPPMT::_precompute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, + double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { acc_timers(); @@ -224,7 +224,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, zero_timers(); return; } - + ans->inum(nlocal); if (ago==0) { @@ -250,7 +250,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); int ainum=this->ans->inum(); - + // Boxlo adjusted to be upper left brick and shift for even spline order double shift=0.0; if (_order % 
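The transpose kernel above stages a BLOCK_CELL_2D x BLOCK_CELL_2D tile of the special-bond table in __local memory with one column of padding, so both the global read and the global write coalesce while the pad keeps column-wise tile accesses on distinct shared-memory banks. The same technique in plain CUDA (TILE and the element type are illustrative; tagint may be 32- or 64-bit):

#define TILE 16  // stands in for BLOCK_CELL_2D

// Tiled transpose: read a tile row-wise, sync, write it column-wise.
// The +1 pad column avoids shared-memory bank conflicts.
__global__ void transpose_tiled(long long *out, const long long *in,
                                int cols_in, int rows_in) {
  __shared__ long long tile[TILE][TILE + 1];
  unsigned i = blockIdx.x*TILE + threadIdx.x;
  unsigned j = blockIdx.y*TILE + threadIdx.y;
  if (i < cols_in && j < rows_in)
    tile[threadIdx.y][threadIdx.x] = in[j*cols_in + i];
  __syncthreads();
  i = blockIdx.y*TILE + threadIdx.x;    // transposed block origin
  j = blockIdx.x*TILE + threadIdx.y;
  if (i < rows_in && j < cols_in)
    out[j*rows_in + i] = tile[threadIdx.x][threadIdx.y];
}

// Launched, as in the k_transpose call above, with a 2D grid of
// TILE x TILE blocks covering the columns_in x rows_in input.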
2) @@ -258,7 +258,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv; _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv; _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv; - + _delxinv=delxinv; _delyinv=delyinv; _delzinv=delzinv; @@ -268,7 +268,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, device->zero(d_brick_counts,d_brick_counts.numel()); k_particle_map.set_size(GX,BX); k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum, - &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, + &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, &error_flag); @@ -299,7 +299,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, template int PPPMT::spread(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, + double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { if (_precompute_done==false) { @@ -309,10 +309,10 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, } device->stop_host_timer(); - + if (!success || nlocal==0) return 0; - + double t=MPI_Wtime(); time_out.sync_stop(); _cpu_idle_time+=MPI_Wtime()-t; @@ -325,10 +325,10 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, error_flag.device.zero(); d_brick_atoms.resize(_atom_stride*_max_brick_atoms); _max_bytes+=d_brick_atoms.row_bytes(); - return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, + return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } - + return error_flag[0]; } @@ -340,18 +340,18 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) { time_in.start(); vd_brick.update_device(true); time_in.stop(); - + time_interp.start(); // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); int ainum=this->ans->inum(); - + k_interp.set_size(GX,BX); k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff, &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv, - &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, + &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, &ans->force); time_interp.stop(); @@ -381,7 +381,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) { #endif pppm_program=new UCL_Program(dev); - + #ifdef USE_OPENCL pppm_program->load_string(pppm,flags.c_str()); #else diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index 99fe655dfd..24636b9a93 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -48,17 +48,17 @@ texture q_tex; // Number of pencils per block for charge spread #define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE) -__kernel void particle_map(const __global numtyp4 *restrict x_, +__kernel void particle_map(const __global numtyp4 *restrict x_, const __global numtyp *restrict q_, - const grdtyp delvolinv, const int nlocal, - __global int *restrict counts, - __global grdtyp4 *restrict ans, + const grdtyp delvolinv, const int nlocal, + 
__global int *restrict counts, + __global grdtyp4 *restrict ans, const grdtyp b_lo_x, const grdtyp b_lo_y, const grdtyp b_lo_z, const grdtyp delxinv, const grdtyp delyinv, const grdtyp delzinv, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int atom_stride, - const int max_atoms, + const int max_atoms, __global int *restrict error) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; @@ -76,7 +76,7 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, grdtyp4 delta; fetch(delta.w,ii,q_tex); delta.w*=delvolinv; - + if (delta.w!=(grdtyp)0.0) { delta.x=(p.x-b_lo_x)*delxinv; nx=delta.x; @@ -85,14 +85,14 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, delta.z=(p.z-b_lo_z)*delzinv; nz=delta.z; - if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || + if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) *error=1; else { delta.x=nx+(grdtyp)0.5-delta.x; delta.y=ny+(grdtyp)0.5-delta.y; delta.z=nz+(grdtyp)0.5-delta.z; - + int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx; int old=atom_add(counts+i, 1); if (old>=max_atoms) { @@ -107,9 +107,9 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, /* --------------------------- */ -__kernel void make_rho(const __global int *restrict counts, +__kernel void make_rho(const __global int *restrict counts, const __global grdtyp4 *restrict atoms, - __global grdtyp *restrict brick, + __global grdtyp *restrict brick, const __global grdtyp *restrict _rho_coeff, const int atom_stride, const int npts_x, const int npts_y, const int npts_z, const int nlocal_x, @@ -118,15 +118,15 @@ __kernel void make_rho(const __global int *restrict counts, __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE]; __local grdtyp front[BLOCK_PENCILS][PENCIL_SIZE+PPPM_MAX_SPLINE]; __local grdtyp ans[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; - + int tid=THREAD_ID_X; if (tid -1; k-=order) { @@ -184,14 +184,14 @@ __kernel void make_rho(const __global int *restrict counts, z_pos+=z_stride; } } - + __syncthreads(); if (fid *device; @@ -142,21 +142,21 @@ class PPPM { UCL_Vector brick; UCL_Vector vd_brick; - + // Count of number of atoms assigned to each grid point UCL_D_Vec d_brick_counts; // Atoms assigned to each grid point UCL_D_Vec d_brick_atoms; - + // Error checking for out of bounds atoms UCL_Vector error_flag; - + // Number of grid points in brick (including ghost) int _npts_x, _npts_y, _npts_z, _npts_yx; - + // Number of local grid points in brick int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride; - + // -------------------------- SPLINE DATA ------------------------- UCL_D_Vec d_rho_coeff; int _order, _nlower, _nupper, _order_m_1, _order2; @@ -180,12 +180,12 @@ class PPPM { int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms; double _max_bytes, _max_an_bytes; double _cpu_idle_time; - - grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; + + grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; double _slab_volfactor; int _nx_pppm, _ny_pppm, _nz_pppm; - + void compile_kernels(UCL_Device &dev); void _precompute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp index 6e5a82af5b..7e07d6c87b 100644 --- a/lib/gpu/lal_pppm_ext.cpp +++ b/lib/gpu/lal_pppm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
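particle_map above bins each charged atom into the local PPPM brick: it converts the position to grid coordinates, raises the error flag for out-of-bounds atoms, and appends the atom to a per-cell list behind an atom_add counter (overflow makes spread() resize d_brick_atoms and retry, as shown in lal_pppm.cpp). A scalar model of that logic; the atoms4 indexing at the end is a guess, since the store itself is not visible in the hunk:

// Scalar model of the particle_map kernel. Returns 0 on success, 1 for
// an out-of-bounds atom (kernel sets *error=1), 2 for cell overflow
// (kernel flags it; the host then resizes and retries the spread).
int map_particle(double x, double y, double z, double q_delvolinv,
                 double b_lo_x, double b_lo_y, double b_lo_z,
                 double delxinv, double delyinv, double delzinv,
                 int nlocal_x, int nlocal_y, int nlocal_z,
                 int atom_stride, int max_atoms,
                 int *counts, double *atoms4) {
  if (q_delvolinv == 0.0) return 0;
  const double dx = (x - b_lo_x)*delxinv;
  const double dy = (y - b_lo_y)*delyinv;
  const double dz = (z - b_lo_z)*delzinv;
  const int nx = (int)dx, ny = (int)dy, nz = (int)dz;
  if (dx < 0.0 || dy < 0.0 || dz < 0.0 ||
      nx >= nlocal_x || ny >= nlocal_y || nz >= nlocal_z) return 1;
  const int cell = nz*nlocal_y*nlocal_x + ny*nlocal_x + nx;
  const int old = counts[cell]++;        // atom_add() on the device
  if (old >= max_atoms) return 2;
  double *slot = atoms4 + 4*(old*atom_stride + cell);  // layout assumed
  slot[0] = nx + 0.5 - dx;               // offsets to the cell center
  slot[1] = ny + 0.5 - dy;
  slot[2] = nz + 0.5 - dz;
  slot[3] = q_delvolinv;                 // charge scaled by 1/cell volume
  return 0;
}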
__________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ static PPPM PPPMD; // --------------------------------------------------------------------------- template grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, - FILE *screen, const int order, const int nxlo_out, + FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, grdtyp **rho_coeff, @@ -82,7 +82,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, split,success); pppm.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -91,7 +91,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, } float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, - const int order, const int nxlo_out, + const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, float **rho_coeff, @@ -102,7 +102,7 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); if (split==false && respa==false) - PPPMF.device->set_single_precompute(&PPPMF); + PPPMF.device->set_single_precompute(&PPPMF); return b; } @@ -133,20 +133,20 @@ void pppm_gpu_forces_f(double **f) { } double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, - const int order, const int nxlo_out, + const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, double **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, - const int nz_pppm, const bool split, + const int nz_pppm, const bool split, const bool respa, int &success) { double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, - split,success); + split,success); if (split==false && respa==false) - PPPMD.device->set_double_precompute(&PPPMD); + PPPMD.device->set_double_precompute(&PPPMD); return b; } diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index 24f5b937f7..d5b1b9b6c0 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -49,17 +49,17 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) { out << v.x << " " << v.y; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) { out << v.x << " " << v.y << " " << v.z; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) { out << v.x << " " << v.y; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { out << v.x << " " << v.y << " " << v.z; return out; @@ -115,6 +115,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #define OCL_DEFAULT_VENDOR "generic" #endif +#ifdef INTEL_OCL +#define OCL_DEFAULT_VENDOR "intel" 
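// The #ifdef chain above (continuing below) selects a compile-time default
// OpenCL vendor string: building with -DINTEL_OCL or -DPHI_OCL now picks
// "intel" or "phi" alongside the existing vendor symbols, and the trailing
// #ifndef fallback leaves OCL_DEFAULT_VENDOR as "none" when no flag is set.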
+#endif + +#ifdef PHI_OCL +#define OCL_DEFAULT_VENDOR "phi" +#endif + #ifndef OCL_DEFAULT_VENDOR #define OCL_DEFAULT_VENDOR "none" #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 9dbb3c5944..69a8e61bd4 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -9,16 +9,16 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ //************************************************************************* // Preprocessor Definitions -// +// // Note: It is assumed that constants with the same names are defined with // the same values in all files. -// +// // ARCH // Definition: Architecture number for accelerator // MEM_THREADS @@ -35,22 +35,22 @@ // Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE // PPPM_MAX_SPLINE // Definition: Maximum order for splines in PPPM -// PPPM_BLOCK_1D +// PPPM_BLOCK_1D // Definition: Thread block size for PPPM kernels // Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE -// PPPM_BLOCK_1D%32==0 +// PPPM_BLOCK_1D%32==0 // BLOCK_PAIR // Definition: Default thread block size for pair styles // Restrictions: // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params that can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR -// BLOCK_CELL_2D +// BLOCK_CELL_2D // Definition: Default block size in each dimension for cell list builds // and matrix transpose -// BLOCK_CELL_ID +// BLOCK_CELL_ID // Definition: Default block size for binning atoms in cell list builds -// BLOCK_NBOR_BUILD +// BLOCK_NBOR_BUILD // Definition: Default block size for neighbor list builds // BLOCK_BIO_PAIR // Definition: Default thread block size for "bio" pair styles @@ -78,10 +78,10 @@ #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ -#define __global +#define __global #define restrict __restrict__ #define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ +#define ucl_inline static __inline__ __device__ #ifdef __CUDA_ARCH__ #define ARCH __CUDA_ARCH__ diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index cbf50fab7d..9513f5a633 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -37,18 +37,18 @@ RESquaredT::RESquared() : BaseEllipsoid(), } template -RESquaredT::~RESquared() { +RESquaredT::~RESquared() { clear(); } - + template int RESquaredT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom(max_nbors); } template -int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, +int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, double **host_epsilon, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, @@ -81,23 +81,23 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write, - host_sigma,host_epsilon); + host_sigma,host_epsilon); this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
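// The #define block in lal_preprocessor.h above is what lets one kernel
// source build for both backends: under CUDA the OpenCL spellings are
// remapped (__kernel to extern "C" __global__, __local to __shared__,
// atom_add to atomicAdd, __global to nothing). A minimal kernel written
// against the portable spellings -- illustrative only, and assuming this
// header's macros (including GLOBAL_ID_X for the global thread index):

__kernel void k_scale(__global float *restrict v, const float s,
                      const int n) {
  int i = GLOBAL_ID_X;        // portable global thread id
  if (i < n)
    v[i] *= s;                // identical source for CUDA and OpenCL builds
}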
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write, - host_cutsq,h_form); + host_cutsq,h_form); lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq,h_form); + host_cutsq,h_form); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); - + // Allocate, cast and asynchronous memcpy of constant data // Copy data for bonded interactions special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -127,7 +127,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, } view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); ucl_copy(well,view4,false); - + _allocated=true; this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+ @@ -144,7 +144,7 @@ void RESquaredT::clear() { UCL_H_Vec err_flag(1,*(this->ucl_device)); ucl_copy(err_flag,dev_error,false); if (err_flag[0] == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; err_flag.clear(); _allocated=false; @@ -158,7 +158,7 @@ void RESquaredT::clear() { shape.clear(); well.clear(); special_lj.clear(); - + this->clear_base(); } @@ -184,7 +184,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -197,34 +197,34 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { (BX/this->_threads_per_atom))); NGX=static_cast(ceil(static_cast(this->_last_ellipse)/BX)); this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE, - ELLIPSE_ELLIPSE,_shared_types,_lj_types); + ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, - &this->nbor->dev_nbor, &stride, + &this->sigma_epsilon, &this->_lj_types, + &this->nbor->dev_nbor, &stride, &this->ans->force,&ainum, &this->ans->engv, - &this->dev_error, &eflag, &vflag, + &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); // ------------ ELLIPSE_SPHERE --------------- this->time_nbor2.start(); this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE, - ELLIPSE_SPHERE,_shared_types,_lj_types); + ELLIPSE_SPHERE,_shared_types,_lj_types); this->time_nbor2.stop(); this->time_ellipsoid2.start(); this->k_ellipsoid_sphere.set_size(GX,BX); - this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, + this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, + &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, &this->ans->force,&ainum, - &this->ans->engv, &this->dev_error, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid2.stop(); @@ -245,18 +245,18 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { NGX=static_cast(ceil(static_cast(this->ans->inum()- this->_last_ellipse)/BX)); 
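// Launch sizing in loop() above follows the library-wide convention: BX
// threads per block, _threads_per_atom cooperating threads per atom, hence
// each block covers BX/t_per_atom atoms. The same computation as a
// standalone helper (hypothetical name):

inline int grid_size(int inum, int block_size, int t_per_atom) {
  int atoms_per_block = block_size / t_per_atom;         // both powers of two
  return (inum + atoms_per_block - 1) / atoms_per_block; // ceil division
}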
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(), - SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); + SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); this->time_nbor3.stop(); this->time_ellipsoid3.start(); this->k_sphere_ellipsoid.set_size(GX,BX); this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->special_lj, + &this->shape, &this->well, &this->special_lj, &this->sigma_epsilon, &this->_lj_types, - &this->nbor->dev_nbor, &stride, + &this->nbor->dev_nbor, &stride, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, &vflag, - &this->_last_ellipse, &ainum, + &this->_last_ellipse, &ainum, &this->_threads_per_atom); this->time_ellipsoid3.stop(); } else { @@ -266,13 +266,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.zero(); - this->time_ellipsoid.zero(); + this->time_ellipsoid.zero(); this->time_nbor2.zero(); this->time_ellipsoid2.zero(); this->time_nbor3.zero(); this->time_ellipsoid3.zero(); } - + // ------------ LJ --------------- this->time_lj.start(); if (this->_last_ellipseans->inum()) { @@ -287,7 +287,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { } else { this->k_lj.set_size(GX,BX); this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, - &this->_lj_types, &this->special_lj, &stride, + &this->_lj_types, &this->special_lj, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); @@ -300,15 +300,15 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { NGX=static_cast(ceil(static_cast(this->ans->inum())/BX)); this->time_nbor1.start(); this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, - ELLIPSE_ELLIPSE,_shared_types,_lj_types); + ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->special_lj, + &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, &this->ans->force, - &ainum, &this->ans->engv, &this->dev_error, + &ainum, &this->ans->engv, &this->dev_error, &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index 3a65ce14ce..e238734074 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -34,31 +34,31 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) __kernel void k_resquared(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict splj, - const __global numtyp2 *restrict sig_eps, - const int ntypes, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict splj, + const __global numtyp2 *restrict sig_eps, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - const int astride, + const int astride, __global acctyp *restrict engv, - __global int *restrict err_flag, + __global int *restrict err_flag, const int 
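// Dispatch summary for RESquaredT::loop() above: atoms are evidently ordered
// so ellipsoids occupy indices [0,_last_ellipse); neighbors are packed once
// per pairing class and a dedicated kernel handles each case -- k_ellipsoid
// for ELLIPSE_ELLIPSE, k_ellipsoid_sphere for ELLIPSE_SPHERE,
// k_sphere_ellipsoid for SPHERE_ELLIPSE, and the plain k_lj/k_lj_fast path
// for the remaining sphere-sphere pairs -- so the expensive quaternion math
// only runs where at least one particle is aspherical.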
eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); acctyp energy=(acctyp)0; acctyp4 f; @@ -79,7 +79,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; @@ -91,14 +91,14 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa numtyp4 ishape; - + ishape=shape[itype]; numtyp4 ishape2; ishape2.x=ishape.x*ishape.x; ishape2.y=ishape.y*ishape.y; ishape2.z=ishape.z*ishape.z; numtyp ilshape = ishape.x*ishape.y*ishape.z; - + { numtyp aTs[9]; // A1'*S1^2 gpu_quat_to_mat_trans(q,i,a1); @@ -148,7 +148,7 @@ numtyp a2[9]; // Rotation matrix (lab->body) numtyp gamma2[9]; // A'*S^2*A numtyp4 jshape; - + jshape=shape[jtype]; numtyp4 jshape2; jshape2.x=jshape.x*jshape.x; @@ -189,7 +189,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2; H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2; dH=gpu_det3(H12); - + numtyp sigma1p2, sigma2p2, lambda, nu; sigma1p2 = sigma1*sigma1; sigma2p2 = sigma2*sigma2; @@ -299,7 +299,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+(numtyp)3.0*sec); dspu = ucl_recip(h12)-hsec+stemp; pbsu = (numtyp)3.0*sigma*hsec; - + numtyp dspr, pbsr; stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ @@ -310,7 +310,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); dspr = (numtyp)7.0/h12-hsec+stemp; pbsr = b_alpha*sigma*hsec; - + numtyp dH12[9]; numtyp dUa, dUr, deta, dchi, ddH, dh12; numtyp dsigma1, dsigma2; diff --git a/lib/gpu/lal_re_squared.h b/lib/gpu/lal_re_squared.h index c7441ed83e..8dc137d829 100644 --- a/lib/gpu/lal_re_squared.h +++ b/lib/gpu/lal_re_squared.h @@ -25,14 +25,14 @@ template class RESquared : public BaseEllipsoid { public: RESquared(); - ~RESquared(); + ~RESquared(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device + * \param gpu_split fraction of particles handled by device * \return false if there is not sufficient memory or device init problem - * + * * Returns: * - 0 if successful * - -1 if fix gpu not found @@ -41,7 +41,7 @@ class RESquared : public BaseEllipsoid { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_shape, double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, - int **h_form, double **host_lj1, double **host_lj2, + int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -50,7 +50,7 @@ /// Clear all host and
device data /** \note This is called at the beginning of the init() routine **/ void clear(); - + /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; @@ -59,8 +59,8 @@ class RESquared : public BaseEllipsoid { /// Device Error Flag - Set if a bad matrix inversion occurs UCL_D_Vec dev_error; - - // --------------------------- TYPE DATA -------------------------- + + // --------------------------- TYPE DATA -------------------------- /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form UCL_D_Vec lj1; @@ -70,12 +70,12 @@ class RESquared : public BaseEllipsoid { UCL_D_Vec sigma_epsilon; /// special lj 0-4 UCL_D_Vec special_lj; - + /// If atom type constants fit in shared memory, use fast kernels bool _shared_types; int _lj_types; - - // --------------------------- ATOM DATA -------------------------- + + // --------------------------- ATOM DATA -------------------------- /// Aspherical Const Data for Atoms UCL_D_Vec shape, well; diff --git a/lib/gpu/lal_re_squared_ext.cpp b/lib/gpu/lal_re_squared_ext.cpp index e1d8fffb8f..b719dfe05f 100644 --- a/lib/gpu/lal_re_squared_ext.cpp +++ b/lib/gpu/lal_re_squared_ext.cpp @@ -28,8 +28,8 @@ static RESquared REMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, - double **sigma, double **epsilon, - int **form, double **host_lj1, double **host_lj2, + double **sigma, double **epsilon, + int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -56,7 +56,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, + init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); @@ -64,7 +64,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, REMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ void re_gpu_clear() { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); @@ -114,8 +114,8 @@ int** re_gpu_compute_n(const int ago, const int inum_full, const int nall, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { - return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, - tag, nspecial, special, eflag, vflag, eatom, vatom, + return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index 
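// The re_gpu_init sequence above is the library's staged startup pattern:
// world rank 0 runs init() first (compiling the kernels once), all ranks
// meet at world_barrier(), then the remaining ranks sharing each GPU take
// turns between gpu_barrier() calls -- presumably so device setup and
// kernel compilation are serialized instead of contended.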
4742e5bd8e..d69dae2461 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -129,32 +129,32 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict shape, const __global numtyp4 *restrict well, - const __global numtyp *restrict splj, + const __global numtyp *restrict splj, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - const int astride, - __global acctyp *restrict engv, - __global int *restrict err_flag, - const int eflag, const int vflag, - const int inum, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); @@ -177,7 +177,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; @@ -223,7 +223,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, sigma = sig_eps[mtype].x; epsilon = sig_eps[mtype].y*factor_lj; - numtyp aTs[9]; + numtyp aTs[9]; numtyp4 scorrect; numtyp half_sigma=sigma*(numtyp)0.5; scorrect.x = ishape.x+half_sigma; @@ -260,7 +260,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0; Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; Ua = epsilon*Ua*sigmap3*solv_f_a; - + stemp = h12/cr60; Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ (numtyp)60.0; @@ -290,7 +290,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); numtyp dspu = ucl_recip(h12)-hsec+stemp; numtyp pbsu = (numtyp)3.0*sigma*hsec; - + stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ ucl_recip(ishape.z*cr60+h12)+ @@ -298,7 +298,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); numtyp dspr = (numtyp)7.0/h12-hsec+stemp; numtyp pbsr = b_alpha*sigma*hsec; - + #pragma unroll for (int i=0; i<3; i++) { numtyp u[3]; @@ -334,7 +334,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } } - + // torque on i numtyp fwae[3]; gpu_row_times3(fourw,aTe,fwae); @@ -384,33 +384,33 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict q, + const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, const __global numtyp4 *restrict well, const __global numtyp *restrict splj, const __global numtyp2 *restrict sig_eps, - 
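// On the solvation prefactors above: ucl_atan((numtyp)1.0) is pi/4, so
// 16*atan(1) is 4*pi and the constants reduce to
//   solv_f_a = 3/(-144*pi)   and   solv_f_r = 3/(8100*pi).
// Deriving pi from atan keeps the value at native numtyp precision in both
// single- and double-precision builds instead of hard-coding a literal.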
const int ntypes, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); @@ -429,7 +429,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj, n_stride,nbor_end,nbor); - + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; @@ -445,7 +445,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp a[9]; // Rotation matrix (lab->body) numtyp aTe[9]; // A'*E numtyp4 ishape; - + ishape=shape[itype]; gpu_quat_to_mat_trans(q,i,a); gpu_transpose_times_diag3(a,well[itype],aTe); @@ -467,7 +467,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, sigma = sig_eps[mtype].x; epsilon = sig_eps[mtype].y*factor_lj; - numtyp aTs[9]; + numtyp aTs[9]; numtyp4 scorrect; numtyp half_sigma=sigma * (numtyp)0.5; scorrect.x = ishape.x+half_sigma; @@ -477,7 +477,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5; scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5; gpu_transpose_times_diag3(a,scorrect,aTs); - + // energy numtyp gamma[9], s[3]; @@ -505,7 +505,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp ilshape=ishape.x*ishape.y*ishape.z; Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; Ua = epsilon*Ua*sigmap3*solv_f_a; - + stemp = h12/cr60; Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ (numtyp)60.0; @@ -535,7 +535,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); numtyp dspu = ucl_recip(h12)-hsec+stemp; numtyp pbsu = (numtyp)3.0*sigma*hsec; - + stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ ucl_recip(ishape.z*cr60+h12)+ @@ -543,7 +543,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); numtyp dspr = (numtyp)7.0/h12-hsec+stemp; numtyp pbsr = b_alpha*sigma*hsec; - + #pragma unroll for (int i=0; i<3; i++) { numtyp u[3]; @@ -584,15 +584,15 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict gum, - const int stride, - const __global int *dev_ij, +__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict gum, + const int stride, + const 
__global int *dev_ij, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, const int start, const int inum, const int t_per_atom) { @@ -601,10 +601,10 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=gum[0]; - sp_lj[1]=gum[1]; - sp_lj[2]=gum[2]; - sp_lj[3]=gum[3]; + sp_lj[0]=gum[0]; + sp_lj[1]=gum[1]; + sp_lj[2]=gum[2]; + sp_lj[3]=gum[3]; acctyp energy=(acctyp)0; acctyp4 f; @@ -614,20 +614,20 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); - energy+=factor_lj*(e-lj3[ii].z); + energy+=factor_lj*(e-lj3[ii].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -671,33 +671,33 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict gum, +__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict gum, const int stride, const __global int *dev_ij, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; - __local numtyp sp_lj[4]; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (tid<4) - sp_lj[tid]=gum[tid]; + sp_lj[tid]=gum[tid]; if (tid0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; @@ -706,9 +706,9 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_soft.cpp b/lib/gpu/lal_soft.cpp index c206a997a9..727b112ea5 100644 --- a/lib/gpu/lal_soft.cpp +++ b/lib/gpu/lal_soft.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ SoftT::Soft() : BaseAtomic(), _allocated(false) { } template -SoftT::~Soft() { +SoftT::~Soft() { clear(); } - + template int SoftT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -74,7 +74,7 @@ int SoftT::init(const int ntypes, double **host_cutsq, coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_prefactor, - host_cut,host_cutsq); + host_cut,host_cutsq); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -89,16 +89,16 @@ int SoftT::init(const int ntypes, double **host_cutsq, template void SoftT::reinit(const int ntypes, double 
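// The *_fast kernels above stage per-type coefficients in __local (shared)
// memory: the first MAX_SHARED_TYPES*MAX_SHARED_TYPES threads each copy one
// entry, every thread synchronizes, and the pair loop then reads
// coefficients without touching global memory. The pattern in isolation --
// a sketch assuming the library's portable macros and types:

__kernel void k_pair_fast_sketch(const __global numtyp4 *restrict coeff_in,
                                 const int ntypes_sq) {
  __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  int tid = THREAD_ID_X;
  if (tid < ntypes_sq)           // valid: MAX_SHARED_TYPES^2 <= BLOCK_PAIR
    coeff[tid] = coeff_in[tid];  // one cooperative copy per thread
  __syncthreads();               // all copies visible before any reads
  // ... pair loop indexes coeff[itype*MAX_SHARED_TYPES+jtype] from here ...
}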
**host_cutsq, double **host_prefactor, double **host_cut) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor, - host_cut,host_cutsq); + host_cut,host_cutsq); } template @@ -134,7 +134,7 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu index b7c32b6879..831b986725 100644 --- a/lib/gpu/lal_soft.cu +++ b/lib/gpu/lal_soft.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -26,7 +26,7 @@ texture pos_tex; #define MY_PI (acctyp)3.14159265358979323846 -__kernel void k_soft(const __global numtyp4 *restrict x_, +__kernel void k_soft(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, const int lj_types, const __global numtyp *restrict sp_lj_in, @@ -51,20 +51,20 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0) force = factor_lj * coeff[mtype].x * sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r); else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,7 +111,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_soft_fast(const __global numtyp4 *restrict x_, +__kernel void k_soft_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, @@ -122,7 +122,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -130,7 +130,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, if (tid (numtyp)0.0) force = factor_lj * coeff[mtype].x * sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r); else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_soft.h b/lib/gpu/lal_soft.h index 7fa529c4f5..e72673248c 100644 --- a/lib/gpu/lal_soft.h +++ b/lib/gpu/lal_soft.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Soft : public BaseAtomic { public: Soft(); - ~Soft(); + ~Soft(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param 
gpu_split fraction of particles handled by device - * + * Returns: * - 0 if successful * - -1 if fix gpu not found @@ -40,14 +40,14 @@ class Soft : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Soft : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 9591923965..d3b3fa2598 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, cell_size, gpu_split, screen); SLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,16 +96,16 @@ void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor, int world_me=SLMF.device->world_me(); int gpu_rank=SLMF.device->gpu_rank(); int procs_per_gpu=SLMF.device->procs_per_gpu(); - + if (world_me==0) SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut); - + SLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -124,8 +124,8 @@ int ** soft_gpu_compute_n(const int ago, const int inum_full, return SLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void soft_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 1f68616b0e..3492d7030e 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -33,10 +33,10 @@ SWT::SW() : BaseThree(), _allocated(false) { } template -SWT::~SW() { +SWT::~SW() { clear(); } - + template int SWT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } @@ -45,7 +45,7 @@ int SWT::bytes_per_atom(const int max_nbors) const { template int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *_screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, + int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* epsilon, const double* sigma, const double* lambda, const double* gamma, const double* costheta, const double* biga, @@ -76,41 +76,41 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_ UCL_WRITE_ONLY); for (int i=0;
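// The k_soft kernels above evaluate what appears to be the standard soft
// potential with A = coeff.x (prefactor) and rc = coeff.y (cutoff):
//   E(r) = A*(1 + cos(pi*r/rc)),   F(r) = A*(pi/rc)*sin(pi*r/rc),
// with the kernel keeping F(r)/r so it can scale delx/dely/delz directly.
// A plain C++ restatement for checking (standalone, illustrative):

#include <cmath>

inline double soft_energy(double r, double A, double rc) {
  return A * (1.0 + std::cos(M_PI * r / rc));
}

inline double soft_force_over_r(double r, double A, double rc) {
  // same grouping as the kernel: A * sin(arg) * pi/rc * (1/r)
  return A * std::sin(M_PI * r / rc) * (M_PI / rc) / r;
}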
iucl_device),UCL_READ_ONLY); - + for (int i=0; i(epsilon[i]); dview[i].y=static_cast(sigma[i]); dview[i].z=static_cast(lambda[i]); dview[i].w=static_cast(gamma[i]); } - + ucl_copy(sw1,dview,false); sw1_tex.get_texture(*(this->pair_program),"sw1_tex"); sw1_tex.bind_float(sw1,4); sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - + for (int i=0; i(biga[i]); dview[i].y=static_cast(bigb[i]); dview[i].z=static_cast(powerp[i]); dview[i].w=static_cast(powerq[i]); } - + ucl_copy(sw2,dview,false); sw2_tex.get_texture(*(this->pair_program),"sw2_tex"); sw2_tex.bind_float(sw2,4); sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - + for (int i=0; i(costheta[i]); dview[i].w=(numtyp)0; } - + ucl_copy(sw3,dview,false); sw3_tex.get_texture(*(this->pair_program),"sw3_tex"); sw3_tex.bind_float(sw3,4); @@ -192,31 +192,32 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); - // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa == 1 - // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 + // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 + // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); + this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, + this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/(KTHREADS*JTHREADS)))); + (BX/(KTHREADS*JTHREADS)))); this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, + this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -227,21 +228,24 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, + &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, + &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, + &end_ans->force, &end_ans->engv, 
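// Context for the extra arguments above: dev_acc evidently maps an atom
// index to the slot of that atom's own neighbor list in dev_nbor. The old
// fixed offset j+nbor_pitch only works when atom j's list sits in slot j,
// which holds for device-built lists (_gpu_nbor) but not for host-built
// ones, where slots follow ilist order -- hence the new dev_acc and
// _gpu_nbor arguments to the three-body "end" kernels (see k_sw_three_end
// below, which walks the neighbors of each neighbor j).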
&eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } + this->time_pair.stop(); } diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 1e358fb6f7..46330c59e4 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -138,16 +138,16 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int n_stride; int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -166,9 +166,9 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; itype=map[itype]; - + for ( ; nbor0) - energy+=(pre_sw_c5*rp - pre_sw_c6*rq) * expsrainv; + if (eflag>0) + energy+=(pre_sw_c5*rp - pre_sw_c6*rq) * expsrainv; if (vflag>0) { virial[0] += delx*delx*force; @@ -329,29 +328,28 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, fjz = delr1z*(frad1+csfac1)-delr2z*facang12; \ } -__kernel void k_sw_three_center(const __global numtyp4 *restrict x_, +__kernel void k_sw_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict sw1, const __global numtyp4 *restrict sw2, const __global numtyp4 *restrict sw3, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { __local int tpa_sq, n_stride; tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma; numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -360,7 +358,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii sw3_ijparam.y) continue; numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma=sw1_ijparam.y; - sw_gamma=sw1_ijparam.w; sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; sw_cut_ij=sw3_ijparam.x; @@ -419,15 +415,11 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z; if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y; numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex); - sw_sigma=sw1_ikparam.y; - sw_gamma=sw1_ikparam.w; sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma; sw_cut_ik=sw3_ikparam.x; int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype]; numtyp4 
sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex); - sw_epsilon=sw1_ijkparam.x; - sw_lambda=sw1_ijkparam.z; sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon; sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk; numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex); @@ -439,7 +431,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, f.x -= fjx + fkx; f.y -= fjy + fky; f.z -= fjz + fkz; - } + } } } // for nbor @@ -458,29 +450,29 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_sw_three_end(const __global numtyp4 *restrict x_, +__kernel void k_sw_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict sw1, const __global numtyp4 *restrict sw2, const __global numtyp4 *restrict sw3, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const int t_per_atom) { + const __global int * dev_nbor, + const __global int * dev_packed, + const __global int * dev_acc, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const int t_per_atom, const int gpu_nbor) { __local int tpa_sq, n_stride; tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma; numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -489,7 +481,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii sw3_ijparam.y) continue; numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma=sw1_ijparam.y; - sw_gamma=sw1_ijparam.w; sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; sw_cut_ij=sw3_ijparam.x; - int nbor_k=j+nbor_pitch; - int numk=dev_nbor[nbor_k]; + int nbor_k,numk; if (dev_nbor==dev_packed) { + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_acc[j]+nbor_pitch; + numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { + nbor_k=dev_acc[j]+nbor_pitch; + numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k]; k_end=nbor_k+numk; @@ -559,15 +553,11 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, if (rsq2 < sw3_ikparam.y) { numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex); - sw_sigma=sw1_ikparam.y; - sw_gamma=sw1_ikparam.w; sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma; sw_cut_ik=sw3_ikparam.x; int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; //jik numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex); - sw_epsilon=sw1_ijkparam.x; - sw_lambda=sw1_ijkparam.z; sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon; sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk; numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex); @@ -605,22 +595,22 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, const __global int *restrict map, const __global int *restrict 
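// The branch above, reduced to its decision (sketch; j_list_row is a
// hypothetical helper mirroring the kernel logic):

inline int j_list_row(bool lists_alias, bool gpu_nbor,
                      const int *dev_acc, int j) {
  // device-built lists keep neighbor j's own list in row j; in every other
  // case the row has to be found through the accumulator map
  return (lists_alias && gpu_nbor) ? j : dev_acc[j];
}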
elem2param, const int nelements, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const int t_per_atom) { + const __global int * dev_nbor, + const __global int * dev_packed, + const __global int * dev_acc, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const int t_per_atom, const int gpu_nbor) { __local int tpa_sq, n_stride; tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma; numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -629,7 +619,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii sw3_ijparam.y) continue; numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma=sw1_ijparam.y; - sw_gamma=sw1_ijparam.w; sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; sw_cut_ij=sw3_ijparam.x; - - int nbor_k=j+nbor_pitch; - int numk=dev_nbor[nbor_k]; + + int nbor_k,numk; if (dev_nbor==dev_packed) { + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_acc[j]+nbor_pitch; + numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { + nbor_k=dev_acc[j]+nbor_pitch; + numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k]; k_end=nbor_k+numk; @@ -699,15 +691,11 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, if (rsq2 < sw3_ikparam.y) { numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex); - sw_sigma=sw1_ikparam.y; - sw_gamma=sw1_ikparam.w; sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma; sw_cut_ik=sw3_ikparam.x; int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; // jik numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex); - sw_epsilon=sw1_ijkparam.x; - sw_lambda=sw1_ijkparam.z; sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon; sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk; numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex); diff --git a/lib/gpu/lal_sw.h b/lib/gpu/lal_sw.h index 66b36a90b0..3546f02eb7 100644 --- a/lib/gpu/lal_sw.h +++ b/lib/gpu/lal_sw.h @@ -24,28 +24,28 @@ template class SW : public BaseThree { public: SW(); - ~SW(); + ~SW(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successful * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, + int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, + int* host_map, const int nelements, int*** host_elem2param, const
int nparams, const double* epsilon, const double* sigma, const double* lambda, const double* gamma, const double* costheta, const double* biga, const double* bigb, const double* powerp, const double* powerq, const double* cut, const double* cutsq); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -61,7 +61,7 @@ class SW : public BaseThree { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; /// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index e2d1b5e4dd..4959650c90 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -27,14 +27,14 @@ static SW SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, +int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const double cell_size, int &gpu_mode, FILE *screen, int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* sw_epsilon, const double* sw_sigma, const double* sw_lambda, const double* sw_gamma, const double* sw_costheta, const double* sw_biga, const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, + const double* sw_powerq, const double* sw_cut, const double* sw_cutsq) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); @@ -46,7 +46,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ int procs_per_gpu=SWMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; SWMF.device->init_message(screen,"sw/gpu",first_gpu,last_gpu); @@ -64,7 +64,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ if (world_me==0) init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, + sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->world_barrier(); @@ -83,12 +83,12 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ if (gpu_rank==i && world_me!=0) init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, + sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, + sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -113,12 +113,12 @@ int ** sw_gpu_compute_n(const int ago, const int inum_full, return SWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - -void sw_gpu_compute(const int ago, const int nlocal, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool 
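// On the early "return -8" above: sw/gpu rejects any host/device force
// split (gpu_split must be 1.0). The source says only "disable host/device
// split for now"; a plausible reason is that the three-body terms need the
// complete device-side neighbor data, which a split would divide.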
vatom, +} + +void sw_gpu_compute(const int ago, const int nlocal, const int nall, + const int nlist, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { SWMF.compute(ago,nlocal,nall,nlist,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); diff --git a/lib/gpu/lal_table.cpp b/lib/gpu/lal_table.cpp index c99bf85815..0de59c84b2 100644 --- a/lib/gpu/lal_table.cpp +++ b/lib/gpu/lal_table.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -34,35 +34,35 @@ using namespace LAMMPS_AL; extern Device device; template -TableT::Table() : BaseAtomic(), +TableT::Table() : BaseAtomic(), _allocated(false), _compiled_styles(false) { } template -TableT::~Table() { +TableT::~Table() { clear(); } - + template int TableT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int TableT::init(const int ntypes, +int TableT::init(const int ntypes, double **host_cutsq, double ***host_table_coeffs, double **host_table_data, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, + const double gpu_split, FILE *_screen, int tabstyle, int ntables, int tablength) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,table,"k_table"); if (success!=0) return success; - + k_pair_linear.set_function(*(this->pair_program),"k_table_linear"); k_pair_linear_fast.set_function(*(this->pair_program),"k_table_linear_fast"); k_pair_spline.set_function(*(this->pair_program),"k_table_spline"); @@ -80,38 +80,38 @@ int TableT::init(const int ntypes, shared_types=true; } _lj_types=lj_types; - + _tabstyle = tabstyle; _ntables = ntables; if (tabstyle != BITMAP) _tablength = tablength; else _tablength = 1 << tablength; - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write_int(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; iucl_device),UCL_READ_ONLY); nshiftbits.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); nmask.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int ix=1; ix host_write(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -151,7 +151,7 @@ int TableT::init(const int ntypes, host_write2[n*_tablength+k].z = host_table_data[n][6*k+2]; // f host_write2[n*_tablength+k].w = (numtyp)0; } - } + } } ucl_copy(coeff3,host_write2,false); @@ -166,21 +166,21 @@ int TableT::init(const int ntypes, for (int n=0; n<_ntables; n++) { if (tabstyle == LINEAR) { for (int k=0; k<_tablength-1; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // de host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // df host_write2[n*_tablength+k].w = (numtyp)0; } } else if (tabstyle == SPLINE) { for (int k=0; k<_tablength; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // e2 host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; 
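// Table setup above registers a dedicated kernel pair per style (linear,
// spline, bitmap) in addition to the base lookup kernels, and normalizes
// the table length: for BITMAP the incoming tablength is an exponent, so
// _tablength = 1 << tablength, while the other styles use it as given.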
// f2 host_write2[n*_tablength+k].w = (numtyp)0; } } else if (tabstyle == BITMAP) { for (int k=0; k<_tablength; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // de host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // df host_write2[n*_tablength+k].w = host_table_data[n][6*k+5]; // drsq @@ -188,12 +188,12 @@ int TableT::init(const int ntypes, } } ucl_copy(coeff4,host_write2,false); - + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); - + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -220,7 +220,7 @@ void TableT::clear() { coeff3.clear(); coeff4.clear(); sp_lj.clear(); - + if (_compiled_styles) { k_pair_linear_fast.clear(); k_pair_linear.clear(); @@ -230,7 +230,7 @@ void TableT::clear() { k_pair_bitmap.clear(); _compiled_styles=false; } - + this->clear_atomic(); } @@ -256,7 +256,7 @@ void TableT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -269,67 +269,67 @@ void TableT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, - &this->ans->engv, &eflag, &vflag, &ainum, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { this->k_pair_linear_fast.set_size(GX,BX); - this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, + this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline_fast.set_size(GX,BX); - this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, + this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap_fast.set_size(GX,BX); this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits, &nmask, &coeff2, &coeff3, &coeff4, &cutsq, - &sp_lj, &this->nbor->dev_nbor, + &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, + &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); - } + } } else { if (_tabstyle == LOOKUP) { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, + this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { 
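// NOTE (illustrative sketch of the dispatch recipe every branch of loop()
// repeats; names are taken from the surrounding code): pick the kernel
// matching _tabstyle, size the grid so each atom owns _threads_per_atom
// lanes, then launch with the packed coefficient buffers:
//
//   int BX = this->block_pair();                  // threads per block
//   int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) /
//                                  (BX/this->_threads_per_atom)));
//   this->k_pair_linear.set_size(GX,BX);          // then ...run(&args)
//
// The *_fast variants are reached only when shared_types is true, i.e. all
// per-type constants fit in shared memory, which is why they omit the
// _lj_types argument required by the generic kernels.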
this->k_pair_linear.set_size(GX,BX); this->k_pair_linear.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, + &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline.set_size(GX,BX); this->k_pair_spline.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &coeff4, &_lj_types, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap.set_size(GX,BX); - this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, + this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, &nmask, &coeff2, &coeff3, &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, + &nbor_pitch, &this->_threads_per_atom, &_tablength); } } diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index 1033b7fbb8..971b56d96e 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -39,39 +39,39 @@ typedef union { /// ---------------- LOOKUP ------------------------------------------------- -__kernel void k_table(const __global numtyp4 *restrict x_, +__kernel void k_table(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } @@ -136,21 +136,21 @@ __kernel void k_table(const __global numtyp4 *restrict x_, __kernel void k_table_fast(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const 
__global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const __global numtyp *restrict cutsq_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -158,18 +158,18 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } @@ -235,24 +235,24 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, /// ---------------- LINEAR ------------------------------------------------- -__kernel void k_table_linear(const __global numtyp4 *restrict x_, +__kernel void k_table_linear(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -265,9 +265,9 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -334,23 +334,23 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, +__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const __global numtyp *restrict cutsq_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global 
acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -358,7 +358,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -439,39 +439,39 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, /// ---------------- SPLINE ------------------------------------------------- -__kernel void k_table_spline(const __global numtyp4 *restrict x_, +__kernel void k_table_spline(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; if (itable < tlm1) { - e = a * coeff3[idx].y + b * coeff3[idx+1].y + - ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * + e = a * coeff3[idx].y + b * coeff3[idx+1].y + + ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * coeff2[mtype].z; - } + } energy+=factor_lj*e; } if (vflag>0) { @@ -545,23 +545,23 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_table_spline_fast(const __global numtyp4 *x_, +__kernel void k_table_spline_fast(const __global numtyp4 *x_, const __global int *tabindex, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const __global numtyp *cutsq_in, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -569,7 +569,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, if (tid0) { 
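// NOTE (background for the expression below; not part of this patch): this
// is standard cubic-spline interpolation on the r^2 grid. With
// b = (rsq - rsq_i)*invdelta and a = 1 - b,
//
//   e(rsq) = a*e_i + b*e_{i+1}
//          + ((a^3 - a)*e2_i + (b^3 - b)*e2_{i+1}) * delta^2/6
//
// coeff2.z carries the delta^2/6 factor (deltasq6 in lal_table.h), and
// coeff4.y holds the second derivatives e2 packed by init(); the force path
// above applies the same identity to coeff3.z/coeff4.z (f, f2).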
numtyp e = (numtyp)0.0; if (itable < tlm1) { - e = a * coeff3[idx].y + b * coeff3[idx+1].y + - ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * + e = a * coeff3[idx].y + b * coeff3[idx+1].y + + ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * coeff2[mtype].z; - } + } energy+=factor_lj*e; } if (vflag>0) { @@ -657,41 +657,41 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, /// ---------------- BITMAP ------------------------------------------------- -__kernel void k_table_bitmap(const __global numtyp4 *x_, +__kernel void k_table_bitmap(const __global numtyp4 *x_, const __global int *tabindex, - const __global int *nshiftbits, + const __global int *nshiftbits, const __global int *nmask, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const int lj_types, const __global numtyp *cutsq, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii>= nshiftbits[mtype]; @@ -734,14 +734,14 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, value = coeff3[idx].z + fraction*coeff4[idx].z; force = factor_lj * value; } else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e = (numtyp)0.0; - if (itable <= tlm1) + if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -761,25 +761,25 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, } // if ii } -__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, +__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, const __global int *tabindex, - const __global int *nshiftbits, + const __global int *nshiftbits, const __global int *nmask, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const __global numtyp *cutsq_in, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -787,18 +787,18 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, if (tid>= nshiftbits[mtype]; @@ -842,14 +842,14 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, value = 
coeff3[idx].z + fraction*coeff4[idx].z; force = factor_lj * value; } else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e = (numtyp)0.0; - if (itable <= tlm1) + if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } diff --git a/lib/gpu/lal_table.h b/lib/gpu/lal_table.h index 0e04737d27..f667336679 100644 --- a/lib/gpu/lal_table.h +++ b/lib/gpu/lal_table.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Table : public BaseAtomic { public: Table(); - ~Table(); + ~Table(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,10 +38,10 @@ class Table : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double** cutsq, double ***host_table_coeffs, - double **host_table_data, + double **host_table_data, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, int tabstyle, int ntables, int tablength); @@ -54,42 +54,42 @@ class Table : public BaseAtomic { /// Total host memory used by library for pair style double host_memory_usage() const; - + // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_pair_linear, k_pair_linear_fast; UCL_Kernel k_pair_spline, k_pair_spline_fast; UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast; - + // --------------------------- TYPE DATA -------------------------- UCL_D_Vec tabindex, nshiftbits, nmask; - - /// coeff2.x = innersq, coeff2.y = invdelta, coeff2.z = deltasq6, + + /// coeff2.x = innersq, coeff2.y = invdelta, coeff2.z = deltasq6, UCL_D_Vec coeff2; - + /// coeff3.x = rsq, coeff3.y = e, coeff3.z = f UCL_D_Vec coeff3; - + /// coeff4.x = de, coeff4.y = df UCL_D_Vec coeff4; - + UCL_D_Vec cutsq; - + /// Special LJ values UCL_D_Vec sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// Table style, length and number of tables int _tabstyle,_tablength,_ntables; - + private: bool _allocated, _compiled_styles; - + void loop(const bool _eflag, const bool _vflag); }; diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp index 172acb7d39..a2b5c61e74 100644 --- a/lib/gpu/lal_table_ext.cpp +++ b/lib/gpu/lal_table_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ static Table TBMF; int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, double **table_data, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, - const 
double cell_size, int &gpu_mode, FILE *screen, + const double cell_size, int &gpu_mode, FILE *screen, int tabstyle, int ntables, int tablength) { TBMF.clear(); gpu_mode=TBMF.device->gpu_mode(); @@ -55,7 +55,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, int init_ok=0; if (world_me==0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->world_barrier(); @@ -73,11 +73,11 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, } if (gpu_rank==i && world_me!=0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ int ** table_gpu_compute_n(const int ago, const int inum_full, return TBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void table_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index bc89c53765..bf634cffc2 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -269,7 +269,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall, else _eflag=0; - int ainum=nall; + int ainum=nlist; int nbor_pitch=this->nbor->nbor_pitch(); int BX=this->block_pair(); int GX=static_cast(ceil(static_cast(ainum)/ @@ -279,7 +279,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -364,7 +364,7 @@ int ** TersoffT::compute(const int ago, const int inum_full, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -437,16 +437,18 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, 
&this->_threads_per_atom, &this->_gpu_nbor); } this->time_pair.stop(); diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index e98a454f58..b7d48d9e34 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -184,7 +184,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, __global acctyp4 * zetaij, const __global int * dev_nbor, const __global int * dev_packed, - const int eflag, const int nall, const int inum, + const int eflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq,n_stride; tpa_sq = fast_mul(t_per_atom,t_per_atom); @@ -210,7 +210,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, __syncthreads(); - if (ii param_c2) return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * // error in negligible 2nd term fixed 9/30/2015 - // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * + // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) * ucl_powr(tmp,-param_powern))); if (tmp < param_c4) return (numtyp)0.0; diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index bfcc9c3bd3..a01bcf63b1 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -269,7 +269,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall, else _eflag=0; - int ainum=nall; + int ainum=nlist; int nbor_pitch=this->nbor->nbor_pitch(); int BX=this->block_pair(); int GX=static_cast(ceil(static_cast(ainum)/ @@ -279,7 +279,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -364,7 +364,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -437,16 +437,18 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } this->time_pair.stop(); diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index ba4ad32005..3a81b36941 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -184,7 +184,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, __global acctyp4 * zetaij, const 
__global int * dev_nbor, const __global int * dev_packed, - const int eflag, const int nall, const int inum, + const int eflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq,n_stride; tpa_sq = fast_mul(t_per_atom,t_per_atom); @@ -210,7 +210,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, __syncthreads(); - if (ii param_ca1) return (numtyp)-0.5*(param_powern/param_powern_del) * - ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta; + ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta; if (tmp < param_ca4) return (numtyp)0.0; numtyp tmp_n = ucl_powr(tmp,param_powern); return (numtyp)-0.5 *(param_powern/param_powern_del) * - ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 / + ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 / ((numtyp)2.0*param_powern_del)))*tmp_n / zeta; } diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 57688f53ab..c1f3f25c04 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -294,7 +294,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall, else _eflag=0; - int ainum=nall; + int ainum=nlist; int nbor_pitch=this->nbor->nbor_pitch(); int BX=this->block_pair(); int GX=static_cast(ceil(static_cast(ainum)/ @@ -304,7 +304,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -389,7 +389,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -463,16 +463,18 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } this->time_pair.stop(); diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index 0d6c5a38d6..9509b9802c 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -188,7 +188,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, __global acctyp4 * zetaij, const __global int * dev_nbor, const __global int * dev_packed, - const int eflag, const int nall, const int inum, + const int eflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int 
tpa_sq,n_stride; tpa_sq = fast_mul(t_per_atom,t_per_atom); @@ -216,7 +216,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, __syncthreads(); - if (ii param_c2) return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * // error in negligible 2nd term fixed 9/30/2015 - // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * + // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) * ucl_powr(tmp,-param_powern))); if (tmp < param_c4) return (numtyp)0.0; diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp index 585dc069a0..a316d195ac 100644 --- a/lib/gpu/lal_yukawa.cpp +++ b/lib/gpu/lal_yukawa.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,19 +33,19 @@ YukawaT::Yukawa() : BaseAtomic(), _allocated(false) { } template -YukawaT::~Yukawa() { +YukawaT::~Yukawa() { clear(); } - + template int YukawaT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int YukawaT::init(const int ntypes, +int YukawaT::init(const int ntypes, double **host_cutsq, double kappa, - double **host_a, double **host_offset, + double **host_a, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -75,7 +75,7 @@ int YukawaT::init(const int ntypes, coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,host_offset, - host_cutsq); + host_cutsq); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -83,7 +83,7 @@ int YukawaT::init(const int ntypes, ucl_copy(sp_lj,dview,false); _kappa = kappa; - + _allocated=true; this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes(); return 0; @@ -122,7 +122,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,7 +134,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu index b0c3b9978d..a8d637ec97 100644 --- a/lib/gpu/lal_yukawa.cu +++ b/lib/gpu/lal_yukawa.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,14 +24,14 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_yukawa(const __global numtyp4 *restrict x_, +__kernel void k_yukawa(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, const numtyp kappa, const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 
*restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -49,20 +49,20 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff[mtype].x*screening*rinv; - energy+=factor_lj*(e-coeff[mtype].y); + energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; @@ -109,19 +109,19 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, +__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, - const numtyp kappa, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const numtyp kappa, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -129,7 +129,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e=coeff[mtype].x*screening*rinv; - energy+=factor_lj*(e-coeff[mtype].y); + energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_yukawa.h b/lib/gpu/lal_yukawa.h index 720dc903d0..4cc23c03e9 100644 --- a/lib/gpu/lal_yukawa.h +++ b/lib/gpu/lal_yukawa.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Yukawa : public BaseAtomic { public: Yukawa(); - ~Yukawa(); + ~Yukawa(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,8 +39,8 @@ class Yukawa : public BaseAtomic { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double kappa, double **host_a, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -57,16 +57,16 @@ class Yukawa : public BaseAtomic { /// coeff.x = a, coeff.y = offset, coeff.z = cutsq UCL_D_Vec coeff; - + /// Special LJ values UCL_D_Vec sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// kappa numtyp _kappa; diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp index 
70282a7117..af29938a68 100644 --- a/lib/gpu/lal_yukawa_colloid.cpp +++ b/lib/gpu/lal_yukawa_colloid.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -29,23 +29,23 @@ using namespace LAMMPS_AL; extern Device device; template -YukawaColloidT::YukawaColloid() : BaseAtomic(), +YukawaColloidT::YukawaColloid() : BaseAtomic(), _max_rad_size(0), _allocated(false) { } template -YukawaColloidT::~YukawaColloid() { +YukawaColloidT::~YukawaColloid() { clear(); } - + template int YukawaColloidT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int YukawaColloidT::init(const int ntypes, - double **host_cutsq, double **host_a, +int YukawaColloidT::init(const int ntypes, + double **host_cutsq, double **host_a, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -62,16 +62,16 @@ int YukawaColloidT::init(const int ntypes, _shared_view=false; // allocate rad - + int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - + _max_rad_size=static_cast(static_cast(ef_nall)*1.10); - + if (_shared_view==false) c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); - + rad_tex.get_texture(*(this->pair_program),"rad_tex"); rad_tex.bind_float(c_rad,1); @@ -96,13 +96,13 @@ int YukawaColloidT::init(const int ntypes, coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a, - host_offset,host_cutsq); + host_offset,host_cutsq); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); ucl_copy(sp_lj,dview,false); - + _allocated=true; this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes(); return 0; @@ -131,15 +131,15 @@ double YukawaColloidT::host_memory_usage() const { // Copy nbor list from host if necessary and then compute atom energies/forces // --------------------------------------------------------------------------- template -void YukawaColloidT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, int *ilist, +void YukawaColloidT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *rad) { this->acc_timers(); - + // ------------------- Resize rad array -------------------------- - + if (nall>_max_rad_size) { _max_rad_size=static_cast(static_cast(nall)*1.10); if (_shared_view==false) { @@ -157,7 +157,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, this->zero_timers(); return; } - + int ago=this->hd_balancer.ago_first(f_ago); int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); this->ans->inum(inum); @@ -170,7 +170,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, if (!success) return; } - + this->atom->cast_x_data(host_x,host_type); this->cast_rad_data(rad); this->hd_balancer.start_timer(); @@ -182,7 +182,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } - + // 
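// NOTE (sketch of the guard both compute() paths apply before touching the
// per-atom radius buffer; the static_cast template arguments are assumed):
//
//   if (nall > _max_rad_size) {
//     _max_rad_size = static_cast<int>(static_cast<double>(nall)*1.10);
//     if (_shared_view == false) {
//       c_rad.resize(_max_rad_size);   // resize may move the device buffer,
//       rad_tex.bind_float(c_rad, 1);  // so the texture must be re-bound
//     }
//   }
//
// The ~10% headroom keeps reallocation (and texture re-binding) rare as the
// local+ghost atom count nall fluctuates between neighbor builds.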
--------------------------------------------------------------------------- // Reneighbor on GPU and then compute per-atom densities // --------------------------------------------------------------------------- @@ -190,24 +190,24 @@ template int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *rad) { this->acc_timers(); - + // ------------------- Resize rad array ---------------------------- - + if (nall>_max_rad_size) { _max_rad_size=static_cast(static_cast(nall)*1.10); if (_shared_view==false) { c_rad.resize(_max_rad_size); rad_tex.bind_float(c_rad,1); } - } + } // ----------------------------------------------------------------- - + if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -215,21 +215,21 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall this->zero_timers(); return NULL; } - + // load balance, returning the atom count on the device (inum) this->hd_balancer.balance(cpu_time); int inum=this->hd_balancer.get_gpu_count(ago,inum_full); this->ans->inum(inum); host_start=inum; - - // Build neighbor list on GPU if necessary + + // Build neighbor list on GPU if necessary if (ago==0) { this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; this->cast_rad_data(rad); - this->hd_balancer.start_timer(); + this->hd_balancer.start_timer(); } else { this->atom->cast_x_data(host_x,host_type); this->cast_rad_data(rad); @@ -265,7 +265,7 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -280,8 +280,8 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); } diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu index f9f4767123..48ab47bc94 100644 --- a/lib/gpu/lal_yukawa_colloid.cu +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,15 +29,15 @@ texture rad_tex; #define rad_tex rad_ #endif -__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, +__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, const __global numtyp *restrict rad_, - const __global numtyp4 *restrict coeff, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global 
int *dev_packed, + const __global numtyp4 *restrict coeff, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const numtyp kappa) { @@ -56,21 +56,21 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff[mtype].x/kappa * screening; - energy+=factor_lj*(e-coeff[mtype].y); + energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; @@ -118,20 +118,20 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, +__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict rad_, - const __global numtyp4 *restrict coeff_in, + const __global numtyp4 *restrict coeff_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, const numtyp kappa) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -139,7 +139,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e=coeff[mtype].x/kappa * screening; - energy+=factor_lj*(e-coeff[mtype].y); + energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_yukawa_colloid.h b/lib/gpu/lal_yukawa_colloid.h index 5a9ee7ae6e..ba69bc4bae 100644 --- a/lib/gpu/lal_yukawa_colloid.h +++ b/lib/gpu/lal_yukawa_colloid.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class YukawaColloid : public BaseAtomic { public: YukawaColloid(); - ~YukawaColloid(); + ~YukawaColloid(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,8 +39,8 @@ class YukawaColloid : public BaseAtomic { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double **host_a, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double kappa); inline void cast_rad_data(double* rad) { @@ -70,22 +70,22 @@ class YukawaColloid : public BaseAtomic { /// Total host 
memory used by library for pair style double host_memory_usage() const; - + /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, + void compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *rad); - + /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, double *rad); // --------------------------- TEXTURES ----------------------------- @@ -101,7 +101,7 @@ class YukawaColloid : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; int _max_rad_size; diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp index 0e3c653e06..e2b0354d10 100644 --- a/lib/gpu/lal_yukawa_colloid_ext.cpp +++ b/lib/gpu/lal_yukawa_colloid_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -27,10 +27,10 @@ static YukawaColloid YKCOLLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, +int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, double **host_offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, + const double cell_size, int &gpu_mode, FILE *screen, const double kappa) { YKCOLLMF.clear(); gpu_mode=YKCOLLMF.device->gpu_mode(); @@ -54,8 +54,8 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) - init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->world_barrier(); @@ -72,12 +72,12 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) 
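A sketch of the staged initialization this wrapper shares with the other
lal_*_ext.cpp init functions: one process per world initializes first, then
the ranks sharing a GPU take their turn behind barriers. The procs_per_gpu
loop bound and the elided argument lists below are assumptions; only the
world_me/gpu_rank branches appear verbatim in the hunks above.

    int init_ok = 0;
    if (world_me == 0)                     // one rank builds first
      init_ok = YKCOLLMF.init(/* ... */);
    YKCOLLMF.device->world_barrier();
    for (int i = 0; i < procs_per_gpu; i++) {
      if (gpu_rank == i && world_me != 0)  // then one wave of ranks at a time
        init_ok = YKCOLLMF.init(/* ... */);
      YKCOLLMF.device->gpu_barrier();
    }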
@@ -103,11 +103,11 @@ int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_rad); -} - -void ykcolloid_gpu_compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, +} + +void ykcolloid_gpu_compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_rad) { diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp index 1cc89885aa..9d38387bc1 100644 --- a/lib/gpu/lal_yukawa_ext.cpp +++ b/lib/gpu/lal_yukawa_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Yukawa YKMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, - double **host_a, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double **host_a, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { YKMF.clear(); gpu_mode=YKMF.device->gpu_mode(); @@ -54,8 +54,8 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, int init_ok=0; if (world_me==0) - init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->world_barrier(); @@ -72,12 +72,12 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ int ** yukawa_gpu_compute_n(const int ago, const int inum_full, return YKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void yukawa_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_zbl.cpp b/lib/gpu/lal_zbl.cpp index e172d48b33..77e55a62f9 100644 --- a/lib/gpu/lal_zbl.cpp +++ b/lib/gpu/lal_zbl.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndactrung@gmail.com ***************************************************************************/ @@ -33,10 +33,10 @@ ZBLT::ZBL() 
: BaseAtomic(), _allocated(false) { } template -ZBLT::~ZBL() { +ZBLT::~ZBL() { clear(); } - + template int ZBLT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,15 +44,15 @@ int ZBLT::bytes_per_atom(const int max_nbors) const { template int ZBLT::init(const int ntypes, double **host_cutsq, - double **host_sw1, double **host_sw2, - double **host_sw3, double **host_sw4, + double **host_sw1, double **host_sw2, + double **host_sw3, double **host_sw4, double **host_sw5, - double **host_d1a, double **host_d2a, - double **host_d3a, double **host_d4a, - double **host_zze, double cut_globalsq, + double **host_d1a, double **host_d2a, + double **host_d3a, double **host_d4a, + double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -79,16 +79,16 @@ int ZBLT::init(const int ntypes, double **host_cutsq, coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_sw1,host_sw2, - host_zze, host_cutsq); + host_zze, host_cutsq); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_d1a,host_d2a, - host_d3a,host_d4a); + host_d3a,host_d4a); coeff3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff3,host_write,host_sw3,host_sw4,host_sw5); - _cut_globalsq = cut_globalsq; + _cut_globalsq = cut_globalsq; _cut_innersq = cut_innersq; _cut_inner = cut_inner; @@ -131,7 +131,7 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_zbl.cu b/lib/gpu/lal_zbl.cu index b14753b5fa..b7f379c833 100644 --- a/lib/gpu/lal_zbl.cu +++ b/lib/gpu/lal_zbl.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndactrung@gmail.com // ***************************************************************************/ @@ -35,9 +35,9 @@ texture pos_tex; compute ZBL pair energy ------------------------------------------------------------------------- */ -ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij, +ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij, numtyp d3aij, numtyp d4aij, numtyp zzeij) { - + numtyp rinv = ucl_recip(r); numtyp sum = c1*ucl_exp(-d1aij*r); @@ -54,7 +54,7 @@ ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij, compute ZBL first derivative ------------------------------------------------------------------------- */ -ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij, +ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij, numtyp d3aij, numtyp d4aij, numtyp zzeij) { numtyp rinv = ucl_recip(r); @@ -72,24 +72,24 @@ ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij, sum_p -= c2*d2aij*e2; sum_p -= c3*d3aij*e3; sum_p -= c4*d4aij*e4; - + numtyp result = zzeij*(sum_p - sum*rinv)*rinv; - + return result; }; -__kernel void k_zbl(const __global numtyp4 *restrict x_, +__kernel void 
                     const __global numtyp4 *restrict coeff1,
                     const __global numtyp4 *restrict coeff2,
                     const __global numtyp4 *restrict coeff3,
-                    const double cut_globalsq, 
-                    const double cut_innersq, 
-                    const double cut_inner, 
-                    const int lj_types, 
-                    const __global int *dev_nbor, 
-                    const __global int *dev_packed, 
+                    const double cut_globalsq,
+                    const double cut_innersq,
+                    const double cut_inner,
+                    const int lj_types,
+                    const __global int *dev_nbor,
+                    const __global int *dev_packed,
                     __global acctyp4 *restrict ans,
-                    __global acctyp *restrict engv, 
+                    __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -101,19 +101,19 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
- 
+
   if (ii<inum) {
     int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
       int j=dev_packed[nbor];
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       int mtype=itype*lj_types+jtype;
       if (rsq<cut_globalsq) {
         numtyp r, t;
         r = ucl_sqrt(rsq);
         numtyp force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
                               coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
- 
-        if (rsq>cut_innersq) {
-          t = r - cut_inner;
-          force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-        } 
+
+        if (rsq>cut_innersq) {
+          t = r - cut_inner;
+          force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+        }
 
         force *= (numtyp)-1.0*ucl_recip(r);
@@ -146,14 +146,14 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, 
+          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                          coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-          e += coeff3[mtype].z; 
-          if (rsq > cut_innersq) { 
-            e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t); 
-          } 
+          e += coeff3[mtype].z;
+          if (rsq > cut_innersq) {
+            e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+          }
 
-          energy+=e; 
+          energy+=e;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
@@ -171,22 +171,22 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_zbl_fast(const __global numtyp4 *restrict x_, 
+__kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict coeff1_in,
                          const __global numtyp4 *restrict coeff2_in,
                          const __global numtyp4 *restrict coeff3_in,
-                         const double cut_globalsq, 
-                         const double cut_innersq, 
-                         const double cut_inner, 
+                         const double cut_globalsq,
+                         const double cut_innersq,
+                         const double cut_inner,
                          const __global int *dev_nbor,
-                         const __global int *dev_packed, 
+                         const __global int *dev_packed,
                          __global acctyp4 *restrict ans,
-                         __global acctyp *restrict engv, 
-                         const int eflag, const int vflag, const int inum, 
+                         __global acctyp *restrict engv,
+                         const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
- 
+
   __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -195,7 +195,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     coeff2[tid]=coeff2_in[tid];
     coeff3[tid]=coeff3_in[tid];
   }
- 
+
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -204,7 +204,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   __syncthreads();
- 
+
   if (ii<inum) {
     int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
       int j=dev_packed[nbor];
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       if (rsq<cut_globalsq) {
         numtyp r, t;
         r = ucl_sqrt(rsq);
         numtyp force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
                               coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
- 
-        if (rsq>cut_innersq) {
-          t = r - cut_inner;
-          force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-        } 
+
+        if (rsq>cut_innersq) {
+          t = r - cut_inner;
+          force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+        }
 
         force *= (numtyp)-1.0*ucl_recip(r);
@@ -249,14 +249,14 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, 
+          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                          coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-          e += coeff3[mtype].z; 
-          if (rsq > cut_innersq) { 
-            e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t); 
-          } 
+          e += coeff3[mtype].z;
+          if (rsq > cut_innersq) {
+            e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+          }
 
-          energy+=e; 
+          energy+=e;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_zbl.h b/lib/gpu/lal_zbl.h
index 2996d90a5c..9885fcedf2 100644
--- a/lib/gpu/lal_zbl.h
+++ b/lib/gpu/lal_zbl.h
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -24,27 +24,27 @@
 template <class numtyp, class acctyp>
 class ZBL : public BaseAtomic<numtyp, acctyp> {
  public:
   ZBL();
-  ~ZBL(); 
+  ~ZBL();
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    * 
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
-  int init(const int ntypes, double **host_cutsq, double **host_sw1, 
+  int init(const int ntypes, double **host_cutsq, double **host_sw1,
            double **host_sw2, double **host_sw3, double **host_sw4,
            double **host_sw5,
-           double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a, 
+           double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a,
            double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner,
-           const int nlocal, const int nall, const int max_nbors, 
-           const int maxspecial, const double cell_size, 
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);
- 
+
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
@@ -70,8 +70,8 @@ class ZBL : public BaseAtomic<numtyp, acctyp> {
   double _cut_globalsq;
   double _cut_innersq;
   double _cut_inner;
- 
-  /// Number of atom types 
+
+  /// Number of atom types
   int _lj_types;
 
  private:
diff --git a/lib/gpu/lal_zbl_ext.cpp b/lib/gpu/lal_zbl_ext.cpp
index ddce858076..37aa74351b 100644
--- a/lib/gpu/lal_zbl_ext.cpp
+++ b/lib/gpu/lal_zbl_ext.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -27,11 +27,11 @@ static ZBL<PRECISION,ACC_PRECISION> ZBLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, 
+int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
                  double **host_sw2, double **host_sw3, double **host_sw4,
                  double **host_sw5,
-                 double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a, 
+                 double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a,
                  double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner,
-                 const int inum, const int nall, const int max_nbors, 
+                 const int inum, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) {
   ZBLMF.clear();
   gpu_mode=ZBLMF.device->gpu_mode();
@@ -55,7 +55,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
 
   int init_ok=0;
   if (world_me==0)
-    init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, 
+    init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4,
                        host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze,
                        cut_globalsq, cut_innersq, cut_inner,
                        inum, nall, 300, maxspecial, cell_size, gpu_split, screen);
@@ -74,13 +74,13 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, 
+      init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4,
                          host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze,
-                         cut_globalsq, cut_innersq, cut_inner, 
+                         cut_globalsq, cut_innersq, cut_inner,
                          inum, nall, 300, maxspecial, cell_size, gpu_split, screen);
 
     ZBLMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -105,8 +105,8 @@ int ** zbl_gpu_compute_n(const int ago, const int inum_full,
   return ZBLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success);
-} 
- 
+}
+
 void zbl_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
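
For reference, the pair math evaluated by the k_zbl kernels above can be written as a small standalone C++ sketch (illustrative only; it is not part of this commit). The constants c1..c4 are the coefficients of the ZBL universal screening function; the d1a..d4a exponents (d_k divided by the screening length a_ij), the zze prefactor (Zi*Zj*e^2/(4*pi*eps0)), and the sw1/sw2 switching coefficients are precomputed per type pair by ZBLT::init() from the arrays passed through zbl_gpu_init(). The function bodies mirror the e_zbl()/dzbldr() helpers and the kernel force branch shown in the diff; the standalone file, its free-function signatures, and the name force_over_r are hypothetical.

#include <cmath>

// ZBL universal screening coefficients; c_k pairs with exponent d_k
// (d1 = 0.20162 ... d4 = 3.19980), folded into the precomputed d*a arguments.
static const double c1 = 0.02817, c2 = 0.28022,
                    c3 = 0.50986, c4 = 0.18175;

// Pair energy, as in e_zbl() above: E(r) = (zze/r) * sum_k c_k*exp(-d_k*r/a).
double e_zbl(double r, double d1a, double d2a, double d3a, double d4a,
             double zze) {
  const double rinv = 1.0 / r;
  double sum = c1 * std::exp(-d1a * r);
  sum += c2 * std::exp(-d2a * r);
  sum += c3 * std::exp(-d3a * r);
  sum += c4 * std::exp(-d4a * r);
  return zze * sum * rinv;
}

// First derivative dE/dr, as in dzbldr() above: product rule on sum(r)/r.
double dzbldr(double r, double d1a, double d2a, double d3a, double d4a,
              double zze) {
  const double rinv = 1.0 / r;
  const double e1 = std::exp(-d1a * r), e2 = std::exp(-d2a * r);
  const double e3 = std::exp(-d3a * r), e4 = std::exp(-d4a * r);
  const double sum   = c1 * e1 + c2 * e2 + c3 * e3 + c4 * e4;
  const double sum_p = -c1 * d1a * e1 - c2 * d2a * e2
                       - c3 * d3a * e3 - c4 * d4a * e4;
  return zze * (sum_p - sum * rinv) * rinv;
}

// Scalar force prefactor following the k_zbl_fast branch above: beyond the
// inner cutoff the derivative is augmented by the switching polynomial
// t^2*(sw1 + sw2*t) (coeff1.x/.y in the kernels), then scaled by -1/r before
// multiplying the (delx,dely,delz) components.  The energy branch applies
// the analogous shift sw5 (coeff3.z) plus t^3*(sw3 + sw4*t).
double force_over_r(double r, double cut_inner, double cut_innersq,
                    double sw1, double sw2, double d1a, double d2a,
                    double d3a, double d4a, double zze) {
  double force = dzbldr(r, d1a, d2a, d3a, d4a, zze);
  if (r * r > cut_innersq) {          // smooth switching region
    const double t = r - cut_inner;
    force += t * t * (sw1 + sw2 * t);
  }
  return -force / r;
}

One detail worth noting: in this snapshot k_zbl assigns force = t*t*(...) inside the switching branch, while k_zbl_fast accumulates with force +=; the sketch follows the accumulating form.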