forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
8366b35459
commit
9656958169
|
@ -1,6 +1,6 @@
|
|||
# Settings that the LAMMPS build will import when this package library is used
|
||||
# settings for OpenCL builds
|
||||
gpu_SYSINC =
|
||||
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
|
||||
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
|
||||
gpu_SYSPATH =
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
|
||||
EXTRAMAKE = Makefile.lammps.standard
|
||||
|
||||
ifeq($(CUDA_HOME),)
|
||||
ifeq ($(CUDA_HOME),)
|
||||
CUDA_HOME = /usr/local/cuda
|
||||
endif
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
|
|||
OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
|
||||
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
|
||||
-I$(CUDA_HOME)/include
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
EXTRAMAKE = Makefile.lammps.mingw-cross
|
||||
|
|
|
@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
|
|||
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
|
||||
-I../../tools/mingw-cross/mpich2-win32/include/ \
|
||||
-DMPICH_IGNORE_CXX_SEEK
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
|
||||
-L../../tools/mingw-cross/mpich2-win32/lib -lmpi
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
|
|
|
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
|
|||
OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
|
||||
-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
|
||||
-I$(CUDA_HOME)/include
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
|
||||
-L../../src/STUBS -lmpi_mingw64
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
|
|
|
@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
|
|||
-I../../tools/mingw-cross/mpich2-win64/include/ \
|
||||
-DMPICH_IGNORE_CXX_SEEK
|
||||
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
|
||||
-L../../tools/mingw-cross/mpich2-win64/lib -lmpi
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -35,7 +35,7 @@ namespace ucl_cudadr {
|
|||
// --------------------------------------------------------------------------
|
||||
// - COMMAND QUEUE STUFF
|
||||
// --------------------------------------------------------------------------
|
||||
typedef CUstream command_queue;
|
||||
typedef CUstream command_queue;
|
||||
|
||||
inline void ucl_sync(CUstream &stream) {
|
||||
CU_SAFE_CALL(cuStreamSynchronize(stream));
|
||||
|
@ -59,21 +59,21 @@ struct NVDProperties {
|
|||
|
||||
/// Class for looking at device properties
|
||||
/** \note Calls to change the device outside of the class result in incorrect
|
||||
* behavior
|
||||
* behavior
|
||||
* \note There is no error checking for indexing past the number of devices **/
|
||||
class UCL_Device {
|
||||
public:
|
||||
/// Collect properties for every GPU on the node
|
||||
/** \note You must set the active GPU with set() before using the device **/
|
||||
inline UCL_Device();
|
||||
|
||||
|
||||
inline ~UCL_Device();
|
||||
|
||||
/// Returns 1 (For compatibility with OpenCL)
|
||||
inline int num_platforms() { return 1; }
|
||||
|
||||
/// Return a string with name and info of the current platform
|
||||
inline std::string platform_name()
|
||||
inline std::string platform_name()
|
||||
{ return "NVIDIA Corporation NVIDIA CUDA Driver"; }
|
||||
|
||||
/// Delete any contexts/data and set the platform number to be used
|
||||
|
@ -97,24 +97,24 @@ class UCL_Device {
|
|||
|
||||
/// Returns the default stream for the current device
|
||||
inline command_queue & cq() { return cq(0); }
|
||||
|
||||
|
||||
/// Returns the stream indexed by i
|
||||
inline command_queue & cq(const int i) { return _cq[i]; }
|
||||
|
||||
|
||||
/// Block until all commands in the default stream have completed
|
||||
inline void sync() { sync(0); }
|
||||
|
||||
|
||||
/// Block until all commands in the specified stream have completed
|
||||
inline void sync(const int i) { ucl_sync(cq(i)); }
|
||||
|
||||
|
||||
/// Get the number of command queues currently available on device
|
||||
inline int num_queues()
|
||||
inline int num_queues()
|
||||
{ return _cq.size(); }
|
||||
|
||||
|
||||
/// Add a stream for device computations
|
||||
inline void push_command_queue() {
|
||||
_cq.push_back(CUstream());
|
||||
CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
|
||||
_cq.push_back(CUstream());
|
||||
CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
|
||||
}
|
||||
|
||||
/// Remove a stream for device computations
|
||||
|
@ -124,19 +124,19 @@ class UCL_Device {
|
|||
CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back()));
|
||||
_cq.pop_back();
|
||||
}
|
||||
|
||||
|
||||
/// Set the default command queue (by default this is the null stream)
|
||||
/** \param i index of the command queue (as added by push_command_queue())
|
||||
/** \param i index of the command queue (as added by push_command_queue())
|
||||
If i is 0, the default command queue is set to the null stream **/
|
||||
inline void set_command_queue(const int i) {
|
||||
if (i==0) _cq[0]=0;
|
||||
else _cq[0]=_cq[i];
|
||||
}
|
||||
|
||||
|
||||
/// Get the current CUDA device name
|
||||
inline std::string name() { return name(_device); }
|
||||
/// Get the CUDA device name
|
||||
inline std::string name(const int i)
|
||||
inline std::string name(const int i)
|
||||
{ return std::string(_properties[i].name); }
|
||||
|
||||
/// Get a string telling the type of the current device
|
||||
|
@ -148,38 +148,38 @@ class UCL_Device {
|
|||
inline int device_type() { return device_type(_device); }
|
||||
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
|
||||
inline int device_type(const int i) { return UCL_GPU; }
|
||||
|
||||
|
||||
/// Returns true if host memory is efficiently addressable from device
|
||||
inline bool shared_memory() { return shared_memory(_device); }
|
||||
/// Returns true if host memory is efficiently addressable from device
|
||||
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
|
||||
|
||||
|
||||
/// Returns true if double precision is supported for the current device
|
||||
inline bool double_precision() { return double_precision(_device); }
|
||||
/// Returns true if double precision is supported for the device
|
||||
inline bool double_precision(const int i) {return arch(i)>=1.3;}
|
||||
|
||||
|
||||
/// Get the number of compute units on the current device
|
||||
inline unsigned cus() { return cus(_device); }
|
||||
/// Get the number of compute units
|
||||
inline unsigned cus(const int i)
|
||||
inline unsigned cus(const int i)
|
||||
{ return _properties[i].multiProcessorCount; }
|
||||
|
||||
/// Get the number of cores in the current device
|
||||
inline unsigned cores() { return cores(_device); }
|
||||
/// Get the number of cores
|
||||
inline unsigned cores(const int i)
|
||||
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
|
||||
inline unsigned cores(const int i)
|
||||
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
|
||||
else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
|
||||
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
|
||||
else return _properties[i].multiProcessorCount*192; }
|
||||
|
||||
|
||||
/// Get the gigabytes of global memory in the current device
|
||||
inline double gigabytes() { return gigabytes(_device); }
|
||||
/// Get the gigabytes of global memory
|
||||
inline double gigabytes(const int i)
|
||||
inline double gigabytes(const int i)
|
||||
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
|
||||
|
||||
|
||||
/// Get the bytes of global memory in the current device
|
||||
inline size_t bytes() { return bytes(_device); }
|
||||
/// Get the bytes of global memory
|
||||
|
@ -188,13 +188,13 @@ class UCL_Device {
|
|||
// Get the gigabytes of free memory in the current device
|
||||
inline double free_gigabytes() { return free_gigabytes(_device); }
|
||||
// Get the gigabytes of free memory
|
||||
inline double free_gigabytes(const int i)
|
||||
inline double free_gigabytes(const int i)
|
||||
{ return static_cast<double>(free_bytes(i))/1073741824; }
|
||||
|
||||
|
||||
// Get the bytes of free memory in the current device
|
||||
inline size_t free_bytes() { return free_bytes(_device); }
|
||||
// Get the bytes of free memory
|
||||
inline size_t free_bytes(const int i) {
|
||||
inline size_t free_bytes(const int i) {
|
||||
CUDA_INT_TYPE dfree, dtotal;
|
||||
CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal));
|
||||
return static_cast<size_t>(dfree);
|
||||
|
@ -203,21 +203,21 @@ class UCL_Device {
|
|||
/// Return the GPGPU compute capability for current device
|
||||
inline double arch() { return arch(_device); }
|
||||
/// Return the GPGPU compute capability
|
||||
inline double arch(const int i)
|
||||
inline double arch(const int i)
|
||||
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
|
||||
|
||||
|
||||
/// Clock rate in GHz for current device
|
||||
inline double clock_rate() { return clock_rate(_device); }
|
||||
/// Clock rate in GHz
|
||||
inline double clock_rate(const int i)
|
||||
inline double clock_rate(const int i)
|
||||
{ return _properties[i].p.clockRate*1e-6;}
|
||||
|
||||
|
||||
/// Get the maximum number of threads per block
|
||||
inline size_t group_size() { return group_size(_device); }
|
||||
/// Get the maximum number of threads per block
|
||||
inline size_t group_size(const int i)
|
||||
inline size_t group_size(const int i)
|
||||
{ return _properties[i].p.maxThreadsPerBlock; }
|
||||
|
||||
|
||||
/// Return the maximum memory pitch in bytes for current device
|
||||
inline size_t max_pitch() { return max_pitch(_device); }
|
||||
/// Return the maximum memory pitch in bytes
|
||||
|
@ -242,7 +242,7 @@ class UCL_Device {
|
|||
{ return fission_by_counts(_device); }
|
||||
/// True if splitting device into subdevices by specified counts supported
|
||||
inline bool fission_by_counts(const int i)
|
||||
{ return false; }
|
||||
{ return false; }
|
||||
/// True if splitting device into subdevices by affinity domains supported
|
||||
inline bool fission_by_affinity()
|
||||
{ return fission_by_affinity(_device); }
|
||||
|
@ -259,7 +259,7 @@ class UCL_Device {
|
|||
|
||||
/// List all devices along with all properties
|
||||
inline void print_all(std::ostream &out);
|
||||
|
||||
|
||||
private:
|
||||
int _device, _num_devices;
|
||||
std::vector<NVDProperties> _properties;
|
||||
|
@ -279,16 +279,16 @@ UCL_Device::UCL_Device() {
|
|||
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
|
||||
if (major==9999)
|
||||
continue;
|
||||
|
||||
|
||||
_properties.push_back(NVDProperties());
|
||||
_properties.back().device_id=dev;
|
||||
_properties.back().major=major;
|
||||
_properties.back().minor=minor;
|
||||
|
||||
|
||||
char namecstr[1024];
|
||||
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
|
||||
_properties.back().name=namecstr;
|
||||
|
||||
|
||||
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
|
||||
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
|
||||
|
@ -296,23 +296,23 @@ UCL_Device::UCL_Device() {
|
|||
CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
|
||||
#if CUDA_VERSION >= 2020
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().kernelExecTimeoutEnabled,
|
||||
&_properties.back().kernelExecTimeoutEnabled,
|
||||
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().integrated,
|
||||
CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().canMapHostMemory,
|
||||
&_properties.back().canMapHostMemory,
|
||||
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode,
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode,
|
||||
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
|
||||
#endif
|
||||
#if CUDA_VERSION >= 3010
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().concurrentKernels,
|
||||
&_properties.back().concurrentKernels,
|
||||
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
|
||||
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
|
||||
&_properties.back().ECCEnabled,
|
||||
&_properties.back().ECCEnabled,
|
||||
CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
|
||||
#endif
|
||||
}
|
||||
|
@ -365,7 +365,7 @@ void UCL_Device::print_all(std::ostream &out) {
|
|||
cuDriverGetVersion(&driver_version);
|
||||
out << "CUDA Driver Version: "
|
||||
<< driver_version/1000 << "." << driver_version%100
|
||||
<< std::endl;
|
||||
<< std::endl;
|
||||
#endif
|
||||
|
||||
if (num_devices() == 0)
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -35,15 +35,15 @@ template <class numtyp> class UCL_D_Mat;
|
|||
template <class hosttype, class devtype> class UCL_Vector;
|
||||
template <class hosttype, class devtype> class UCL_Matrix;
|
||||
#define UCL_MAX_KERNEL_ARGS 256
|
||||
|
||||
|
||||
/// Class storing 1 or more kernel functions from a single string or file
|
||||
class UCL_Program {
|
||||
public:
|
||||
inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
|
||||
inline UCL_Program(UCL_Device &device, const void *program,
|
||||
const char *flags="", std::string *log=NULL) {
|
||||
inline UCL_Program(UCL_Device &device, const void *program,
|
||||
const char *flags="", std::string *log=NULL) {
|
||||
_cq=device.cq();
|
||||
init(device);
|
||||
init(device);
|
||||
load_string(program,flags,log);
|
||||
}
|
||||
|
||||
|
@ -61,20 +61,20 @@ class UCL_Program {
|
|||
std::string *log=NULL) {
|
||||
std::ifstream in(filename);
|
||||
if (!in || in.is_open()==false) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not open kernel file: "
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not open kernel file: "
|
||||
<< filename << std::endl;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
return UCL_FILE_NOT_FOUND;
|
||||
}
|
||||
|
||||
|
||||
std::string program((std::istreambuf_iterator<char>(in)),
|
||||
std::istreambuf_iterator<char>());
|
||||
in.close();
|
||||
return load_string(program.c_str(),flags,log);
|
||||
}
|
||||
|
||||
|
||||
/// Load a program from a string and compile with flags
|
||||
inline int load_string(const void *program, const char *flags="",
|
||||
std::string *log=NULL) {
|
||||
|
@ -94,12 +94,12 @@ class UCL_Program {
|
|||
|
||||
CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
|
||||
options,(void **)values);
|
||||
|
||||
|
||||
if (log!=NULL)
|
||||
*log=std::string(clog);
|
||||
|
||||
|
||||
if (err != CUDA_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << std::endl
|
||||
<< "----------------------------------------------------------\n"
|
||||
<< " UCL Error: Error compiling PTX Program...\n"
|
||||
|
@ -108,24 +108,24 @@ class UCL_Program {
|
|||
#endif
|
||||
return UCL_COMPILE_ERROR;
|
||||
}
|
||||
|
||||
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Load a precompiled program from a file
|
||||
inline int load_binary(const char *filename) {
|
||||
CUmodule _module;
|
||||
CUresult err = cuModuleLoad(&_module,filename);
|
||||
if (err==301) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not open binary kernel file: "
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not open binary kernel file: "
|
||||
<< filename << std::endl;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
return UCL_FILE_NOT_FOUND;
|
||||
} else if (err!=CUDA_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Error loading binary kernel file: "
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Error loading binary kernel file: "
|
||||
<< filename << std::endl;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
|
@ -138,7 +138,7 @@ class UCL_Program {
|
|||
// return UCL_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
friend class UCL_Kernel;
|
||||
private:
|
||||
CUmodule _module;
|
||||
|
@ -149,23 +149,23 @@ class UCL_Program {
|
|||
/// Class for dealing with CUDA Driver kernels
|
||||
class UCL_Kernel {
|
||||
public:
|
||||
UCL_Kernel() : _dimensions(1), _num_args(0) {
|
||||
UCL_Kernel() : _dimensions(1), _num_args(0) {
|
||||
#if CUDA_VERSION < 4000
|
||||
_param_size=0;
|
||||
#endif
|
||||
_num_blocks[0]=0;
|
||||
_num_blocks[0]=0;
|
||||
}
|
||||
|
||||
UCL_Kernel(UCL_Program &program, const char *function) :
|
||||
|
||||
UCL_Kernel(UCL_Program &program, const char *function) :
|
||||
_dimensions(1), _num_args(0) {
|
||||
#if CUDA_VERSION < 4000
|
||||
_param_size=0;
|
||||
#endif
|
||||
_num_blocks[0]=0;
|
||||
set_function(program,function);
|
||||
_cq=program._cq;
|
||||
_num_blocks[0]=0;
|
||||
set_function(program,function);
|
||||
_cq=program._cq;
|
||||
}
|
||||
|
||||
|
||||
~UCL_Kernel() {}
|
||||
|
||||
/// Clear any function associated with the kernel
|
||||
|
@ -189,7 +189,7 @@ class UCL_Kernel {
|
|||
|
||||
/// Set the kernel argument.
|
||||
/** If not a device pointer, this must be repeated each time the argument
|
||||
* changes
|
||||
* changes
|
||||
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
|
||||
template <class dtype>
|
||||
inline void set_arg(const unsigned index, const dtype * const arg) {
|
||||
|
@ -202,27 +202,27 @@ class UCL_Kernel {
|
|||
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
|
||||
#endif
|
||||
else
|
||||
assert(0==1); // Must add kernel parameters in sequential order
|
||||
assert(0==1); // Must add kernel parameters in sequential order
|
||||
}
|
||||
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a kernel argument.
|
||||
|
@ -257,37 +257,37 @@ class UCL_Kernel {
|
|||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks, const size_t block_size) {
|
||||
_dimensions=1;
|
||||
_num_blocks[0]=num_blocks;
|
||||
inline void set_size(const size_t num_blocks, const size_t block_size) {
|
||||
_dimensions=1;
|
||||
_num_blocks[0]=num_blocks;
|
||||
_num_blocks[1]=1;
|
||||
_num_blocks[2]=1;
|
||||
#if CUDA_VERSION >= 4000
|
||||
_block_size[0]=block_size;
|
||||
_block_size[1]=1;
|
||||
_block_size[2]=1;
|
||||
#else
|
||||
#else
|
||||
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
|
||||
#endif
|
||||
}
|
||||
|
@ -303,43 +303,43 @@ class UCL_Kernel {
|
|||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y) {
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x;
|
||||
_num_blocks[1]=num_blocks_y;
|
||||
const size_t block_size_x, const size_t block_size_y) {
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x;
|
||||
_num_blocks[1]=num_blocks_y;
|
||||
_num_blocks[2]=1;
|
||||
#if CUDA_VERSION >= 4000
|
||||
_block_size[0]=block_size_x;
|
||||
_block_size[1]=block_size_y;
|
||||
_block_size[2]=1;
|
||||
#else
|
||||
#else
|
||||
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue for the kernel is changed to cq **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y,
|
||||
command_queue &cq)
|
||||
command_queue &cq)
|
||||
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x,
|
||||
const size_t block_size_x,
|
||||
const size_t block_size_y, const size_t block_size_z) {
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x;
|
||||
_num_blocks[1]=num_blocks_y;
|
||||
_num_blocks[2]=1;
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x;
|
||||
_num_blocks[1]=num_blocks_y;
|
||||
_num_blocks[2]=1;
|
||||
#if CUDA_VERSION >= 4000
|
||||
_block_size[0]=block_size_x;
|
||||
_block_size[1]=block_size_y;
|
||||
_block_size[2]=block_size_z;
|
||||
#else
|
||||
#else
|
||||
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
|
||||
block_size_z));
|
||||
#endif
|
||||
|
@ -352,10 +352,10 @@ class UCL_Kernel {
|
|||
const size_t block_size_x, const size_t block_size_y,
|
||||
const size_t block_size_z, command_queue &cq) {
|
||||
_cq=cq;
|
||||
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
|
||||
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
|
||||
block_size_z);
|
||||
}
|
||||
|
||||
|
||||
/// Run the kernel in the default command queue
|
||||
inline void run() {
|
||||
#if CUDA_VERSION >= 4000
|
||||
|
@ -367,12 +367,12 @@ class UCL_Kernel {
|
|||
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Clear any arguments associated with the kernel
|
||||
inline void clear_args() {
|
||||
_num_args=0;
|
||||
inline void clear_args() {
|
||||
_num_args=0;
|
||||
#if CUDA_VERSION < 4000
|
||||
_offsets.clear();
|
||||
_offsets.clear();
|
||||
_param_size=0;
|
||||
#endif
|
||||
}
|
||||
|
@ -390,7 +390,7 @@ class UCL_Kernel {
|
|||
unsigned _num_blocks[3];
|
||||
unsigned _num_args;
|
||||
friend class UCL_Texture;
|
||||
|
||||
|
||||
#if CUDA_VERSION >= 4000
|
||||
unsigned _block_size[3];
|
||||
void * _kernel_args[UCL_MAX_KERNEL_ARGS];
|
||||
|
|
|
@ -17,12 +17,12 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
/*! \file */
|
||||
|
||||
|
||||
#ifndef NVD_MAT_H
|
||||
#define NVD_MAT_H
|
||||
|
||||
|
@ -52,6 +52,6 @@ namespace ucl_cudadr {
|
|||
#include "ucl_print.h"
|
||||
#undef UCL_PRINT_ALLOW
|
||||
|
||||
} // namespace ucl_cudadr
|
||||
} // namespace ucl_cudadr
|
||||
|
||||
#endif
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -46,7 +46,7 @@ typedef CUdeviceptr device_ptr;
|
|||
// - HOST MEMORY ALLOCATION ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
template <class mat_type, class copy_type>
|
||||
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
||||
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
||||
CUresult err=CUDA_SUCCESS;
|
||||
if (kind==UCL_NOT_PINNED)
|
||||
|
@ -62,7 +62,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
||||
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
||||
CUresult err=CUDA_SUCCESS;
|
||||
if (kind==UCL_NOT_PINNED)
|
||||
|
@ -95,7 +95,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
|
|||
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
||||
else if (mat.kind()==UCL_WRITE_ONLY)
|
||||
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
|
||||
else
|
||||
else
|
||||
err=cuMemAllocHost((void **)mat.host_ptr(),n);
|
||||
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
||||
return UCL_MEMORY_ERROR;
|
||||
|
@ -130,30 +130,30 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
|
|||
const size_t cols, size_t &pitch,
|
||||
const enum UCL_MEMOPT kind) {
|
||||
CUresult err;
|
||||
CUDA_INT_TYPE upitch;
|
||||
CUDA_INT_TYPE upitch;
|
||||
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
|
||||
cols*sizeof(typename mat_type::data_type),rows,16);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
if (err!=CUDA_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=cm.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
template <class mat_type, class copy_type>
|
||||
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
|
||||
const size_t cols, size_t &pitch,
|
||||
const enum UCL_MEMOPT kind) {
|
||||
CUresult err;
|
||||
unsigned upitch;
|
||||
unsigned upitch;
|
||||
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
|
||||
cols*sizeof(typename mat_type::data_type),rows,16);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
if (err!=CUDA_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=d.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline void _device_free(mat_type &mat) {
|
||||
|
@ -175,33 +175,33 @@ inline int _device_resize(mat_type &mat, const size_t rows,
|
|||
const size_t cols, size_t &pitch) {
|
||||
_device_free(mat);
|
||||
CUresult err;
|
||||
CUDA_INT_TYPE upitch;
|
||||
CUDA_INT_TYPE upitch;
|
||||
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
|
||||
cols*sizeof(typename mat_type::data_type),rows,16);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
if (err!=CUDA_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
|
||||
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
|
||||
*ptr=in;
|
||||
}
|
||||
|
||||
template <class numtyp>
|
||||
inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
|
||||
*ptr=0;
|
||||
inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
|
||||
*ptr=0;
|
||||
}
|
||||
|
||||
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
|
||||
const size_t offset, const size_t numsize) {
|
||||
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
|
||||
const size_t offset, const size_t numsize) {
|
||||
*ptr=in+offset*numsize;
|
||||
}
|
||||
|
||||
template <class numtyp>
|
||||
inline void _device_view(CUdeviceptr *ptr, numtyp *in,
|
||||
const size_t offset, const size_t numsize) {
|
||||
*ptr=0;
|
||||
const size_t offset, const size_t numsize) {
|
||||
*ptr=0;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
|
@ -211,13 +211,13 @@ template <class mat_type, class copy_type>
|
|||
inline void _device_image_alloc(mat_type &mat, copy_type &cm,
                                const size_t rows, const size_t cols) {
  // No implementation is provided here: the only statement is an assertion
  // that always fails.
  assert(0==1);
}
|
||||
|
||||
template <class mat_type, class copy_type>
|
||||
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
|
||||
const size_t cols) {
|
||||
assert(0==1);
|
||||
}
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline void _device_image_free(mat_type &mat) {
|
||||
|
@ -245,7 +245,7 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
|
|||
// - HELPER FUNCTIONS FOR MEMCPY ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
|
||||
inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
ins.srcXInBytes=0;
|
||||
|
@ -257,13 +257,13 @@ inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
|
|||
ins.WidthInBytes=cols;
|
||||
ins.Height=rows;
|
||||
}
|
||||
|
||||
|
||||
template <int mem> struct _nvd_set_2D_mem;
|
||||
template <> struct _nvd_set_2D_mem<1>
|
||||
template <> struct _nvd_set_2D_mem<1>
|
||||
{ static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
|
||||
template <> struct _nvd_set_2D_mem<2>
|
||||
template <> struct _nvd_set_2D_mem<2>
|
||||
{ static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
|
||||
template <int mem> struct _nvd_set_2D_mem
|
||||
template <int mem> struct _nvd_set_2D_mem
|
||||
{ static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };
|
||||
|
||||
|
||||
|
@ -285,7 +285,7 @@ template<> struct _ucl_memcpy<2,2> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -297,7 +297,7 @@ template<> struct _ucl_memcpy<2,2> {
|
|||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -322,7 +322,7 @@ template<> struct _ucl_memcpy<2,0> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -334,7 +334,7 @@ template<> struct _ucl_memcpy<2,0> {
|
|||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -359,7 +359,7 @@ template<> struct _ucl_memcpy<2,1> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -371,7 +371,7 @@ template<> struct _ucl_memcpy<2,1> {
|
|||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -396,7 +396,7 @@ template<> struct _ucl_memcpy<0,2> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -408,7 +408,7 @@ template<> struct _ucl_memcpy<0,2> {
|
|||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -433,7 +433,7 @@ template<> struct _ucl_memcpy<1,2> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -445,7 +445,7 @@ template<> struct _ucl_memcpy<1,2> {
|
|||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -470,7 +470,7 @@ template <> struct _ucl_memcpy<1,0> {
|
|||
CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -482,7 +482,7 @@ template <> struct _ucl_memcpy<1,0> {
|
|||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -507,7 +507,7 @@ template <> struct _ucl_memcpy<0,1> {
|
|||
CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -519,7 +519,7 @@ template <> struct _ucl_memcpy<0,1> {
|
|||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -542,7 +542,7 @@ template <> struct _ucl_memcpy<1,1> {
|
|||
CUstream &cq)
|
||||
{ memcpy(dst.begin(),src.begin(),n); }
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -554,7 +554,7 @@ template <> struct _ucl_memcpy<1,1> {
|
|||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
|
@ -579,18 +579,18 @@ template <int mem1, int mem2> struct _ucl_memcpy {
|
|||
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
if (p1::PADDED==0 || p2::PADDED==0) {
|
||||
size_t src_offset=0, dst_offset=0;
|
||||
for (size_t i=0; i<rows; i++) {
|
||||
for (size_t i=0; i<rows; i++) {
|
||||
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
|
||||
src.cbegin()+src_offset,cols));
|
||||
src_offset+=spitch;
|
||||
dst_offset+=dpitch;
|
||||
}
|
||||
} else {
|
||||
} else {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
|
@ -601,12 +601,12 @@ template <int mem1, int mem2> struct _ucl_memcpy {
|
|||
}
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, CUstream &cq) {
|
||||
if (p1::PADDED==0 || p2::PADDED==0) {
|
||||
size_t src_offset=0, dst_offset=0;
|
||||
for (size_t i=0; i<rows; i++) {
|
||||
for (size_t i=0; i<rows; i++) {
|
||||
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
|
||||
src.cbegin()+src_offset,cols,cq));
|
||||
src_offset+=spitch;
|
||||
|
@ -636,22 +636,22 @@ inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
|
|||
}
|
||||
|
||||
template<class mat1, class mat2>
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
|
||||
rows);
|
||||
}
|
||||
|
||||
template<class mat1, class mat2>
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows,CUstream &cq) {
|
||||
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
|
||||
rows,cq);
|
||||
}
|
||||
|
||||
} // namespace ucl_cudart
|
||||
} // namespace ucl_cudart
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -28,7 +28,7 @@
|
|||
#include "nvd_mat.h"
|
||||
|
||||
namespace ucl_cudadr {
|
||||
|
||||
|
||||
/// Class storing a texture reference
|
||||
class UCL_Texture {
|
||||
public:
|
||||
|
@ -38,39 +38,39 @@ class UCL_Texture {
|
|||
inline UCL_Texture(UCL_Program &prog, const char *texture_name) {
  // Construction is just a call to get_texture() with the same arguments
  get_texture(prog,texture_name);
}
|
||||
/// Set the texture reference for this object
|
||||
inline void get_texture(UCL_Program &prog, const char *texture_name)
|
||||
inline void get_texture(UCL_Program &prog, const char *texture_name)
|
||||
{ CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp>
|
||||
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
|
||||
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec,numel); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp>
|
||||
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
|
||||
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec,numel); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp, class devtyp>
|
||||
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
|
||||
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec.device,numel); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp, class devtyp>
|
||||
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
|
||||
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec.device,numel); }
|
||||
|
||||
/// Unbind the texture reference from the memory allocation
/** The body is empty: calling this performs no work. **/
inline void unbind() {
}
|
||||
|
||||
/// Make a texture reference available to kernel
|
||||
inline void allow(UCL_Kernel &kernel) {
|
||||
/// Make a texture reference available to kernel
|
||||
inline void allow(UCL_Kernel &kernel) {
|
||||
#if CUDA_VERSION < 4000
|
||||
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
|
||||
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
CUtexref _tex;
|
||||
friend class UCL_Kernel;
|
||||
|
@ -80,7 +80,7 @@ class UCL_Texture {
|
|||
#ifdef UCL_DEBUG
|
||||
assert(numel!=0 && numel<5);
|
||||
#endif
|
||||
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
|
||||
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
|
||||
vec.numel()*vec.element_size()));
|
||||
if (vec.element_size()==sizeof(float))
|
||||
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -41,7 +41,7 @@ class UCL_Timer {
|
|||
/// Clear any data associated with timer
|
||||
/** \note init() must be called to reuse timer after a clear() **/
|
||||
inline void clear() {
|
||||
if (_initialized) {
|
||||
if (_initialized) {
|
||||
CU_DESTRUCT_CALL(cuEventDestroy(start_event));
|
||||
CU_DESTRUCT_CALL(cuEventDestroy(stop_event));
|
||||
_initialized=false;
|
||||
|
@ -63,16 +63,16 @@ class UCL_Timer {
|
|||
|
||||
/// Start timing on command queue
|
||||
inline void start() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); }
|
||||
|
||||
|
||||
/// Stop timing on command queue
|
||||
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
|
||||
|
||||
|
||||
/// Block until the start event has been reached on device
|
||||
inline void sync_start()
|
||||
inline void sync_start()
|
||||
{ CU_SAFE_CALL(cuEventSynchronize(start_event)); }
|
||||
|
||||
/// Block until the stop event has been reached on device
|
||||
inline void sync_stop()
|
||||
inline void sync_stop()
|
||||
{ CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
|
||||
|
||||
/// Set the time elapsed to zero (not the total_time)
|
||||
|
@ -80,29 +80,29 @@ class UCL_Timer {
|
|||
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
|
||||
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
|
||||
}
|
||||
|
||||
|
||||
/// Set the total time to zero
|
||||
inline void zero_total() { _total_time=0.0; }
|
||||
|
||||
|
||||
/// Add time from previous start and stop to total
|
||||
/** Forces synchronization **/
|
||||
inline double add_to_total()
|
||||
inline double add_to_total()
|
||||
{ double t=time(); _total_time+=t; return t/1000.0; }
|
||||
|
||||
|
||||
/// Add a user specified time to the total (ms)
|
||||
inline void add_time_to_total(const double t) { _total_time+=t; }
|
||||
|
||||
|
||||
/// Return the time (ms) of last start to stop - Forces synchronization
|
||||
inline double time() {
|
||||
inline double time() {
|
||||
float timer;
|
||||
CU_SAFE_CALL(cuEventSynchronize(stop_event));
|
||||
CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) );
|
||||
return timer;
|
||||
return timer;
|
||||
}
|
||||
|
||||
|
||||
/// Return the time (s) of last start to stop - Forces synchronization
|
||||
inline double seconds() { return time()/1000.0; }
|
||||
|
||||
|
||||
/// Return the total time in ms
|
||||
inline double total_time() { return _total_time; }
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -40,13 +40,13 @@
|
|||
#include "ucl_types.h"
|
||||
|
||||
namespace ucl_opencl {
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - COMMAND QUEUE STUFF
|
||||
// --------------------------------------------------------------------------
|
||||
typedef cl_command_queue command_queue;
|
||||
typedef cl_command_queue command_queue;
|
||||
typedef cl_context context_type;
|
||||
|
||||
|
||||
/// Call clFinish() on the command queue; the result is checked by
/// CL_SAFE_CALL.
inline void ucl_sync(cl_command_queue &cq)
{
  CL_SAFE_CALL(clFinish(cq));
}
|
||||
|
@ -76,19 +76,19 @@ struct OCLProperties {
|
|||
|
||||
/// Class for looking at data parallel device properties
|
||||
/** \note Calls to change the device outside of the class result in incorrect
|
||||
* behavior
|
||||
* behavior
|
||||
* \note There is no error checking for indexing past the number of devices **/
|
||||
class UCL_Device {
|
||||
public:
|
||||
/// Collect properties for every device on the node
|
||||
/** \note You must set the active GPU with set() before using the device **/
|
||||
inline UCL_Device();
|
||||
|
||||
|
||||
inline ~UCL_Device();
|
||||
|
||||
/// Return the number of platforms (0 if error or no platforms)
|
||||
inline int num_platforms() { return _num_platforms; }
|
||||
|
||||
|
||||
/// Return a string with name and info of the current platform
|
||||
inline std::string platform_name();
|
||||
|
||||
|
@ -104,38 +104,38 @@ class UCL_Device {
|
|||
* be allocated for use. clear() is called to delete any contexts and
|
||||
* associated data from previous calls to set(). **/
|
||||
inline int set(int num);
|
||||
|
||||
|
||||
/// Delete any context and associated data stored from a call to set()
|
||||
inline void clear();
|
||||
|
||||
/// Get the current device number
|
||||
inline int device_num() { return _device; }
|
||||
|
||||
|
||||
/// Returns the context for the current device
|
||||
inline cl_context & context() { return _context; }
|
||||
|
||||
|
||||
/// Returns the default stream for the current device
|
||||
inline command_queue & cq() { return cq(_default_cq); }
|
||||
|
||||
|
||||
/// Returns the stream indexed by i
|
||||
inline command_queue & cq(const int i) { return _cq[i]; }
|
||||
|
||||
|
||||
/// Set the default command queue
|
||||
/** \param i index of the command queue (as added by push_command_queue())
|
||||
/** \param i index of the command queue (as added by push_command_queue())
|
||||
If i is 0, the command queue created with device initialization is
|
||||
used **/
|
||||
inline void set_command_queue(const int i) { _default_cq=i; }
|
||||
|
||||
|
||||
/// Block until all commands in the default stream have completed
|
||||
inline void sync() { sync(_default_cq); }
|
||||
|
||||
|
||||
/// Block until all commands in the specified stream have completed
|
||||
inline void sync(const int i) { ucl_sync(cq(i)); }
|
||||
|
||||
|
||||
/// Get the number of command queues currently available on device
|
||||
inline int num_queues()
|
||||
inline int num_queues()
|
||||
{ return _cq.size(); }
|
||||
|
||||
|
||||
/// Add a command queue for device computations (with profiling enabled)
|
||||
inline void push_command_queue() {
|
||||
cl_int errorv;
|
||||
|
@ -143,7 +143,7 @@ class UCL_Device {
|
|||
_cq.back()=clCreateCommandQueue(_context,_cl_device,
|
||||
CL_QUEUE_PROFILING_ENABLE,&errorv);
|
||||
if (errorv!=CL_SUCCESS) {
|
||||
std::cerr << "Could not create command queue on device: " << name()
|
||||
std::cerr << "Could not create command queue on device: " << name()
|
||||
<< std::endl;
|
||||
UCL_GERYON_EXIT;
|
||||
}
|
||||
|
@ -160,76 +160,76 @@ class UCL_Device {
|
|||
/// Get the current OpenCL device name
|
||||
inline std::string name() { return name(_device); }
|
||||
/// Get the OpenCL device name
|
||||
inline std::string name(const int i)
|
||||
inline std::string name(const int i)
|
||||
{ return std::string(_properties[i].name); }
|
||||
|
||||
/// Get a string telling the type of the current device
|
||||
inline std::string device_type_name() { return device_type_name(_device); }
|
||||
/// Get a string telling the type of the device
|
||||
inline std::string device_type_name(const int i);
|
||||
|
||||
|
||||
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
|
||||
inline int device_type() { return device_type(_device); }
|
||||
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
|
||||
inline int device_type(const int i);
|
||||
|
||||
|
||||
/// Returns true if host memory is efficiently addressable from device
|
||||
inline bool shared_memory() { return shared_memory(_device); }
|
||||
/// Returns true if host memory is efficiently addressable from device
|
||||
inline bool shared_memory(const int i)
|
||||
inline bool shared_memory(const int i)
|
||||
{ return _shared_mem_device(_properties[i].device_type); }
|
||||
|
||||
|
||||
/// Returns true if double precision is support for the current device
|
||||
inline bool double_precision() { return double_precision(_device); }
|
||||
/// Returns true if double precision is support for the device
|
||||
inline bool double_precision(const int i)
|
||||
inline bool double_precision(const int i)
|
||||
{return _properties[i].double_precision;}
|
||||
|
||||
|
||||
/// Get the number of compute units on the current device
|
||||
inline unsigned cus() { return cus(_device); }
|
||||
/// Get the number of compute units
|
||||
inline unsigned cus(const int i)
|
||||
inline unsigned cus(const int i)
|
||||
{ return _properties[i].compute_units; }
|
||||
|
||||
/// Get the gigabytes of global memory in the current device
|
||||
inline double gigabytes() { return gigabytes(_device); }
|
||||
/// Get the gigabytes of global memory
|
||||
inline double gigabytes(const int i)
|
||||
inline double gigabytes(const int i)
|
||||
{ return static_cast<double>(_properties[i].global_mem)/1073741824; }
|
||||
|
||||
/// Get the bytes of global memory in the current device
|
||||
inline size_t bytes() { return bytes(_device); }
|
||||
/// Get the bytes of global memory
|
||||
inline size_t bytes(const int i) { return _properties[i].global_mem; }
|
||||
|
||||
|
||||
/// Return the GPGPU revision number for current device
|
||||
//inline double revision() { return revision(_device); }
|
||||
/// Return the GPGPU revision number
|
||||
//inline double revision(const int i)
|
||||
//inline double revision(const int i)
|
||||
// { return //static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
|
||||
|
||||
|
||||
/// Clock rate in GHz for current device
|
||||
inline double clock_rate() { return clock_rate(_device); }
|
||||
/// Clock rate in GHz
|
||||
inline double clock_rate(const int i) { return _properties[i].clock*1e-3;}
|
||||
|
||||
|
||||
/// Return the address alignment in bytes
|
||||
inline int alignment() { return alignment(_device); }
|
||||
/// Return the address alignment in bytes
|
||||
inline int alignment(const int i) { return _properties[i].alignment; }
|
||||
|
||||
|
||||
/// Return the timer resolution
|
||||
inline size_t timer_resolution() { return timer_resolution(_device); }
|
||||
/// Return the timer resolution
|
||||
inline size_t timer_resolution(const int i)
|
||||
inline size_t timer_resolution(const int i)
|
||||
{ return _properties[i].timer_resolution; }
|
||||
|
||||
|
||||
/// Get the maximum number of threads per block
|
||||
inline size_t group_size() { return group_size(_device); }
|
||||
/// Get the maximum number of threads per block
|
||||
inline size_t group_size(const int i)
|
||||
inline size_t group_size(const int i)
|
||||
{ return _properties[i].work_group_size; }
|
||||
|
||||
|
||||
/// Return the maximum memory pitch in bytes for current device
|
||||
inline size_t max_pitch() { return max_pitch(_device); }
|
||||
/// Return the maximum memory pitch in bytes
|
||||
|
@ -254,7 +254,7 @@ class UCL_Device {
|
|||
{ return fission_by_counts(_device); }
|
||||
/// True if splitting device into subdevices by specified counts supported
|
||||
inline bool fission_by_counts(const int i)
|
||||
{ return _properties[i].partition_counts; }
|
||||
{ return _properties[i].partition_counts; }
|
||||
/// True if splitting device into subdevices by affinity domains supported
|
||||
inline bool fission_by_affinity()
|
||||
{ return fission_by_affinity(_device); }
|
||||
|
@ -271,10 +271,10 @@ class UCL_Device {
|
|||
|
||||
/// List all devices along with all properties
|
||||
inline void print_all(std::ostream &out);
|
||||
|
||||
|
||||
/// Return the OpenCL type for the device
|
||||
inline cl_device_id & cl_device() { return _cl_device; }
|
||||
|
||||
|
||||
private:
|
||||
int _num_platforms; // Number of platforms
|
||||
int _platform; // UCL_Device ID for current platform
|
||||
|
@ -287,7 +287,7 @@ class UCL_Device {
|
|||
std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices
|
||||
int _num_devices; // Number of devices
|
||||
std::vector<OCLProperties> _properties; // Properties for each device
|
||||
|
||||
|
||||
inline void add_properties(cl_device_id);
|
||||
inline int create_context();
|
||||
int _default_cq;
|
||||
|
@ -300,7 +300,7 @@ UCL_Device::UCL_Device() {
|
|||
// --- Get Number of Platforms
|
||||
cl_uint nplatforms;
|
||||
cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms);
|
||||
|
||||
|
||||
if (errorv!=CL_SUCCESS) {
|
||||
_num_platforms=0;
|
||||
return;
|
||||
|
@ -328,18 +328,18 @@ void UCL_Device::clear() {
|
|||
int UCL_Device::set_platform(int pid) {
|
||||
clear();
|
||||
cl_int errorv;
|
||||
|
||||
|
||||
_cl_device=0;
|
||||
_device=-1;
|
||||
_num_devices=0;
|
||||
_default_cq=0;
|
||||
|
||||
|
||||
#ifdef UCL_DEBUG
|
||||
assert(pid<num_platforms());
|
||||
#endif
|
||||
_platform=pid;
|
||||
_cl_platform=_cl_platforms[_platform];
|
||||
|
||||
|
||||
// --- Get Number of Devices
|
||||
cl_uint n;
|
||||
errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n);
|
||||
|
@ -351,7 +351,7 @@ int UCL_Device::set_platform(int pid) {
|
|||
cl_device_id device_list[_num_devices];
|
||||
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
|
||||
&n));
|
||||
|
||||
|
||||
// --- Store properties for each device
|
||||
for (int i=0; i<_num_devices; i++) {
|
||||
_cl_devices.push_back(device_list[i]);
|
||||
|
@ -385,7 +385,7 @@ void UCL_Device::add_properties(cl_device_id device_list) {
|
|||
OCLProperties op;
|
||||
char buffer[1024];
|
||||
cl_bool ans_bool;
|
||||
|
||||
|
||||
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
|
||||
op.name=buffer;
|
||||
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE,
|
||||
|
@ -409,8 +409,8 @@ void UCL_Device::add_properties(cl_device_id device_list) {
|
|||
NULL));
|
||||
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN,
|
||||
sizeof(cl_uint),&op.alignment,NULL));
|
||||
op.alignment/=8;
|
||||
|
||||
op.alignment/=8;
|
||||
|
||||
// Determine if double precision is supported
|
||||
cl_uint double_width;
|
||||
CL_SAFE_CALL(clGetDeviceInfo(device_list,
|
||||
|
@ -420,11 +420,11 @@ void UCL_Device::add_properties(cl_device_id device_list) {
|
|||
op.double_precision=false;
|
||||
else
|
||||
op.double_precision=true;
|
||||
|
||||
|
||||
CL_SAFE_CALL(clGetDeviceInfo(device_list,
|
||||
CL_DEVICE_PROFILING_TIMER_RESOLUTION,
|
||||
sizeof(size_t),&op.timer_resolution,NULL));
|
||||
|
||||
|
||||
|
||||
op.ecc_support=false;
|
||||
CL_SAFE_CALL(clGetDeviceInfo(device_list,
|
||||
|
@ -432,7 +432,7 @@ void UCL_Device::add_properties(cl_device_id device_list) {
|
|||
sizeof(ans_bool),&ans_bool,NULL));
|
||||
if (ans_bool==CL_TRUE)
|
||||
op.ecc_support=true;
|
||||
|
||||
|
||||
op.c_version="";
|
||||
op.partition_equal=false;
|
||||
op.partition_counts=false;
|
||||
|
@ -458,30 +458,30 @@ void UCL_Device::add_properties(cl_device_id device_list) {
|
|||
else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN)
|
||||
op.partition_affinity=true;
|
||||
}
|
||||
|
||||
|
||||
CL_SAFE_CALL(clGetDeviceInfo(device_list,
|
||||
CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
|
||||
sizeof(cl_uint),&op.max_sub_devices,NULL));
|
||||
#endif
|
||||
|
||||
|
||||
_properties.push_back(op);
|
||||
}
|
||||
|
||||
std::string UCL_Device::platform_name() {
|
||||
char info[1024];
|
||||
|
||||
|
||||
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
|
||||
NULL));
|
||||
std::string ans=std::string(info)+' ';
|
||||
|
||||
|
||||
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info,
|
||||
NULL));
|
||||
ans+=std::string(info)+' ';
|
||||
|
||||
|
||||
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info,
|
||||
NULL));
|
||||
ans+=std::string(info);
|
||||
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
|
@ -512,7 +512,7 @@ int UCL_Device::device_type(const int i) {
|
|||
// Set the OpenCL device to the specified device number
|
||||
int UCL_Device::set(int num) {
|
||||
clear();
|
||||
|
||||
|
||||
cl_device_id device_list[_num_devices];
|
||||
cl_uint n;
|
||||
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
|
||||
|
@ -557,7 +557,7 @@ void UCL_Device::print_all(std::ostream &out) {
|
|||
<< _properties[i].work_item_size[1] << " x "
|
||||
<< _properties[i].work_item_size[2] << std::endl;
|
||||
//out << " Maximum sizes of each dimension of a grid: "
|
||||
// << _properties[i].maxGridSize[0] << " x "
|
||||
// << _properties[i].maxGridSize[0] << " x "
|
||||
// << _properties[i].maxGridSize[1] << " x "
|
||||
// << _properties[i].maxGridSize[2] << std::endl;
|
||||
//out << " Maximum memory pitch: "
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -28,7 +28,7 @@
|
|||
#include <fstream>
|
||||
|
||||
namespace ucl_opencl {
|
||||
|
||||
|
||||
class UCL_Texture;
|
||||
template <class numtyp> class UCL_D_Vec;
|
||||
template <class numtyp> class UCL_D_Mat;
|
||||
|
@ -41,10 +41,10 @@ class UCL_Program {
|
|||
public:
|
||||
inline UCL_Program() : _init_done(false) {}
|
||||
inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
|
||||
/// Attach to a device and immediately compile the supplied program
/** Calls init(device) followed by load_string(program,flags,log); the
  * value returned by load_string is not inspected here. **/
inline UCL_Program(UCL_Device &device, const void *program,
                   const char *flags="", std::string *log=NULL) :
    _init_done(false) {
  init(device);
  load_string(program,flags,log);
}
|
||||
|
||||
|
@ -56,7 +56,7 @@ class UCL_Program {
|
|||
_device=device.cl_device();
|
||||
_context=device.context();
|
||||
_cq=device.cq();
|
||||
CL_SAFE_CALL(clRetainContext(_context));
|
||||
CL_SAFE_CALL(clRetainContext(_context));
|
||||
CL_SAFE_CALL(clRetainCommandQueue(_cq));
|
||||
_init_done=true;
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ class UCL_Program {
|
|||
/** \note Must call init() after each clear **/
|
||||
inline void clear() {
|
||||
if (_init_done) {
|
||||
CL_DESTRUCT_CALL(clReleaseProgram(_program));
|
||||
CL_DESTRUCT_CALL(clReleaseProgram(_program));
|
||||
CL_DESTRUCT_CALL(clReleaseContext(_context));
|
||||
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
|
||||
_init_done=false;
|
||||
|
@ -77,20 +77,20 @@ class UCL_Program {
|
|||
std::string *log=NULL) {
|
||||
std::ifstream in(filename);
|
||||
if (!in || in.is_open()==false) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not open kernel file: "
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not open kernel file: "
|
||||
<< filename << std::endl;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
return UCL_FILE_NOT_FOUND;
|
||||
}
|
||||
|
||||
|
||||
std::string program((std::istreambuf_iterator<char>(in)),
|
||||
std::istreambuf_iterator<char>());
|
||||
in.close();
|
||||
return load_string(program.c_str(),flags,log);
|
||||
}
|
||||
|
||||
|
||||
/// Load a program from a string and compile with flags
|
||||
inline int load_string(const void *program, const char *flags="",
|
||||
std::string *log=NULL) {
|
||||
|
@ -103,23 +103,23 @@ class UCL_Program {
|
|||
CL_CHECK_ERR(error_flag);
|
||||
cl_build_status build_status;
|
||||
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,
|
||||
CL_PROGRAM_BUILD_STATUS,
|
||||
CL_PROGRAM_BUILD_STATUS,
|
||||
sizeof(cl_build_status),&build_status,
|
||||
NULL));
|
||||
|
||||
|
||||
if (build_status != CL_SUCCESS || log!=NULL) {
|
||||
size_t ms;
|
||||
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
|
||||
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
|
||||
NULL, &ms));
|
||||
char build_log[ms];
|
||||
char build_log[ms];
|
||||
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms,
|
||||
build_log, NULL));
|
||||
|
||||
|
||||
if (log!=NULL)
|
||||
*log=std::string(build_log);
|
||||
|
||||
|
||||
if (build_status != CL_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << std::endl
|
||||
<< "----------------------------------------------------------\n"
|
||||
<< " UCL Error: Error compiling OpenCL Program ("
|
||||
|
@ -130,10 +130,10 @@ class UCL_Program {
|
|||
return UCL_COMPILE_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/// Return the default command queue/stream associated with this data
|
||||
inline command_queue & cq() { return _cq; }
|
||||
/// Change the default command queue associated with matrix
|
||||
|
@ -143,7 +143,7 @@ class UCL_Program {
|
|||
private:
|
||||
bool _init_done;
|
||||
cl_program _program;
|
||||
cl_device_id _device;
|
||||
cl_device_id _device;
|
||||
cl_context _context;
|
||||
cl_command_queue _cq;
|
||||
};
|
||||
|
@ -153,7 +153,7 @@ class UCL_Kernel {
|
|||
public:
|
||||
UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0)
|
||||
{ _block_size[0]=0; _num_blocks[0]=0; }
|
||||
|
||||
|
||||
inline UCL_Kernel(UCL_Program &program, const char *function) :
|
||||
_dimensions(1), _function_set(false), _num_args(0)
|
||||
{ _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); }
|
||||
|
@ -178,48 +178,48 @@ class UCL_Kernel {
|
|||
/** If not a device pointer, this must be repeated each time the argument
|
||||
* changes **/
|
||||
template <class dtype>
|
||||
inline void set_arg(const cl_uint index, const dtype * const arg) {
|
||||
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
|
||||
inline void set_arg(const cl_uint index, const dtype * const arg) {
|
||||
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
|
||||
if (index>_num_args) {
|
||||
_num_args=index;
|
||||
#ifdef UCL_DEBUG
|
||||
if (_num_args>_kernel_info_nargs) {
|
||||
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
|
||||
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
|
||||
<< _kernel_info_name << std::endl;
|
||||
assert(0==1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a kernel argument.
|
||||
template <class dtype>
|
||||
inline void add_arg(const dtype * const arg) {
|
||||
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
|
||||
_num_args++;
|
||||
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
|
||||
_num_args++;
|
||||
#ifdef UCL_DEBUG
|
||||
if (_num_args>_kernel_info_nargs) {
|
||||
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
|
||||
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
|
||||
<< _kernel_info_name << std::endl;
|
||||
assert(0==1);
|
||||
}
|
||||
|
@ -228,31 +228,31 @@ class UCL_Kernel {
|
|||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks, const size_t block_size) {
  _dimensions=1;
  _block_size[0]=block_size;
  // _num_blocks[0] stores the product num_blocks*block_size, not the raw
  // block count.
  _num_blocks[0]=num_blocks*block_size;
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
|
@ -266,36 +266,36 @@ class UCL_Kernel {
|
|||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y) {
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x*block_size_x;
|
||||
_block_size[0]=block_size_x;
|
||||
_num_blocks[1]=num_blocks_y*block_size_y;
|
||||
_block_size[1]=block_size_y;
|
||||
const size_t block_size_x, const size_t block_size_y) {
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x*block_size_x;
|
||||
_block_size[0]=block_size_x;
|
||||
_num_blocks[1]=num_blocks_y*block_size_y;
|
||||
_block_size[1]=block_size_y;
|
||||
}
|
||||
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue for the kernel is changed to cq **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y,
|
||||
command_queue &cq)
|
||||
command_queue &cq)
|
||||
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x,
|
||||
const size_t block_size_x,
|
||||
const size_t block_size_y, const size_t block_size_z) {
|
||||
_dimensions=3;
|
||||
_dimensions=3;
|
||||
const size_t num_blocks_z=1;
|
||||
_num_blocks[0]=num_blocks_x*block_size_x;
|
||||
_block_size[0]=block_size_x;
|
||||
_num_blocks[1]=num_blocks_y*block_size_y;
|
||||
_block_size[1]=block_size_y;
|
||||
_num_blocks[2]=num_blocks_z*block_size_z;
|
||||
_block_size[2]=block_size_z;
|
||||
_num_blocks[0]=num_blocks_x*block_size_x;
|
||||
_block_size[0]=block_size_x;
|
||||
_num_blocks[1]=num_blocks_y*block_size_y;
|
||||
_block_size[1]=block_size_y;
|
||||
_num_blocks[2]=num_blocks_z*block_size_z;
|
||||
_block_size[2]=block_size_z;
|
||||
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
|
@ -305,13 +305,13 @@ class UCL_Kernel {
|
|||
const size_t block_size_x, const size_t block_size_y,
|
||||
const size_t block_size_z, command_queue &cq) {
|
||||
_cq=cq;
|
||||
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
|
||||
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
|
||||
block_size_z);
|
||||
}
|
||||
|
||||
|
||||
/// Run the kernel in the default command queue
|
||||
inline void run();
|
||||
|
||||
|
||||
/// Clear any arguments associated with the kernel
|
||||
inline void clear_args() { _num_args=0; }
|
||||
|
||||
|
@ -320,7 +320,7 @@ class UCL_Kernel {
|
|||
/// Change the default command queue associated with matrix
|
||||
inline void cq(command_queue &cq_in) { _cq=cq_in; }
|
||||
#include "ucl_arg_kludge.h"
|
||||
|
||||
|
||||
private:
|
||||
cl_kernel _kernel;
|
||||
cl_program _program;
|
||||
|
@ -328,7 +328,7 @@ class UCL_Kernel {
|
|||
size_t _block_size[3];
|
||||
size_t _num_blocks[3];
|
||||
bool _function_set;
|
||||
|
||||
|
||||
cl_command_queue _cq; // The default command queue for this kernel
|
||||
unsigned _num_args;
|
||||
|
||||
|
@ -348,7 +348,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
|
|||
CL_SAFE_CALL(clRetainProgram(_program));
|
||||
cl_int error_flag;
|
||||
_kernel=clCreateKernel(program._program,function,&error_flag);
|
||||
|
||||
|
||||
if (error_flag!=CL_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not find function: " << function
|
||||
|
@ -357,7 +357,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
|
|||
#endif
|
||||
return UCL_FUNCTION_NOT_FOUND;
|
||||
}
|
||||
|
||||
|
||||
#ifdef UCL_DEBUG
|
||||
_kernel_info_name=function;
|
||||
cl_uint nargs;
|
||||
|
@ -375,7 +375,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
|
|||
#endif
|
||||
#endif
|
||||
|
||||
return UCL_SUCCESS;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
void UCL_Kernel::run() {
|
||||
|
|
|
@ -17,12 +17,12 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
/*! \file */
|
||||
|
||||
|
||||
#ifndef OCL_MAT_H
|
||||
#define OCL_MAT_H
|
||||
|
||||
|
@ -54,6 +54,6 @@ namespace ucl_opencl {
|
|||
#include "ucl_print.h"
|
||||
#undef UCL_PRINT_ALLOW
|
||||
|
||||
} // namespace ucl_opencl
|
||||
} // namespace ucl_opencl
|
||||
|
||||
#endif
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -36,10 +36,10 @@ namespace ucl_opencl {
|
|||
// --------------------------------------------------------------------------
|
||||
/// Three-component (x,y,z) extent usable wherever a size_t array is expected
/** Every component defaults to 1. The conversion operators return the
  * address of the object itself reinterpreted as size_t*, so x, y and z are
  * read as elements 0, 1 and 2; this relies on the three members being laid
  * out contiguously with no padding. **/
struct ocl_kernel_dim {
  size_t x,y,z;
  ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) :
    x(_x), y(_y), z(_z) {}
  operator size_t * () { return reinterpret_cast<size_t *>(this); }
  operator const size_t * () const
    { return reinterpret_cast<const size_t *>(this); }
};
|
||||
typedef ocl_kernel_dim ucl_kernel_dim;
|
||||
|
||||
|
@ -53,13 +53,13 @@ typedef cl_mem device_ptr;
|
|||
// --------------------------------------------------------------------------
|
||||
|
||||
template <class mat_type, class copy_type>
|
||||
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
||||
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
||||
cl_int error_flag;
|
||||
cl_context context;
|
||||
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
||||
&context,NULL));
|
||||
|
||||
|
||||
cl_mem_flags buffer_perm;
|
||||
cl_map_flags map_perm;
|
||||
if (kind2==UCL_NOT_SPECIFIED) {
|
||||
|
@ -88,7 +88,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|||
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
|
||||
else
|
||||
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
|
||||
|
||||
|
||||
if (kind==UCL_READ_ONLY) {
|
||||
#ifdef CL_VERSION_1_2
|
||||
buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
|
||||
|
@ -102,9 +102,9 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|||
} else
|
||||
map_perm=CL_MAP_READ | CL_MAP_WRITE;
|
||||
}
|
||||
|
||||
|
||||
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
*mat.host_ptr() = (typename mat_type::data_type*)
|
||||
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
|
||||
|
@ -125,7 +125,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
|
|||
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
|
||||
&orig_flags,NULL));
|
||||
orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
|
||||
|
||||
|
||||
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
|
||||
*mat.host_ptr(), &error_flag);
|
||||
|
||||
|
@ -135,7 +135,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
|
|||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
||||
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
||||
cl_mem_flags buffer_perm;
|
||||
cl_map_flags map_perm;
|
||||
|
@ -160,7 +160,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|||
|
||||
cl_int error_flag;
|
||||
mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
|
||||
*mat.host_ptr() = (typename mat_type::data_type*)
|
||||
|
@ -210,7 +210,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
|
|||
map_perm=CL_MAP_READ | CL_MAP_WRITE;
|
||||
|
||||
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
*mat.host_ptr() = (typename mat_type::data_type*)
|
||||
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
|
||||
|
@ -248,7 +248,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|||
else
|
||||
assert(0==1);
|
||||
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=cm.cq();
|
||||
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
|
||||
|
@ -278,7 +278,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|||
assert(0==1);
|
||||
mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
|
||||
&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=dev.cq();
|
||||
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
|
||||
|
@ -304,7 +304,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
|
|||
if (dev.device_type()!=UCL_CPU && cols%256!=0)
|
||||
padded_cols+=256-cols%256;
|
||||
pitch=padded_cols*sizeof(typename mat_type::data_type);
|
||||
return _device_alloc(mat,dev,pitch*rows,kind);
|
||||
return _device_alloc(mat,dev,pitch*rows,kind);
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
|
@ -342,7 +342,7 @@ inline int _device_resize(mat_type &mat, const size_t n) {
|
|||
else
|
||||
assert(0==1);
|
||||
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
@ -380,7 +380,7 @@ inline int _device_resize(mat_type &mat, const size_t rows,
|
|||
else
|
||||
assert(0==1);
|
||||
mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
@ -396,21 +396,21 @@ inline void _host_zero(void *ptr, const size_t n) {
|
|||
inline void _ocl_build(cl_program &program, cl_device_id &device,
|
||||
const char* options = "") {
|
||||
clBuildProgram(program,1,&device,options,NULL,NULL);
|
||||
|
||||
|
||||
cl_build_status build_status;
|
||||
CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
|
||||
CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
|
||||
sizeof(cl_build_status),&build_status,
|
||||
NULL));
|
||||
if (build_status == CL_SUCCESS)
|
||||
return;
|
||||
|
||||
|
||||
size_t ms;
|
||||
CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
|
||||
CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
|
||||
NULL, &ms));
|
||||
char build_log[ms];
|
||||
char build_log[ms];
|
||||
CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms,
|
||||
build_log, NULL));
|
||||
|
||||
|
||||
std::cerr << std::endl
|
||||
<< "----------------------------------------------------------\n"
|
||||
<< " Error compiling OpenCL Program...\n"
|
||||
|
@ -423,13 +423,13 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
|
|||
cl_kernel &kernel, const char *function,
|
||||
const char *options="") {
|
||||
cl_int error_flag;
|
||||
|
||||
|
||||
cl_program program=clCreateProgramWithSource(context,lines,source,
|
||||
NULL,&error_flag);
|
||||
CL_CHECK_ERR(error_flag);
|
||||
CL_CHECK_ERR(error_flag);
|
||||
_ocl_build(program,device,options);
|
||||
kernel=clCreateKernel(program,function,&error_flag);
|
||||
CL_CHECK_ERR(error_flag);
|
||||
CL_CHECK_ERR(error_flag);
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
|
@ -452,17 +452,17 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
|
|||
cl_device_id device;
|
||||
CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES,
|
||||
sizeof(cl_device_id),&device,NULL));
|
||||
|
||||
|
||||
const char * szero[3]={
|
||||
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void _device_zero(__global NUMTYP *a, const int offset)",
|
||||
" { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
|
||||
};
|
||||
|
||||
|
||||
cl_kernel kzero;
|
||||
_ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
|
||||
_UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());
|
||||
|
||||
|
||||
cl_int offset=mat.offset();
|
||||
CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
|
||||
CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
|
||||
|
@ -486,7 +486,7 @@ template<> struct _ucl_memcpy<2,2> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const cl_bool block,
|
||||
|
@ -504,7 +504,7 @@ template<> struct _ucl_memcpy<2,0> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const cl_bool block,
|
||||
|
@ -522,7 +522,7 @@ template<> struct _ucl_memcpy<2,1> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const cl_bool block,
|
||||
|
@ -540,7 +540,7 @@ template<> struct _ucl_memcpy<0,2> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const cl_bool block,
|
||||
|
@ -558,7 +558,7 @@ template<> struct _ucl_memcpy<1,2> {
|
|||
assert(0==1);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const cl_bool block,
|
||||
|
@ -587,9 +587,9 @@ template <> struct _ucl_memcpy<1,0> {
|
|||
dst.begin(),0,NULL,NULL));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const cl_bool block,
|
||||
size_t dst_offset, size_t src_offset) {
|
||||
if (src.cbegin()==dst.cbegin()) {
|
||||
|
@ -602,20 +602,20 @@ template <> struct _ucl_memcpy<1,0> {
|
|||
#ifdef UCL_DBG_MEM_TRACE
|
||||
std::cerr << "UCL_COPY 2NS\n";
|
||||
#endif
|
||||
if (spitch==dpitch && dst.cols()==src.cols() &&
|
||||
if (spitch==dpitch && dst.cols()==src.cols() &&
|
||||
src.cols()==cols/src.element_size())
|
||||
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
|
||||
spitch*rows,
|
||||
(char *)dst.begin()+dst_offset,0,NULL,
|
||||
NULL));
|
||||
else
|
||||
for (size_t i=0; i<rows; i++) {
|
||||
for (size_t i=0; i<rows; i++) {
|
||||
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,cols,
|
||||
(char *)dst.begin()+dst_offset,0,NULL,
|
||||
NULL));
|
||||
src_offset+=spitch;
|
||||
dst_offset+=dpitch;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -630,7 +630,7 @@ template <> struct _ucl_memcpy<0,1> {
|
|||
#ifdef UCL_DBG_MEM_TRACE
|
||||
std::cerr << "UCL_COPY 3S\n";
|
||||
#endif
|
||||
return;
|
||||
return;
|
||||
}
|
||||
#ifdef UCL_DBG_MEM_TRACE
|
||||
std::cerr << "UCL_COPY 3NS\n";
|
||||
|
@ -639,9 +639,9 @@ template <> struct _ucl_memcpy<0,1> {
|
|||
src.begin(),0,NULL,NULL));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const cl_bool block,
|
||||
size_t dst_offset, size_t src_offset) {
|
||||
if (src.cbegin()==dst.cbegin()) {
|
||||
|
@ -649,12 +649,12 @@ template <> struct _ucl_memcpy<0,1> {
|
|||
#ifdef UCL_DBG_MEM_TRACE
|
||||
std::cerr << "UCL_COPY 4S\n";
|
||||
#endif
|
||||
return;
|
||||
return;
|
||||
}
|
||||
#ifdef UCL_DBG_MEM_TRACE
|
||||
std::cerr << "UCL_COPY 4NS\n";
|
||||
#endif
|
||||
if (spitch==dpitch && dst.cols()==src.cols() &&
|
||||
if (spitch==dpitch && dst.cols()==src.cols() &&
|
||||
src.cols()==cols/src.element_size())
|
||||
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
|
||||
spitch*rows,
|
||||
|
@ -667,7 +667,7 @@ template <> struct _ucl_memcpy<0,1> {
|
|||
NULL));
|
||||
src_offset+=spitch;
|
||||
dst_offset+=dpitch;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -687,33 +687,33 @@ template <int mem1, int mem2> struct _ucl_memcpy {
|
|||
#ifdef UCL_DBG_MEM_TRACE
|
||||
else std::cerr << "UCL_COPY 6S\n";
|
||||
#endif
|
||||
|
||||
|
||||
if (block==CL_TRUE) ucl_sync(cq);
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows, cl_command_queue &cq,
|
||||
const cl_bool block,
|
||||
size_t dst_offset, size_t src_offset) {
|
||||
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
|
||||
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
|
||||
#ifdef UCL_DBG_MEM_TRACE
|
||||
std::cerr << "UCL_COPY 7NS\n";
|
||||
#endif
|
||||
if (spitch==dpitch && dst.cols()==src.cols() &&
|
||||
if (spitch==dpitch && dst.cols()==src.cols() &&
|
||||
src.cols()==cols/src.element_size())
|
||||
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
|
||||
dst_offset,spitch*rows,0,NULL,NULL));
|
||||
|
||||
|
||||
else
|
||||
for (size_t i=0; i<rows; i++) {
|
||||
for (size_t i=0; i<rows; i++) {
|
||||
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),
|
||||
src_offset,dst_offset,cols,0,
|
||||
NULL,NULL));
|
||||
src_offset+=spitch;
|
||||
dst_offset+=dpitch;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef UCL_DBG_MEM_TRACE
|
||||
else std::cerr << "UCL_COPY 7S\n";
|
||||
#endif
|
||||
|
@ -736,8 +736,8 @@ inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
|
|||
}
|
||||
|
||||
template<class mat1, class mat2>
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows) {
|
||||
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
|
||||
rows,dst.cq(),CL_TRUE,
|
||||
|
@ -745,15 +745,15 @@ inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
|||
}
|
||||
|
||||
template<class mat1, class mat2>
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
||||
const size_t spitch, const size_t cols,
|
||||
const size_t rows,cl_command_queue &cq) {
|
||||
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
|
||||
rows,cq,CL_FALSE,
|
||||
dst.byteoff(),src.byteoff());
|
||||
}
|
||||
|
||||
} // namespace ucl_opencl
|
||||
} // namespace ucl_opencl
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -28,7 +28,7 @@
|
|||
#include "ocl_mat.h"
|
||||
|
||||
namespace ucl_opencl {
|
||||
|
||||
|
||||
/// Class storing a texture reference
|
||||
class UCL_Texture {
|
||||
public:
|
||||
|
@ -46,9 +46,9 @@ class UCL_Texture {
|
|||
/// Unbind the texture reference from the memory allocation
|
||||
inline void unbind() { }
|
||||
|
||||
/// Make a texture reference available to kernel
|
||||
/// Make a texture reference available to kernel
|
||||
inline void allow(UCL_Kernel &kernel) { }
|
||||
|
||||
|
||||
private:
|
||||
friend class UCL_Kernel;
|
||||
};
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -67,33 +67,33 @@ class UCL_Timer {
|
|||
clRetainCommandQueue(_cq);
|
||||
_initialized=true;
|
||||
}
|
||||
|
||||
|
||||
/// Start timing on default command queue
|
||||
inline void start() { UCL_OCL_MARKER(_cq,&start_event); }
|
||||
|
||||
|
||||
/// Stop timing on default command queue
|
||||
inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); }
|
||||
|
||||
|
||||
/// Block until the start event has been reached on device
|
||||
inline void sync_start()
|
||||
inline void sync_start()
|
||||
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
|
||||
|
||||
/// Block until the stop event has been reached on device
|
||||
inline void sync_stop()
|
||||
inline void sync_stop()
|
||||
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
|
||||
|
||||
/// Set the time elapsed to zero (not the total_time)
|
||||
inline void zero()
|
||||
{ UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
|
||||
|
||||
inline void zero()
|
||||
{ UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
|
||||
|
||||
/// Set the total time to zero
|
||||
inline void zero_total() { _total_time=0.0; }
|
||||
|
||||
|
||||
/// Add time from previous start and stop to total
|
||||
/** Forces synchronization **/
|
||||
inline double add_to_total()
|
||||
inline double add_to_total()
|
||||
{ double t=time(); _total_time+=t; return t/1000.0; }
|
||||
|
||||
|
||||
/// Add a user specified time to the total (ms)
|
||||
inline void add_time_to_total(const double t) { _total_time+=t; }
|
||||
|
||||
|
@ -107,12 +107,12 @@ class UCL_Timer {
|
|||
CL_SAFE_CALL(clGetEventProfilingInfo(start_event,
|
||||
CL_PROFILING_COMMAND_END,
|
||||
sizeof(cl_ulong), &tstart, NULL));
|
||||
return (tend-tstart)*t_factor;
|
||||
return (tend-tstart)*t_factor;
|
||||
}
|
||||
|
||||
|
||||
/// Return the time (s) of last start to stop - Forces synchronization
|
||||
inline double seconds() { return time()/1000.0; }
|
||||
|
||||
|
||||
/// Return the total time in ms
|
||||
inline double total_time() { return _total_time; }
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -38,47 +38,47 @@
|
|||
|
||||
/// Append five kernel arguments
/** Each pointer is handed to add_arg in order, a1 first and a5 last. **/
template <class t1, class t2, class t3, class t4, class t5>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
  add_arg(a1);
  add_arg(a2);
  add_arg(a3);
  add_arg(a4);
  add_arg(a5);
}
|
||||
|
||||
/// Append six kernel arguments
/** Each pointer is handed to add_arg in order, a1 first and a6 last. **/
template <class t1, class t2, class t3, class t4, class t5,
          class t6>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6) {
  add_arg(a1);
  add_arg(a2);
  add_arg(a3);
  add_arg(a4);
  add_arg(a5);
  add_arg(a6);
}
|
||||
|
||||
/// Append seven kernel arguments
/** Each pointer is handed to add_arg in order, a1 first and a7 last. **/
template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7) {
  add_arg(a1);
  add_arg(a2);
  add_arg(a3);
  add_arg(a4);
  add_arg(a5);
  add_arg(a6);
  add_arg(a7);
}
|
||||
|
||||
/// Append eight kernel arguments
/** Each pointer is handed to add_arg in order, a1 first and a8 last. **/
template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8) {
  add_arg(a1);
  add_arg(a2);
  add_arg(a3);
  add_arg(a4);
  add_arg(a5);
  add_arg(a6);
  add_arg(a7);
  add_arg(a8);
}
|
||||
|
||||
/// Append nine kernel arguments
/** Each pointer is handed to add_arg in order, a1 first and a9 last. **/
template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
  add_arg(a1);
  add_arg(a2);
  add_arg(a3);
  add_arg(a4);
  add_arg(a5);
  add_arg(a6);
  add_arg(a7);
  add_arg(a8);
  add_arg(a9);
}
|
||||
|
||||
/// Append ten kernel arguments
/** Each pointer is handed to add_arg in order, a1 first and a10 last. **/
template <class t1, class t2, class t3, class t4, class t5,
          class t6, class t7, class t8, class t9, class t10>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
  add_arg(a1);
  add_arg(a2);
  add_arg(a3);
  add_arg(a4);
  add_arg(a5);
  add_arg(a6);
  add_arg(a7);
  add_arg(a8);
  add_arg(a9);
  add_arg(a10);
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -87,9 +87,9 @@
|
|||
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -98,8 +98,8 @@
|
|||
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12);
|
||||
}
|
||||
|
||||
|
@ -109,9 +109,9 @@
|
|||
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -120,9 +120,9 @@
|
|||
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -131,9 +131,9 @@
|
|||
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -144,10 +144,10 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -158,10 +158,10 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -172,10 +172,10 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -186,10 +186,10 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -200,10 +200,10 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -216,10 +216,10 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21);
|
||||
}
|
||||
|
||||
|
@ -233,10 +233,10 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22);
|
||||
}
|
||||
|
||||
|
@ -250,10 +250,10 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23);
|
||||
}
|
||||
|
||||
|
@ -267,10 +267,10 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
|
||||
}
|
||||
|
||||
|
@ -284,11 +284,11 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
|
@ -303,11 +303,11 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26);
|
||||
}
|
||||
|
||||
|
@ -323,11 +323,11 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27);
|
||||
}
|
||||
|
||||
|
@ -343,11 +343,11 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27, t28 *a28) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28);
|
||||
}
|
||||
|
||||
|
@ -363,11 +363,11 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
|
||||
}
|
||||
|
||||
|
@ -383,12 +383,12 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
|
||||
}
|
||||
|
||||
|
||||
|
@ -425,7 +425,7 @@
|
|||
template <class t1, class t2, class t3, class t4, class t5>
|
||||
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -434,8 +434,8 @@
|
|||
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -444,8 +444,8 @@
|
|||
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -454,8 +454,8 @@
|
|||
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -464,8 +464,8 @@
|
|||
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -474,8 +474,8 @@
|
|||
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -486,9 +486,9 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -499,8 +499,8 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12);
|
||||
run();
|
||||
}
|
||||
|
@ -512,9 +512,9 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -525,9 +525,9 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -538,9 +538,9 @@
|
|||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -553,10 +553,10 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -569,10 +569,10 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -585,10 +585,10 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -601,10 +601,10 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -617,10 +617,10 @@
|
|||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -635,10 +635,10 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21);
|
||||
run();
|
||||
}
|
||||
|
@ -654,10 +654,10 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22);
|
||||
run();
|
||||
}
|
||||
|
@ -673,10 +673,10 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23);
|
||||
run();
|
||||
}
|
||||
|
@ -692,10 +692,10 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
|
||||
run();
|
||||
}
|
||||
|
@ -711,11 +711,11 @@
|
|||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -732,11 +732,11 @@
|
|||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26);
|
||||
run();
|
||||
}
|
||||
|
@ -754,11 +754,11 @@
|
|||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27);
|
||||
run();
|
||||
}
|
||||
|
@ -776,12 +776,12 @@
|
|||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27, t28 *a28) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28);
|
||||
run();
|
||||
}
|
||||
|
||||
|
@ -798,11 +798,11 @@
|
|||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
|
||||
run();
|
||||
}
|
||||
|
@ -820,11 +820,11 @@
|
|||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
|
||||
run();
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -52,10 +52,10 @@
|
|||
/// Base class for vector/matrix containers
|
||||
/** All containers are associated with a default command queue.
|
||||
* For CUDA, this is the default stream.
|
||||
*
|
||||
* The default queue is used for asynchronous operations on the container
|
||||
*
|
||||
* The default queue is used for asynchronous operations on the container
|
||||
* that do not specify a queue. For OpenCL, this queue is also used in
|
||||
* calls for reserving and copying memory **/
|
||||
* calls for reserving and copying memory **/
|
||||
class UCL_BaseMat {
|
||||
public:
|
||||
UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { }
|
||||
|
@ -68,8 +68,8 @@ class UCL_BaseMat {
|
|||
inline void sync() { ucl_sync(_cq); }
|
||||
/// Return the type/permissions of memory allocation
|
||||
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
|
||||
* or UCL_VIEW **/
|
||||
inline enum UCL_MEMOPT kind() const { return _kind; }
|
||||
* or UCL_VIEW **/
|
||||
inline enum UCL_MEMOPT kind() const { return _kind; }
|
||||
|
||||
inline bool shared_mem_device() {
|
||||
#ifdef _OCL_MAT
|
||||
|
@ -79,12 +79,12 @@ class UCL_BaseMat {
|
|||
cl_device_type device_type;
|
||||
CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
|
||||
sizeof(device_type),&device_type,NULL));
|
||||
return _shared_mem_device(device_type);
|
||||
return _shared_mem_device(device_type);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
command_queue _cq;
|
||||
enum UCL_MEMOPT _kind;
|
||||
|
|
|
@ -17,33 +17,33 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
||||
/***************************************************************************
|
||||
The ucl_copy and ucl_cast_copy routines provide a general prototype for
|
||||
copying data between host and device memory (including texture memory)
|
||||
for the matrix and vector types in nvc_memory.
|
||||
|
||||
For host/host and host/device transfers, typecasting is performed
|
||||
automatically as necessary.
|
||||
|
||||
The routines are written so that all branches can be removed by the
|
||||
|
||||
For host/host and host/device transfers, typecasting is performed
|
||||
automatically as necessary.
|
||||
|
||||
The routines are written so that all branches can be removed by the
|
||||
compiler during template instantiation.
|
||||
|
||||
|
||||
The routines currently assume row-major ordering for all types.
|
||||
|
||||
|
||||
For an asynchronous copy in the default command queue, set async to boolean true;
|
||||
For an asynchronous copy in a specified command queue, pass that command queue as async;
|
||||
Otherwise, set async to boolean false.
|
||||
|
||||
|
||||
When performing frequent data copies that require casting, it is more
|
||||
efficient to allocate a casting buffer once and then pass that buffer
|
||||
to the copy routine. This can be accomplished with the ucl_cast_copy
|
||||
routines.
|
||||
|
||||
Examples
|
||||
|
||||
Examples
|
||||
(x's represent alignment padding - to maintain alignment)
|
||||
(o's represent a larger matrix in memory)
|
||||
(vectors represented as single row)
|
||||
|
@ -51,18 +51,18 @@
|
|||
dst src command
|
||||
----------------------------------------------------------------
|
||||
0 1 2 3 4 <-- 0 1 2 3 4 ucl_copy(dst,src,async)
|
||||
|
||||
|
||||
0 1 2 3 <-- 0 1 2 3 4 ucl_copy(dst,src,4,async)
|
||||
|
||||
|
||||
0 1 2 <-- 0 1 2 3 4 5 ucl_copy(dst,src,async)
|
||||
3 4 5
|
||||
|
||||
3 4 5
|
||||
|
||||
0 1 2 3 4 5 <-- 0 1 2 ucl_copy(dst,src,async)
|
||||
3 4 5
|
||||
|
||||
|
||||
0 1 2 <-- 0 1 2 ucl_copy(dst,src,async)
|
||||
3 4 5 3 4 5
|
||||
|
||||
|
||||
0 1 2 <-- 0 1 2 ucl_copy(dst,src,6,async)
|
||||
3 4 5 3 4 5
|
||||
5 6 7
|
||||
|
@ -70,33 +70,33 @@
|
|||
0 1 2 <-- 0 1 2 3 ucl_copy(dst,src,2,3,async)
|
||||
4 5 6 4 5 6 7
|
||||
8 9 10 11
|
||||
|
||||
|
||||
0 1 2 x x <-- 0 1 2 ucl_copy(dst,src,async)
|
||||
3 4 5 x x 3 4 5
|
||||
|
||||
|
||||
0 1 2 <-- 0 1 2 x x ucl_copy(dst,src,async)
|
||||
3 4 5 3 4 5 x x
|
||||
|
||||
|
||||
0 1 2 o o <-- 0 1 2 ucl_copy(dst,src,2,3,async)
|
||||
3 4 5 o o 3 4 5
|
||||
o o o o o
|
||||
o o o o o
|
||||
|
||||
0 1 2 o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,3,async)
|
||||
3 4 5 o o
|
||||
o o o o o
|
||||
3 4 5 o o
|
||||
o o o o o
|
||||
|
||||
0 1 o o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,2,async)
|
||||
2 3 o o o
|
||||
o o o o o
|
||||
2 3 o o o
|
||||
o o o o o
|
||||
|
||||
0 1 2 o o <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async)
|
||||
5 6 7 o o 5 6 7 8 9
|
||||
o o o o o 10 11 12 13 14
|
||||
|
||||
|
||||
0 1 2 5 6 7 <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async)
|
||||
5 6 7 8 9
|
||||
10 11 12 13 14
|
||||
|
||||
|
||||
***************************************************************************/
|
||||
|
||||
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
|
||||
|
@ -124,7 +124,7 @@ inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
|
|||
assert(0==1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - HOST-HOST COPY ROUTINES
|
||||
|
@ -182,7 +182,7 @@ template <> struct _host_host_copy<1,1> {
|
|||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef UCL_DBG_MEM_TRACE
|
||||
std::cerr << "UCL_COPY 8NS\n";
|
||||
#endif
|
||||
|
@ -212,7 +212,7 @@ template <int host_t1, int host_t2> struct _host_host_copy {
|
|||
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols) {
|
||||
assert(0==1);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
|
@ -242,20 +242,20 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
|
|||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer) {
|
||||
// Asynchronous currently pointless here
|
||||
// Asynchronous currently pointless here
|
||||
#ifdef UCL_DEBUG
|
||||
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
||||
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
|
||||
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
||||
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
||||
#endif
|
||||
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
||||
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
||||
#endif
|
||||
if (mat1::VECTOR) {
|
||||
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
||||
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
|
||||
for (size_t i=0; i<rows*cols; i++)
|
||||
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
|
||||
} else {
|
||||
if (mat2::VECTOR)
|
||||
if (mat2::VECTOR)
|
||||
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
||||
cols*sizeof(typename mat2::data_type),
|
||||
cols*sizeof(typename mat2::data_type),rows);
|
||||
|
@ -276,23 +276,23 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
|
|||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
command_queue &cq) {
|
||||
// Asynchronous currently pointless here
|
||||
// Asynchronous currently pointless here
|
||||
#ifdef UCL_DEBUG
|
||||
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
||||
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
|
||||
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
||||
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
||||
#endif
|
||||
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
||||
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
||||
#endif
|
||||
if (mat1::VECTOR) {
|
||||
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
||||
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq);
|
||||
cast_buffer.sync();
|
||||
cast_buffer.sync();
|
||||
for (size_t i=0; i<rows*cols; i++)
|
||||
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
|
||||
} else {
|
||||
if (mat2::VECTOR)
|
||||
if (mat2::VECTOR)
|
||||
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
||||
cols*sizeof(typename mat2::data_type),
|
||||
cols*sizeof(typename mat2::data_type),rows,cq);
|
||||
|
@ -338,7 +338,7 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
|
|||
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
|
||||
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
||||
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
||||
if (mat3::VECTOR==0) {
|
||||
if (mat3::VECTOR==0) {
|
||||
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
|
||||
assert(dst.rows()>=rows && dst.cols()>=cols);
|
||||
}
|
||||
|
@ -404,9 +404,9 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
|
|||
#ifdef UCL_DEBUG
|
||||
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
||||
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
|
||||
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
||||
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
||||
if (mat3::VECTOR==0) {
|
||||
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
||||
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
||||
if (mat3::VECTOR==0) {
|
||||
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
|
||||
assert(dst.rows()>=rows && dst.cols()>=cols);
|
||||
}
|
||||
|
@ -472,23 +472,23 @@ template <> struct _ucl_cast_copy<1,1> {
|
|||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer, command_queue &cq) {
|
||||
assert(0==1);
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer) {
|
||||
assert(0==1);
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer) {
|
||||
assert(0==1);
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
command_queue &cq) {
|
||||
assert(0==1);
|
||||
assert(0==1);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -497,23 +497,23 @@ template <> struct _ucl_cast_copy<0,0> {
|
|||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer, command_queue &cq) {
|
||||
assert(0==1);
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer) {
|
||||
assert(0==1);
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer) {
|
||||
assert(0==1);
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
command_queue &cq) {
|
||||
assert(0==1);
|
||||
assert(0==1);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -525,7 +525,7 @@ template <> struct _ucl_cast_copy<0,0> {
|
|||
/** \param numel Number of elements (not bytes) to copy
|
||||
* \param cast_buffer Buffer on host with enough storage for casting
|
||||
* - If the data types for the two matrices are same, no cast performed
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Currently does not handle textures **/
|
||||
template <class mat1, class mat2, class mat3>
|
||||
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
|
@ -551,7 +551,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|||
* \param async Perform non-blocking copy on default stream
|
||||
* \param cast_buffer Buffer on host with enough storage for casting
|
||||
* - If the data types for the two matrices are same, no cast performed
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Currently does not handle textures **/
|
||||
template <class mat1, class mat2, class mat3>
|
||||
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
|
@ -580,7 +580,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|||
* buffer is created for copy. When multiple casts occur, it is
|
||||
* more efficient to create a permanent casting buffer that can
|
||||
* be passed to an alternative copy routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Currently does not handle textures **/
|
||||
template <class mat1, class mat2>
|
||||
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
|
@ -593,7 +593,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|||
#endif
|
||||
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
|
||||
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
|
||||
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
||||
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
||||
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
|
||||
if (mat1::MEM_TYPE==1) {
|
||||
UCL_H_Vec<typename mat2::data_type> cast_buffer;
|
||||
|
@ -606,8 +606,8 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|||
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
|
||||
cast_buffer,cq);
|
||||
}
|
||||
} else
|
||||
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
|
||||
} else
|
||||
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
|
||||
}
|
||||
|
||||
/// Copy matrix/vector (memory already allocated)
|
||||
|
@ -619,7 +619,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|||
* buffer is created for copy. When multiple casts occur, it is
|
||||
* more efficient to create a permanent casting buffer that can
|
||||
* be passed to an alternative copy routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - The default stream is used for asynchronous copy
|
||||
* - Currently does not handle textures **/
|
||||
template <class mat1, class mat2>
|
||||
|
@ -648,7 +648,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|||
cast_buffer);
|
||||
}
|
||||
} else
|
||||
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
|
||||
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
|
@ -659,11 +659,11 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|||
/** \param async Perform non-blocking copy on default stream
|
||||
* \param cast_buffer Buffer on host with enough storage for casting
|
||||
* - If src is a vector, routine assumes row-major rows by cols copy
|
||||
* - If src is a matrix, routine will copy upper left tile of matrix
|
||||
* - If src is a matrix, routine will copy upper left tile of matrix
|
||||
* - If dst is a vector, routine assumes row-major rows by cols copy
|
||||
* - If dst is a matrix, routine will copy into upper left tile of matrix
|
||||
* - If dst is a matrix, routine will copy into upper left tile of matrix
|
||||
* - If the data types for the two matrices are same, no cast performed
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Copy from vector to matrix and vice versa allowed
|
||||
* - Currently does not handle textures **/
|
||||
template <class mat1, class mat2, class mat3>
|
||||
|
@ -686,16 +686,16 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|||
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
|
||||
/** \param cast_buffer Buffer on host with enough storage for casting
|
||||
* - If src is a vector, routine assumes row-major rows by cols copy
|
||||
* - If src is a matrix, routine will copy upper left tile of matrix
|
||||
* - If src is a matrix, routine will copy upper left tile of matrix
|
||||
* - If dst is a vector, routine assumes row-major rows by cols copy
|
||||
* - If dst is a matrix, routine will copy into upper left tile of matrix
|
||||
* - If dst is a matrix, routine will copy into upper left tile of matrix
|
||||
* - If the data types for the two matrices are same, no cast performed
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Copy from vector to matrix and vice versa allowed
|
||||
* - Currently does not handle textures **/
|
||||
template <class mat1, class mat2, class mat3>
|
||||
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
command_queue &cq) {
|
||||
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
|
||||
ucl_copy(dst,src,rows,cols,cq);
|
||||
|
@ -710,11 +710,11 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|||
|
||||
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
|
||||
/** - If src is a vector, routine assumes row-major rows by cols copy
|
||||
* - If src is a matrix, routine will copy upper left tile of matrix
|
||||
* - If src is a matrix, routine will copy upper left tile of matrix
|
||||
* - If dst is a vector, routine assumes row-major rows by cols copy
|
||||
* - If dst is a matrix, routine will copy into upper left tile of matrix
|
||||
* - If dst is a matrix, routine will copy into upper left tile of matrix
|
||||
* - If the data types of the two matrices are not the same,
|
||||
* casting will be performed automatically as long as the copy is
|
||||
* casting will be performed automatically as long as the copy is
|
||||
* not device to device. For host/device transfers, a temporary
|
||||
* buffer is created for copy. When multiple casts occur, it is
|
||||
* more efficient to create a permanent casting buffer that can
|
||||
|
@ -730,7 +730,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|||
#endif
|
||||
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
|
||||
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
|
||||
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
||||
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
||||
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
|
||||
if (mat1::MEM_TYPE==1) {
|
||||
UCL_H_Vec<typename mat2::data_type> cast_buffer;
|
||||
|
@ -773,9 +773,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|||
/// Copy subset of matrix rows,cols (memory already allocated)
|
||||
/** \param async Perform non-blocking copy (ignored for host to host copy)
|
||||
* - If src is a vector, routine assumes row-major rows by cols copy
|
||||
* - If src is a matrix, routine will copy upper left tile of matrix
|
||||
* - If src is a matrix, routine will copy upper left tile of matrix
|
||||
* - If dst is a vector, routine assumes row-major rows by cols copy
|
||||
* - If dst is a matrix, routine will copy into upper left tile of matrix
|
||||
* - If dst is a matrix, routine will copy into upper left tile of matrix
|
||||
* - If the data types of the two matrices are not the same,
|
||||
* casting will be performed automatically as long as the copy is
|
||||
* not device to device. For host/device transfers, a temporary
|
||||
|
@ -796,7 +796,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|||
ucl_copy(dst,src,rows,cols,dst.cq());
|
||||
else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
|
||||
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
|
||||
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
||||
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
||||
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
|
||||
if (mat1::MEM_TYPE==1) {
|
||||
UCL_H_Vec<typename mat2::data_type> cast_buffer;
|
||||
|
@ -846,7 +846,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|||
* \param cast_buffer Buffer on host with enough storage for casting
|
||||
* - If the data types for the two matrices are same, no cast performed
|
||||
* - The number of bytes copied is determined by entire src data
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Copy from vector to matrix and vice versa allowed
|
||||
* - Currently does not handle textures **/
|
||||
template <class mat1, class mat2, class mat3>
|
||||
|
@ -866,7 +866,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
|
|||
/** \param cast_buffer Buffer on host with enough storage for casting
|
||||
* - If the data types for the two matrices are same, no cast performed
|
||||
* - The number of bytes copied is determined by entire src data
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Padding for 2D matrices is not considered in this routine.
|
||||
* - Copy from vector to matrix and vice versa allowed
|
||||
* - Currently does not handle textures **/
|
||||
template <class mat1, class mat2, class mat3>
|
||||
|
@ -885,7 +885,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
|
|||
/// Asynchronous copy of matrix/vector (memory already allocated)
|
||||
/** - The number of bytes copied is determined by entire src data
|
||||
* - If the data types of the two matrices are not the same,
|
||||
* casting will be performed automatically as long as the copy is
|
||||
* casting will be performed automatically as long as the copy is
|
||||
* not device to device. For host/device transfers, a temporary
|
||||
* buffer is created for copy. When multiple casts occur, it is
|
||||
* more efficient to create a permanent casting buffer that can
|
||||
|
@ -924,7 +924,7 @@ template <class mat1, class mat2>
|
|||
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
|
||||
if (async)
|
||||
ucl_copy(dst,src,dst.cq());
|
||||
else if (dst.row_bytes()==src.row_bytes() &&
|
||||
else if (dst.row_bytes()==src.row_bytes() &&
|
||||
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
|
||||
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
|
||||
ucl_copy(dst,src,src.row_size()*src.rows(),async);
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -37,23 +37,23 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
ROW_MAJOR = 1,
|
||||
VECTOR = 0
|
||||
};
|
||||
typedef numtyp data_type;
|
||||
typedef numtyp data_type;
|
||||
|
||||
UCL_D_Mat() : _cols(0) {}
|
||||
~UCL_D_Mat() { _device_free(*this); }
|
||||
|
||||
|
||||
/// Construct with specified rows and cols
|
||||
/** \sa alloc() **/
|
||||
UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
|
||||
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
|
||||
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
|
||||
_cols(0) { alloc(rows,cols,device,kind); }
|
||||
|
||||
|
||||
/// Row major matrix on device
|
||||
/** The kind parameter controls memory optimizations as follows:
|
||||
* - UCL_READ_WRITE - Specify that you will read and write in kernels
|
||||
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
|
||||
* - UCL_READ_ONLY - Specify that you will only read in kernels
|
||||
* \param cq Default command queue for operations copied from another mat
|
||||
* \param cq Default command queue for operations copied from another mat
|
||||
* \note - Coalesced access using adjacent cols on same row
|
||||
* UCL_D_Mat(row,col) given by array[row*row_size()+col]
|
||||
* \return UCL_SUCCESS if the memory allocation is successful **/
|
||||
|
@ -65,7 +65,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
|
||||
if (err!=UCL_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not allocate "
|
||||
std::cerr << "UCL Error: Could not allocate "
|
||||
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
|
@ -82,9 +82,9 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
#ifdef _OCL_MAT
|
||||
_offset=0;
|
||||
#endif
|
||||
return err;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/// Row major matrix on device
|
||||
/** The kind parameter controls memory optimizations as follows:
|
||||
* - UCL_READ_WRITE - Specify that you will read and write in kernels
|
||||
|
@ -118,15 +118,15 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
#ifdef _OCL_MAT
|
||||
_offset=0;
|
||||
#endif
|
||||
return err;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols,
|
||||
const size_t stride) {
|
||||
|
@ -145,7 +145,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
#else
|
||||
_device_view(&_array,input.begin());
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef _UCL_DEVICE_PTR_MAT
|
||||
_end=_array+_cols;
|
||||
#endif
|
||||
|
@ -157,39 +157,39 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols)
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols)
|
||||
{ view(input,rows,cols,input.row_size()); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used a input, all elements (including padding)
|
||||
* will be used for view **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t cols)
|
||||
{ view(input,1,cols); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used a input, all elements (including padding)
|
||||
* will be used for view **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input)
|
||||
inline void view(ucl_type &input)
|
||||
{ view(input,input.rows(),input.cols()); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type input, const size_t rows, const size_t cols,
|
||||
const size_t stride, UCL_Device &dev) {
|
||||
const size_t stride, UCL_Device &dev) {
|
||||
clear();
|
||||
_kind=UCL_VIEW;
|
||||
_cols=cols;
|
||||
|
@ -215,7 +215,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
template <class ptr_type>
|
||||
inline void view(ptr_type input, const size_t rows, const size_t cols,
|
||||
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
|
@ -223,13 +223,13 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
template <class ptr_type>
|
||||
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
|
||||
{ view(input,1,cols,dev); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
const size_t cols, const size_t stride) {
|
||||
|
@ -248,7 +248,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
#else
|
||||
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef _UCL_DEVICE_PTR_MAT
|
||||
_end=_array+_cols;
|
||||
#endif
|
||||
|
@ -261,45 +261,45 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
* allocating container when using CUDA APIs **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
const size_t cols)
|
||||
const size_t cols)
|
||||
{ view_offset(offset,input,rows,cols,input.row_size()); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used a input, all elements (including padding)
|
||||
* will be used for view **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
|
||||
{ view_offset(offset,input,1,cols); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used a input, all elements (including padding)
|
||||
* will be used for view **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset, ucl_type &input) {
|
||||
if (input.rows()==1)
|
||||
inline void view_offset(const size_t offset, ucl_type &input) {
|
||||
if (input.rows()==1)
|
||||
view_offset(offset,input,1,input.cols()-offset);
|
||||
else
|
||||
else
|
||||
view_offset(offset,input,input.rows()-offset/input.row_size(),
|
||||
input.cols());
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
|
||||
const size_t cols,const size_t stride,
|
||||
UCL_Device &dev) {
|
||||
UCL_Device &dev) {
|
||||
clear();
|
||||
_kind=UCL_VIEW;
|
||||
_cols=cols;
|
||||
|
@ -307,7 +307,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
_pitch=stride*sizeof(numtyp);
|
||||
_row_size=stride;
|
||||
this->_cq=dev.cq();
|
||||
|
||||
|
||||
#ifdef _OCL_MAT
|
||||
_array=input;
|
||||
_offset=offset;
|
||||
|
@ -320,7 +320,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
_array=input+offset;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef _UCL_DEVICE_PTR_MAT
|
||||
_end=_array+_cols;
|
||||
#endif
|
||||
|
@ -332,20 +332,20 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
* allocating container when using CUDA APIs **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
|
||||
const size_t cols, UCL_Device &dev)
|
||||
const size_t cols, UCL_Device &dev)
|
||||
{ view_offset(offset,input,rows,cols,cols,dev); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset, ptr_type input,
|
||||
inline void view_offset(const size_t offset, ptr_type input,
|
||||
const size_t cols, UCL_Device &dev)
|
||||
{ view_offset(offset,input,1,cols,dev); }
|
||||
|
||||
|
||||
/// Free memory and set size to 0
|
||||
inline void clear()
|
||||
inline void clear()
|
||||
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
|
||||
|
||||
/// Resize the allocation to contain cols elements
|
||||
|
@ -356,7 +356,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
int err=_device_resize(*this,rows,cols,_pitch);
|
||||
if (err!=UCL_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not allocate "
|
||||
std::cerr << "UCL Error: Could not allocate "
|
||||
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
|
@ -372,13 +372,13 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
#ifdef _OCL_MAT
|
||||
_offset=0;
|
||||
#endif
|
||||
return err;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/// Resize (only if bigger) the allocation to contain rows x cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize_ib(const int rows, const int cols)
|
||||
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
|
||||
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
|
||||
else return UCL_SUCCESS; }
|
||||
|
||||
/// Set each element to zero asynchronously in the default command_queue
|
||||
|
@ -386,10 +386,10 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
/// Set first n elements to zero asynchronously in the default command_queue
|
||||
inline void zero(const int n) { zero(n,_cq); }
|
||||
/// Set each element to zero asynchronously
|
||||
inline void zero(command_queue &cq)
|
||||
inline void zero(command_queue &cq)
|
||||
{ _device_zero(*this,row_bytes()*_rows,cq); }
|
||||
/// Set first n elements to zero asynchronously
|
||||
inline void zero(const int n, command_queue &cq)
|
||||
inline void zero(const int n, command_queue &cq)
|
||||
{ _device_zero(*this,n*sizeof(numtyp),cq); }
|
||||
|
||||
|
||||
|
@ -445,7 +445,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
inline size_t row_bytes() const { return _pitch; }
|
||||
/// Get the size in bytes of 1 element
|
||||
inline int element_size() const { return sizeof(numtyp); }
|
||||
|
||||
|
||||
#ifdef _OCL_MAT
|
||||
/// Return the offset (in elements) from begin() pointer where data starts
|
||||
/** \note Always 0 for host matrices and CUDA APIs **/
|
||||
|
@ -459,7 +459,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||
/// Return the offset (in bytes) from begin() pointer where data starts
|
||||
/** \note Always 0 for host matrices and CUDA APIs **/
|
||||
inline size_t byteoff() const { return offset()*sizeof(numtyp); }
|
||||
|
||||
|
||||
private:
|
||||
size_t _pitch, _row_size, _rows, _cols;
|
||||
|
||||
|
|
|
@ -17,14 +17,14 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
// Only allow this file to be included by CUDA and OpenCL specific headers
|
||||
#ifdef _UCL_MAT_ALLOW
|
||||
|
||||
/// Row vector on device
|
||||
/// Row vector on device
|
||||
template <class numtyp>
|
||||
class UCL_D_Vec : public UCL_BaseMat {
|
||||
public:
|
||||
|
@ -37,7 +37,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
ROW_MAJOR = 1,
|
||||
VECTOR = 1
|
||||
};
|
||||
typedef numtyp data_type;
|
||||
typedef numtyp data_type;
|
||||
|
||||
UCL_D_Vec() : _cols(0) {}
|
||||
~UCL_D_Vec() { _device_free(*this); }
|
||||
|
@ -45,7 +45,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
/// Construct with n columns
|
||||
/** \sa alloc() **/
|
||||
UCL_D_Vec(const size_t n, UCL_Device &device,
|
||||
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
|
||||
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
|
||||
_cols(0) { alloc(n,device,kind); }
|
||||
|
||||
/// Set up host vector with 'cols' columns and reserve memory
|
||||
|
@ -58,7 +58,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
template <class mat_type>
|
||||
inline int alloc(const size_t cols, mat_type &cq,
|
||||
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
|
||||
|
||||
|
||||
clear();
|
||||
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
|
@ -82,8 +82,8 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
#ifdef _OCL_MAT
|
||||
_offset=0;
|
||||
#endif
|
||||
return err;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/// Set up host vector with 'cols' columns and reserve memory
|
||||
/** The kind parameter controls memory optimizations as follows:
|
||||
|
@ -116,7 +116,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
#ifdef _OCL_MAT
|
||||
_offset=0;
|
||||
#endif
|
||||
return err;
|
||||
return err;
|
||||
}
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
|
@ -142,18 +142,18 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
#else
|
||||
_device_view(&_array,input.begin());
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef _UCL_DEVICE_PTR_MAT
|
||||
_end=_array+_cols;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols,
|
||||
const size_t stride) { view(input,rows,cols); }
|
||||
|
@ -162,24 +162,24 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used a input, all elements (including padding)
|
||||
* will be used for view **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t cols)
|
||||
{ view(input,1,cols); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used a input, all elements (including padding)
|
||||
* will be used for view **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input)
|
||||
inline void view(ucl_type &input)
|
||||
{ view(input,input.rows()*input.row_size()); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
|
@ -205,15 +205,15 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type input, const size_t rows, const size_t cols,
|
||||
const size_t stride, UCL_Device &dev)
|
||||
const size_t stride, UCL_Device &dev)
|
||||
{ view(input,rows,cols,stride); }
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
|
@ -223,7 +223,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
template <class ptr_type>
|
||||
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
|
||||
{ view(input,1,cols,dev); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
|
@ -248,45 +248,45 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
#else
|
||||
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef _UCL_DEVICE_PTR_MAT
|
||||
_end=_array+_cols;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
const size_t cols, const size_t stride)
|
||||
const size_t cols, const size_t stride)
|
||||
{ view_offset(offset,input,rows,cols); }
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used a input, all elements (including padding)
|
||||
* will be used for view **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
|
||||
{ view_offset(offset,input,1,cols); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used a input, all elements (including padding)
|
||||
* will be used for view **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset, ucl_type &input)
|
||||
inline void view_offset(const size_t offset, ucl_type &input)
|
||||
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
|
@ -302,7 +302,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
_cols=cols;
|
||||
_row_bytes=_cols*sizeof(numtyp);
|
||||
this->_cq=dev.cq();
|
||||
|
||||
|
||||
#ifdef _OCL_MAT
|
||||
_array=input;
|
||||
_offset=offset;
|
||||
|
@ -315,20 +315,20 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
_array=input+offset;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef _UCL_DEVICE_PTR_MAT
|
||||
_end=_array+_cols;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
|
||||
const size_t cols,const size_t stride,UCL_Device &dev)
|
||||
const size_t cols,const size_t stride,UCL_Device &dev)
|
||||
{ view_offset(offset,input,rows,cols,stride); }
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
|
@ -336,12 +336,12 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset, ptr_type input,
|
||||
inline void view_offset(const size_t offset, ptr_type input,
|
||||
const size_t cols, UCL_Device &dev)
|
||||
{ view_offset(offset,input,1,cols,dev); }
|
||||
|
||||
|
||||
/// Free memory and set size to 0
|
||||
inline void clear()
|
||||
inline void clear()
|
||||
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
|
||||
|
||||
/// Resize the allocation to contain cols elements
|
||||
|
@ -369,9 +369,9 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
#ifdef _OCL_MAT
|
||||
_offset=0;
|
||||
#endif
|
||||
return err;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/// Resize (only if bigger) the allocation to contain cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize_ib(const int cols)
|
||||
|
@ -384,7 +384,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
/// Set each element to zero asynchronously
|
||||
inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
|
||||
/// Set first n elements to zero asynchronously
|
||||
inline void zero(const int n, command_queue &cq)
|
||||
inline void zero(const int n, command_queue &cq)
|
||||
{ _device_zero(*this,n*sizeof(numtyp),cq); }
|
||||
|
||||
#ifdef _UCL_DEVICE_PTR_MAT
|
||||
|
@ -402,7 +402,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
/// For CUDA-RT, get device pointer to one past last element
|
||||
inline numtyp * end() const { return _end; }
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef _UCL_DEVICE_PTR_MAT
|
||||
/// Returns an API specific device pointer
|
||||
/** - For OpenCL, returns a &cl_mem object
|
||||
|
@ -427,10 +427,10 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
inline const numtyp ** cbegin() const { return &_array; }
|
||||
/// For CUDA-RT, allocate row vector and bind texture
|
||||
inline void safe_alloc(const size_t cols, UCL_Device &dev,
|
||||
textureReference *t)
|
||||
textureReference *t)
|
||||
{ alloc(cols,dev); assign_texture(t); bind(); }
|
||||
/// For CUDA-RT, assign a texture to matrix
|
||||
inline void assign_texture(textureReference *t) { _tex_ptr=t; }
|
||||
inline void assign_texture(textureReference *t) { _tex_ptr=t; }
|
||||
/// For CUDA-RT, bind to texture
|
||||
inline void bind() {
|
||||
cuda_gb_get_channel<numtyp>(_channel);
|
||||
|
@ -456,7 +456,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
inline size_t row_bytes() const { return _row_bytes; }
|
||||
/// Get the size in bytes of 1 element
|
||||
inline int element_size() const { return sizeof(numtyp); }
|
||||
|
||||
|
||||
#ifdef _OCL_MAT
|
||||
/// Return the offset (in elements) from begin() pointer where data starts
|
||||
/** \note Always 0 for host matrices and CUDA APIs **/
|
||||
|
@ -473,7 +473,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
|||
|
||||
private:
|
||||
size_t _row_bytes, _row_size, _rows, _cols;
|
||||
|
||||
|
||||
#ifdef _UCL_DEVICE_PTR_MAT
|
||||
device_ptr _array;
|
||||
#else
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -37,21 +37,21 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
ROW_MAJOR = 1,
|
||||
VECTOR = 0
|
||||
};
|
||||
typedef numtyp data_type;
|
||||
|
||||
typedef numtyp data_type;
|
||||
|
||||
UCL_H_Mat() : _cols(0) {
|
||||
#ifdef _OCL_MAT
|
||||
_carray=(cl_mem)(0);
|
||||
#endif
|
||||
}
|
||||
~UCL_H_Mat() { _host_free(*this); }
|
||||
|
||||
|
||||
/// Construct with specied number of rows and columns
|
||||
/** \sa alloc() **/
|
||||
UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
|
||||
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
|
||||
UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
|
||||
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
|
||||
{ _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
|
||||
|
||||
|
||||
/// Set up host matrix with specied # of rows/cols and reserve memory
|
||||
/** The kind parameter controls memory pinning as follows:
|
||||
* - UCL_READ_WRITE - Specify that you will read and write from host
|
||||
|
@ -74,7 +74,7 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
<< " bytes on host.\n";
|
||||
_row_bytes=0;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
#endif
|
||||
_row_bytes=0;
|
||||
return err;
|
||||
}
|
||||
|
@ -84,7 +84,7 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
_kind=kind;
|
||||
_end=_array+rows*cols;
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/// Set up host matrix with specied # of rows/cols and reserve memory
|
||||
/** The kind parameter controls memory pinning as follows:
|
||||
|
@ -117,15 +117,15 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
_kind=kind;
|
||||
_end=_array+rows*cols;
|
||||
return err;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols,
|
||||
const size_t stride) {
|
||||
|
@ -149,45 +149,45 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols)
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols)
|
||||
{ view(input,rows,cols,input.row_size()); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used as input, all elements (including padding)
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t cols)
|
||||
{ view(input,1,cols); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used as input, all elements (including padding)
|
||||
* will be used for view when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* will be used for view when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input)
|
||||
inline void view(ucl_type &input)
|
||||
{ view(input,input.rows(),input.cols()); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type *input, const size_t rows, const size_t cols,
|
||||
const size_t stride, UCL_Device &dev) {
|
||||
const size_t stride, UCL_Device &dev) {
|
||||
assert(rows==1 || stride==cols);
|
||||
clear();
|
||||
_kind=UCL_VIEW;
|
||||
|
@ -197,40 +197,40 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
this->_cq=dev.cq();
|
||||
_array=input;
|
||||
_end=_array+_cols;
|
||||
|
||||
|
||||
#ifdef _OCL_MAT
|
||||
_host_view(*this,dev,_row_bytes*rows);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type *input, const size_t rows, const size_t cols,
|
||||
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
|
||||
{ view(input,1,cols,dev); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
const size_t cols, const size_t stride) {
|
||||
const size_t cols, const size_t stride) {
|
||||
assert(rows==1 || stride==cols);
|
||||
clear();
|
||||
_kind=UCL_VIEW;
|
||||
|
@ -244,81 +244,81 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
_host_view(*this,input,_row_bytes*_rows);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
const size_t cols)
|
||||
const size_t cols)
|
||||
{ view_offset(offset,input,rows,cols,input.row_size()); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used as input, all elements (including padding)
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
|
||||
{ view_offset(offset,input,1,cols); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used as input, all elements (including padding)
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset, ucl_type &input) {
|
||||
if (input.rows()==1)
|
||||
inline void view_offset(const size_t offset, ucl_type &input) {
|
||||
if (input.rows()==1)
|
||||
view_offset(offset,input,1,input.cols()-offset);
|
||||
else
|
||||
else
|
||||
view_offset(offset,input,input.rows()-offset/input.row_size(),
|
||||
input.cols());
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
* allocating container
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
|
||||
const size_t cols, UCL_Device &dev)
|
||||
{ view(input+offset,rows,cols,dev); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
|
||||
const size_t cols,const size_t stride,UCL_Device &dev)
|
||||
const size_t cols,const size_t stride,UCL_Device &dev)
|
||||
{ view(input+offset,rows,cols,stride,dev); }
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset, ptr_type *input,
|
||||
inline void view_offset(const size_t offset, ptr_type *input,
|
||||
const size_t cols, UCL_Device &dev)
|
||||
{ view(input+offset,1,cols,dev); }
|
||||
|
||||
|
||||
/// Free memory and set size to 0
|
||||
inline void clear()
|
||||
{ _host_free(*this); _cols=0; _kind=UCL_VIEW; }
|
||||
inline void clear()
|
||||
{ _host_free(*this); _cols=0; _kind=UCL_VIEW; }
|
||||
|
||||
/// Resize the allocation to rows x cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
|
@ -333,7 +333,7 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
<< " bytes on host.\n";
|
||||
_row_bytes=0;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
#endif
|
||||
_row_bytes=0;
|
||||
return err;
|
||||
}
|
||||
|
@ -347,7 +347,7 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
/// Resize (only if bigger) the allocation to contain rows x cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize_ib(const int rows, const int cols)
|
||||
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
|
||||
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
|
||||
else return UCL_SUCCESS; }
|
||||
|
||||
/// Set each element to zero
|
||||
|
@ -376,21 +376,21 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
inline size_t row_bytes() const { return _row_bytes; }
|
||||
/// Get the size in bytes of 1 element
|
||||
inline int element_size() const { return sizeof(numtyp); }
|
||||
|
||||
|
||||
/// Get element at index i
|
||||
inline numtyp & operator[](const int i) { return _array[i]; }
|
||||
/// Get element at index i
|
||||
inline const numtyp & operator[](const int i) const { return _array[i]; }
|
||||
/// 2D access (row should always be 0)
|
||||
inline numtyp & operator()(const int row, const int col)
|
||||
/// 2D access (row should always be 0)
|
||||
inline numtyp & operator()(const int row, const int col)
|
||||
{ return _array[row*_cols+col]; }
|
||||
/// 2D access (row should always be 0)
|
||||
/// 2D access (row should always be 0)
|
||||
inline const numtyp & operator()(const int row, const int col) const
|
||||
{ return _array[row*_cols+col]; }
|
||||
|
||||
|
||||
/// Returns pointer to memory pointer for allocation on host
|
||||
inline numtyp ** host_ptr() { return &_array; }
|
||||
|
||||
|
||||
/// Return the offset (in elements) from begin() pointer where data starts
|
||||
/** \note Always 0 for host matrices and CUDA APIs **/
|
||||
inline size_t offset() const { return 0; }
|
||||
|
@ -409,14 +409,14 @@ class UCL_H_Mat : public UCL_BaseMat {
|
|||
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
|
||||
inline const void ** cbegin() const { return (const void **)&_array; }
|
||||
#endif
|
||||
|
||||
|
||||
private:
|
||||
numtyp *_array, *_end;
|
||||
size_t _row_bytes, _rows, _cols;
|
||||
|
||||
#ifdef _OCL_MAT
|
||||
device_ptr _carray;
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -37,21 +37,21 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
ROW_MAJOR = 1,
|
||||
VECTOR = 1
|
||||
};
|
||||
typedef numtyp data_type;
|
||||
|
||||
typedef numtyp data_type;
|
||||
|
||||
UCL_H_Vec() : _cols(0) {
|
||||
#ifdef _OCL_MAT
|
||||
_carray=(cl_mem)(0);
|
||||
#endif
|
||||
}
|
||||
~UCL_H_Vec() { _host_free(*this); }
|
||||
|
||||
|
||||
/// Construct with n columns
|
||||
/** \sa alloc() **/
|
||||
UCL_H_Vec(const size_t n, UCL_Device &device,
|
||||
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
|
||||
UCL_H_Vec(const size_t n, UCL_Device &device,
|
||||
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
|
||||
{ _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
|
||||
|
||||
|
||||
/// Set up host vector with 'cols' columns and reserve memory
|
||||
/** The kind parameter controls memory pinning as follows:
|
||||
* - UCL_READ_WRITE - Specify that you will read and write from host
|
||||
|
@ -84,7 +84,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
_kind=kind;
|
||||
_end=_array+cols;
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/// Set up host vector with 'cols' columns and reserve memory
|
||||
/** The kind parameter controls memory pinning as follows:
|
||||
|
@ -108,7 +108,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
<< " bytes on host.\n";
|
||||
_row_bytes=0;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
#endif
|
||||
_row_bytes=0;
|
||||
return err;
|
||||
}
|
||||
|
@ -118,13 +118,13 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
_end=_array+cols;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
|
||||
#ifdef UCL_DEBUG
|
||||
|
@ -143,14 +143,14 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* - Viewing a device container on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols,
|
||||
const size_t stride) { view(input,rows,cols); }
|
||||
|
@ -159,31 +159,31 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used as input, all elements (including padding)
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t cols)
|
||||
{ view(input,1,cols); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container
|
||||
* allocating container
|
||||
* - If a matrix is used as input, all elements (including padding)
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input)
|
||||
inline void view(ucl_type &input)
|
||||
{ view(input,input.rows()*input.row_size()); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type *input, const size_t rows, const size_t cols,
|
||||
UCL_Device &dev) {
|
||||
|
@ -197,38 +197,38 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
this->_cq=dev.cq();
|
||||
_array=input;
|
||||
_end=_array+_cols;
|
||||
|
||||
|
||||
#ifdef _OCL_MAT
|
||||
_host_view(*this,dev,_row_bytes);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* - Viewing a device pointer on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type *input, const size_t rows, const size_t cols,
|
||||
const size_t stride, UCL_Device &dev)
|
||||
const size_t stride, UCL_Device &dev)
|
||||
{ view(input,rows,cols,stride); }
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
|
||||
{ view(input,1,cols,dev); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
const size_t cols) {
|
||||
|
@ -246,76 +246,76 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
_host_view(*this,input,_row_bytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
const size_t cols, const size_t stride)
|
||||
const size_t cols, const size_t stride)
|
||||
{ view_offset(offset,input,rows,cols); }
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used as input, all elements (including padding)
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
|
||||
{ view_offset(offset,input,1,cols); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation from Geryon
|
||||
/** This function must be passed a Geryon vector or matrix container.
|
||||
* No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* allocating container when using CUDA APIs
|
||||
* - If a matrix is used as input, all elements (including padding)
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
* will be used for view
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset, ucl_type &input)
|
||||
inline void view_offset(const size_t offset, ucl_type &input)
|
||||
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
|
||||
const size_t cols, UCL_Device &dev)
|
||||
{ view(input+offset,rows,cols,dev); }
|
||||
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported
|
||||
* \param stride Number of _elements_ between the start of each row **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
|
||||
const size_t cols,const size_t stride,UCL_Device &dev)
|
||||
const size_t cols,const size_t stride,UCL_Device &dev)
|
||||
{ view(input+offset,rows,cols,stride,dev); }
|
||||
|
||||
/// Do not allocate memory, instead use an existing allocation
|
||||
/** - No memory is freed when the object is destructed.
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view_offset(const size_t offset, ptr_type *input,
|
||||
inline void view_offset(const size_t offset, ptr_type *input,
|
||||
const size_t cols, UCL_Device &dev)
|
||||
{ view(input+offset,1,cols,dev); }
|
||||
|
||||
|
||||
/// Free memory and set size to 0
|
||||
inline void clear()
|
||||
inline void clear()
|
||||
{ _host_free(*this); _kind=UCL_VIEW; _cols=0; }
|
||||
|
||||
/// Resize the allocation to contain cols elements
|
||||
|
@ -324,7 +324,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
assert(_kind!=UCL_VIEW);
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
int err=_host_resize(*this,_row_bytes);
|
||||
|
||||
|
||||
if (err!=UCL_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not allocate " << _row_bytes
|
||||
|
@ -340,7 +340,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
_end=_array+cols;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/// Resize (only if bigger) the allocation to contain cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize_ib(const int cols)
|
||||
|
@ -348,7 +348,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
|
||||
/// Set each element to zero
|
||||
inline void zero() { _host_zero(_array,row_bytes()); }
|
||||
|
||||
|
||||
/// Set first n elements to zero
|
||||
inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }
|
||||
|
||||
|
@ -373,35 +373,35 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||
inline size_t row_bytes() const { return _row_bytes; }
|
||||
/// Get the size in bytes of 1 element
|
||||
inline int element_size() const { return sizeof(numtyp); }
|
||||
|
||||
|
||||
/// Get element at index i
|
||||
inline numtyp & operator[](const int i) { return _array[i]; }
|
||||
/// Get element at index i
|
||||
inline const numtyp & operator[](const int i) const { return _array[i]; }
|
||||
/// 2D access (row should always be 0)
|
||||
inline numtyp & operator()(const int row, const int col)
|
||||
/// 2D access (row should always be 0)
|
||||
inline numtyp & operator()(const int row, const int col)
|
||||
{ return _array[col]; }
|
||||
/// 2D access (row should always be 0)
|
||||
/// 2D access (row should always be 0)
|
||||
inline const numtyp & operator()(const int row, const int col) const
|
||||
{ return _array[col]; }
|
||||
|
||||
|
||||
/// Returns pointer to memory pointer for allocation on host
|
||||
inline numtyp ** host_ptr() { return &_array; }
|
||||
|
||||
|
||||
/// Return the offset (in elements) from begin() pointer where data starts
|
||||
/** \note Always 0 for host matrices and CUDA APIs **/
|
||||
inline size_t offset() const { return 0; }
|
||||
/// Return the offset (in bytes) from begin() pointer where data starts
|
||||
/** \note Always 0 for host matrices and CUDA APIs **/
|
||||
inline size_t byteoff() const { return 0; }
|
||||
|
||||
|
||||
#ifdef _OCL_MAT
|
||||
/// For OpenCL, returns a reference to the cl_mem object
|
||||
inline device_ptr & cbegin() { return _carray; }
|
||||
/// For OpenCL, returns a reference to the cl_mem object
|
||||
inline const device_ptr & cbegin() const { return _carray; }
|
||||
#endif
|
||||
|
||||
|
||||
private:
|
||||
numtyp *_array, *_end;
|
||||
size_t _row_bytes, _cols;
|
||||
|
|
|
@ -34,25 +34,25 @@ class UCL_Matrix {
|
|||
ROW_MAJOR = 1,
|
||||
VECTOR = 0
|
||||
};
|
||||
typedef hosttype data_type;
|
||||
typedef hosttype data_type;
|
||||
|
||||
/// Host Allocation
|
||||
UCL_H_Mat<hosttype> host;
|
||||
|
||||
|
||||
/// Device Allocation
|
||||
UCL_D_Mat<devtype> device;
|
||||
|
||||
UCL_Matrix() { }
|
||||
~UCL_Matrix() { }
|
||||
|
||||
|
||||
/// Construct with specified number of rows and columns
|
||||
/** \sa alloc() **/
|
||||
UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
|
||||
UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
|
||||
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
|
||||
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
|
||||
|
||||
|
||||
/// Set up host matrix with specified # of rows/cols and reserve memory
|
||||
/** The kind1 parameter controls memory access from the host
|
||||
* - UCL_READ_WRITE - Specify that you will read and write from host
|
||||
|
@ -74,7 +74,7 @@ class UCL_Matrix {
|
|||
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
|
||||
|
||||
|
||||
/// Set up host matrix with specified # of rows/cols and reserve memory
|
||||
/** The kind1 parameter controls memory access from the host
|
||||
* - UCL_READ_WRITE - Specify that you will read and write from host
|
||||
|
@ -92,9 +92,9 @@ class UCL_Matrix {
|
|||
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
|
||||
|
||||
|
||||
/// Free memory and set size to 0
|
||||
inline void clear()
|
||||
inline void clear()
|
||||
{ host.clear(); device.clear(); }
|
||||
|
||||
/// Resize the allocation to contain cols elements
|
||||
|
@ -106,10 +106,10 @@ class UCL_Matrix {
|
|||
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
dev_resize(device,host,_buffer,rows,cols);
|
||||
}
|
||||
|
||||
|
||||
/// Resize (only if bigger) the allocation to contain cols elements
|
||||
inline int resize_ib(const int new_rows, const int new_cols)
|
||||
{ if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
|
||||
{ if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
|
||||
else return UCL_SUCCESS; }
|
||||
|
||||
/// Set each element to zero (asynchronously on device)
|
||||
|
@ -118,14 +118,14 @@ class UCL_Matrix {
|
|||
inline void zero(const int n) { zero(n,cq()); }
|
||||
/// Set each element to zero (asynchronously on device)
|
||||
inline void zero(command_queue &cq) {
|
||||
host.zero();
|
||||
host.zero();
|
||||
if (device.kind()!=UCL_VIEW) device.zero(cq);
|
||||
else if (_buffer.numel()>0) _buffer.zero();
|
||||
}
|
||||
/// Set first n elements to zero (asynchronously on device)
|
||||
inline void zero(const int n, command_queue &cq) {
|
||||
host.zero(n);
|
||||
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
|
||||
inline void zero(const int n, command_queue &cq) {
|
||||
host.zero(n);
|
||||
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
|
||||
else if (_buffer.numel()>0) _buffer.zero();
|
||||
}
|
||||
|
||||
|
@ -136,26 +136,26 @@ class UCL_Matrix {
|
|||
/// Get the number of columns
|
||||
inline size_t cols() const { return host.cols(); }
|
||||
/// Get the memory usage (bytes) of the s-object (including any buffers)
|
||||
inline size_t host_mem_usage()
|
||||
inline size_t host_mem_usage()
|
||||
{ return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); }
|
||||
/// Get the device memory usage (bytes) of the s-object
|
||||
inline size_t device_mem_usage()
|
||||
inline size_t device_mem_usage()
|
||||
{ return device.row_bytes()*device.rows(); }
|
||||
|
||||
|
||||
/// Get element at index i
|
||||
inline hosttype & operator[](const int i) { return host[i]; }
|
||||
/// Get element at index i
|
||||
inline const hosttype & operator[](const int i) const { return host[i]; }
|
||||
/// 2D access to the host element at (row,col)
|
||||
inline hosttype & operator()(const int row, const int col)
|
||||
/// 2D access to the host element at (row,col)
|
||||
inline hosttype & operator()(const int row, const int col)
|
||||
{ return host(row,col); }
|
||||
/// 2D access to the host element at (row,col)
|
||||
/// 2D access to the host element at (row,col)
|
||||
inline const hosttype & operator()(const int row, const int col) const
|
||||
{ return host(row,col); }
|
||||
|
||||
|
||||
/// Returns pointer to memory pointer for allocation on host
|
||||
inline hosttype ** host_ptr() { return host.host_ptr(); }
|
||||
|
||||
|
||||
/// Return the default command queue/stream associated with this data
|
||||
inline command_queue & cq() { return host.cq(); }
|
||||
/// Change the default command queue associated with this data
|
||||
|
@ -172,7 +172,7 @@ class UCL_Matrix {
|
|||
|
||||
|
||||
/// Update the allocation on the host asynchronously
|
||||
inline void update_host()
|
||||
inline void update_host()
|
||||
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
copy(host,device,_buffer,true); }
|
||||
/// Update the allocation on the host (true for asynchronous copy)
|
||||
|
@ -202,7 +202,7 @@ class UCL_Matrix {
|
|||
|
||||
|
||||
/// Update the allocation on the device asynchronously
|
||||
inline void update_device()
|
||||
inline void update_device()
|
||||
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
copy(device,host,_buffer,true); }
|
||||
/// Update the allocation on the device (true for asynchronous copy)
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -53,9 +53,9 @@ typedef struct _double4 double4;
|
|||
#define BLOCK_SIZE_Y blockDim.y
|
||||
#define __kernel extern "C" __global__
|
||||
#define __local __shared__
|
||||
#define __global
|
||||
#define __global
|
||||
#define atom_add atomicAdd
|
||||
#define ucl_inline static __inline__ __device__
|
||||
#define ucl_inline static __inline__ __device__
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -17,10 +17,10 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
||||
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
|
||||
#ifdef UCL_PRINT_ALLOW
|
||||
|
||||
|
@ -40,7 +40,7 @@ template <> struct _ucl_print<1> {
|
|||
}
|
||||
template <class mat_type>
|
||||
static inline void p(mat_type &mat, const size_t rows, const size_t cols,
|
||||
std::ostream &out, const std::string delim,
|
||||
std::ostream &out, const std::string delim,
|
||||
const std::string row_delim) {
|
||||
int offset=0;
|
||||
int row_size=cols;
|
||||
|
@ -58,12 +58,12 @@ template <> struct _ucl_print<1> {
|
|||
}
|
||||
template <class mat_type>
|
||||
static inline void p(const mat_type &mat,const size_t rows,const size_t cols,
|
||||
std::ostream &out,const std::string delim,
|
||||
std::ostream &out,const std::string delim,
|
||||
const std::string row_delim, UCL_Device &dev) {
|
||||
p(mat,rows,cols,out,delim,row_delim);
|
||||
p(mat,rows,cols,out,delim,row_delim);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <int mem> struct _ucl_print {
|
||||
template <class mat_type>
|
||||
static inline void p(mat_type &mat, const size_t n, std::ostream &out,
|
||||
|
@ -83,7 +83,7 @@ template <int mem> struct _ucl_print {
|
|||
}
|
||||
template <class mat_type>
|
||||
static inline void p(mat_type &mat, const size_t rows, const size_t cols,
|
||||
std::ostream &out, const std::string delim,
|
||||
std::ostream &out, const std::string delim,
|
||||
const std::string row_delim) {
|
||||
UCL_H_Vec<typename mat_type::data_type> temp;
|
||||
temp.alloc(mat.rows()*mat.cols(),mat);
|
||||
|
@ -91,12 +91,12 @@ template <int mem> struct _ucl_print {
|
|||
ucl_copy(temp,mat,rows*cols,false);
|
||||
else
|
||||
ucl_copy(temp,mat,rows,cols,false);
|
||||
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
|
||||
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
|
||||
}
|
||||
template <class mat_type>
|
||||
static inline void p(const mat_type &mat, const size_t rows,
|
||||
static inline void p(const mat_type &mat, const size_t rows,
|
||||
const size_t cols,std::ostream &out,
|
||||
const std::string delim,
|
||||
const std::string delim,
|
||||
const std::string row_delim, UCL_Device &dev) {
|
||||
UCL_H_Vec<typename mat_type::data_type> temp;
|
||||
temp.alloc(mat.rows()*mat.cols(),dev);
|
||||
|
@ -104,9 +104,9 @@ template <int mem> struct _ucl_print {
|
|||
ucl_copy(temp,mat,rows*cols,false);
|
||||
else
|
||||
ucl_copy(temp,mat,rows,cols,false);
|
||||
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
|
||||
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// - Non-const routines that do not require a device object
|
||||
|
@ -123,13 +123,13 @@ inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out,
|
|||
}
|
||||
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim);
|
||||
}
|
||||
|
||||
|
||||
/// Outputs n elements of mat delimited by a space
|
||||
template <class mat_type>
|
||||
inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) {
|
||||
ucl_print(mat,n,out," ");
|
||||
}
|
||||
|
||||
|
||||
/// Outputs n elements of mat delimited by a space to standard out
|
||||
template <class mat_type>
|
||||
inline void ucl_print(mat_type &mat, const size_t n) {
|
||||
|
@ -139,8 +139,8 @@ inline void ucl_print(mat_type &mat, const size_t n) {
|
|||
/// Outputs upper left rows and cols of mat delimited by the string delim
|
||||
template <class mat_type>
|
||||
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
|
||||
std::ostream &out, const std::string delim,
|
||||
const std::string row_delim) {
|
||||
std::ostream &out, const std::string delim,
|
||||
const std::string row_delim) {
|
||||
if (rows*cols>mat.numel()) {
|
||||
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
|
||||
<< "that only has " << mat.numel() << " elements.";
|
||||
|
@ -148,17 +148,17 @@ inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
|
|||
}
|
||||
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim);
|
||||
}
|
||||
|
||||
|
||||
/// Outputs upper left rows and cols of mat delimited by a space
|
||||
template <class mat_type>
|
||||
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
|
||||
std::ostream &out) {
|
||||
ucl_print(mat,rows,cols,out," ","\n");
|
||||
}
|
||||
|
||||
|
||||
/// Outputs upper left rows and cols of mat delimited by a space to std out
|
||||
template <class mat_type>
|
||||
inline void ucl_print(mat_type &mat, const size_t rows,
|
||||
inline void ucl_print(mat_type &mat, const size_t rows,
|
||||
const size_t cols) {
|
||||
ucl_print(mat,rows,cols,std::cout," ","\n");
|
||||
}
|
||||
|
@ -177,7 +177,7 @@ inline void ucl_print(mat_type &mat, std::ostream &out) {
|
|||
else
|
||||
ucl_print(mat,mat.rows(),mat.cols(),out," ","\n");
|
||||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// - Const routines that require a device object
|
||||
// -------------------------------------------------------------------------
|
||||
|
@ -193,14 +193,14 @@ inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
|
|||
}
|
||||
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim,dev);
|
||||
}
|
||||
|
||||
|
||||
/// Outputs n elements of mat delimited by a space
|
||||
template <class mat_type>
|
||||
inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
|
||||
UCL_Device &dev) {
|
||||
ucl_print(mat,n,out," ",dev);
|
||||
}
|
||||
|
||||
|
||||
/// Outputs n elements of mat delimited by a space to standard out
|
||||
template <class mat_type>
|
||||
inline void ucl_print(const mat_type &mat, const size_t n,
|
||||
|
@ -211,7 +211,7 @@ inline void ucl_print(const mat_type &mat, const size_t n,
|
|||
/// Outputs upper left rows and cols of mat delimited by the string delim
|
||||
template <class mat_type>
|
||||
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
|
||||
std::ostream &out, const std::string delim,
|
||||
std::ostream &out, const std::string delim,
|
||||
const std::string row_delim, UCL_Device &dev) {
|
||||
if (rows*cols>mat.numel()) {
|
||||
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
|
||||
|
@ -220,17 +220,17 @@ inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
|
|||
}
|
||||
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim,dev);
|
||||
}
|
||||
|
||||
|
||||
/// Outputs upper left rows and cols of mat delimited by a space
|
||||
template <class mat_type>
|
||||
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
|
||||
std::ostream &out, UCL_Device &dev) {
|
||||
ucl_print(mat,rows,cols,out," ","\n",dev);
|
||||
}
|
||||
|
||||
|
||||
/// Outputs upper left rows and cols of mat delimited by a space to std out
|
||||
template <class mat_type>
|
||||
inline void ucl_print(const mat_type &mat, const size_t rows,
|
||||
inline void ucl_print(const mat_type &mat, const size_t rows,
|
||||
const size_t cols, UCL_Device &dev) {
|
||||
ucl_print(mat,rows,cols,std::cout," ","\n",dev);
|
||||
}
|
||||
|
@ -256,27 +256,27 @@ inline void ucl_print(const mat_type &mat, std::ostream &out, UCL_Device &dev) {
|
|||
|
||||
template <class numtyp>
|
||||
inline std::ostream & operator << (std::ostream &out, UCL_H_Vec<numtyp> &mat)
|
||||
{ ucl_print(mat,out); return out; }
|
||||
{ ucl_print(mat,out); return out; }
|
||||
|
||||
template <class numtyp>
|
||||
inline std::ostream & operator << (std::ostream &out, UCL_H_Mat<numtyp> &mat)
|
||||
{ ucl_print(mat,out); return out; }
|
||||
{ ucl_print(mat,out); return out; }
|
||||
|
||||
template <class numtyp>
|
||||
inline std::ostream & operator << (std::ostream &out, UCL_D_Vec<numtyp> &mat)
|
||||
{ ucl_print(mat,out); return out; }
|
||||
{ ucl_print(mat,out); return out; }
|
||||
|
||||
template <class numtyp>
|
||||
inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
|
||||
{ ucl_print(mat,out); return out; }
|
||||
{ ucl_print(mat,out); return out; }
|
||||
|
||||
|
||||
template <class t1, class t2>
|
||||
inline std::ostream & operator << (std::ostream &out, UCL_Vector<t1,t2> &mat)
|
||||
{ ucl_print(mat.host,out); return out; }
|
||||
{ ucl_print(mat.host,out); return out; }
|
||||
|
||||
template <class t1, class t2>
|
||||
inline std::ostream & operator << (std::ostream &out, UCL_Matrix<t1,t2> &mat)
|
||||
{ ucl_print(mat.host,out); return out; }
|
||||
{ ucl_print(mat.host,out); return out; }
|
||||
|
||||
#endif
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
-------------------
|
||||
W. Michael Brown
|
||||
|
||||
Helper routines for allocating memory for s-objects and performing
|
||||
Helper routines for allocating memory for s-objects and performing
|
||||
host/device updates. (Different routines depending on whether the
|
||||
same type is used on the host and device).
|
||||
|
||||
|
@ -141,29 +141,29 @@ template <> struct _ucl_s_obj_help<1> {
|
|||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
const bool async) {
|
||||
ucl_copy(dst,src,cols,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
command_queue &cq) {
|
||||
ucl_copy(dst,src,cols,cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
t3 &buffer, const bool async) {
|
||||
ucl_copy(dst,src,rows,cols,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
t3 &buffer, command_queue &cq) {
|
||||
ucl_copy(dst,src,rows,cols,cq);
|
||||
}
|
||||
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
|
||||
if (device.kind()==UCL_VIEW) {
|
||||
|
@ -181,7 +181,7 @@ template <> struct _ucl_s_obj_help<1> {
|
|||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
|
||||
const int cols) {
|
||||
if (device.kind()==UCL_VIEW) {
|
||||
device.view(host);
|
||||
|
@ -255,7 +255,7 @@ template <int st> struct _ucl_s_obj_help {
|
|||
e1=_buffer.alloc(cols,cq,kind1);
|
||||
if (e1!=UCL_SUCCESS)
|
||||
return e1;
|
||||
return device.alloc(cols,cq,kind2);
|
||||
return device.alloc(cols,cq,kind2);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -314,7 +314,7 @@ template <int st> struct _ucl_s_obj_help {
|
|||
e1=_buffer.alloc(rows,cols,cq,kind1);
|
||||
if (e1!=UCL_SUCCESS)
|
||||
return e1;
|
||||
return device.alloc(rows,cols,cq,kind2);
|
||||
return device.alloc(rows,cols,cq,kind2);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -329,25 +329,25 @@ template <int st> struct _ucl_s_obj_help {
|
|||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
const bool async) {
|
||||
ucl_cast_copy(dst,src,cols,buffer,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
command_queue &cq) {
|
||||
ucl_cast_copy(dst,src,cols,buffer,cq);
|
||||
}
|
||||
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
t3 &buffer, const bool async) {
|
||||
ucl_cast_copy(dst,src,rows,cols,buffer,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
t3 &buffer, command_queue &cq) {
|
||||
ucl_cast_copy(dst,src,rows,cols,buffer,cq);
|
||||
}
|
||||
|
@ -373,7 +373,7 @@ template <int st> struct _ucl_s_obj_help {
|
|||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
|
||||
const int cols) {
|
||||
int err=buff.resize(rows,cols);
|
||||
if (err!=UCL_SUCCESS)
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/* -----------------------------------------------------------------------
|
||||
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
certain rights in this software. This software is distributed under
|
||||
the Simplified BSD License.
|
||||
----------------------------------------------------------------------- */
|
||||
|
||||
|
@ -26,65 +26,65 @@
|
|||
|
||||
// Assign an integer id based on the data type: (int, float, double, etc)
|
||||
template <class eltype> struct _UCL_DATA_ID;
|
||||
template <> struct _UCL_DATA_ID<double> {
|
||||
template <> struct _UCL_DATA_ID<double> {
|
||||
enum { id=1 };
|
||||
static inline const char * name() { return "double"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=double"; }
|
||||
static inline const char * name() { return "double"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=double"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<float> {
|
||||
template <> struct _UCL_DATA_ID<float> {
|
||||
enum { id=2 };
|
||||
static inline const char * name() { return "float"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=float"; }
|
||||
static inline const char * name() { return "float"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=float"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<unsigned> {
|
||||
template <> struct _UCL_DATA_ID<unsigned> {
|
||||
enum { id=3 };
|
||||
static inline const char * name() { return "unsigned"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; }
|
||||
static inline const char * name() { return "unsigned"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<int> {
|
||||
template <> struct _UCL_DATA_ID<int> {
|
||||
enum { id=4 };
|
||||
static inline const char * name() { return "int"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=int"; }
|
||||
static inline const char * name() { return "int"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=int"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<char> {
|
||||
template <> struct _UCL_DATA_ID<char> {
|
||||
enum { id=5 };
|
||||
static inline const char * name() { return "char"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=char"; }
|
||||
static inline const char * name() { return "char"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=char"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<unsigned char> {
|
||||
template <> struct _UCL_DATA_ID<unsigned char> {
|
||||
enum { id=6 };
|
||||
static inline const char * name() { return "unsigned char"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; }
|
||||
static inline const char * name() { return "unsigned char"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<short> {
|
||||
template <> struct _UCL_DATA_ID<short> {
|
||||
enum { id=7 };
|
||||
static inline const char * name() { return "short"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=short"; }
|
||||
static inline const char * name() { return "short"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=short"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<unsigned short> {
|
||||
template <> struct _UCL_DATA_ID<unsigned short> {
|
||||
enum { id=8 };
|
||||
static inline const char * name() { return "unsigned short"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; }
|
||||
static inline const char * name() { return "unsigned short"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<long> {
|
||||
template <> struct _UCL_DATA_ID<long> {
|
||||
enum { id=9 };
|
||||
static inline const char * name() { return "long"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=long"; }
|
||||
static inline const char * name() { return "long"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=long"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<unsigned long> {
|
||||
template <> struct _UCL_DATA_ID<unsigned long> {
|
||||
enum { id=10 };
|
||||
static inline const char * name() { return "unsigned long"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; }
|
||||
static inline const char * name() { return "unsigned long"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; }
|
||||
};
|
||||
template <> struct _UCL_DATA_ID<long double> {
|
||||
template <> struct _UCL_DATA_ID<long double> {
|
||||
enum { id=11 };
|
||||
static inline const char * name() { return "long double"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; }
|
||||
static inline const char * name() { return "long double"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; }
|
||||
};
|
||||
template <class eltype> struct _UCL_DATA_ID {
|
||||
template <class eltype> struct _UCL_DATA_ID {
|
||||
enum { id=0 };
|
||||
static inline const char * name() { return "error_type"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; }
|
||||
static inline const char * name() { return "error_type"; }
|
||||
static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; }
|
||||
};
|
||||
|
||||
// Host memory allocation types
|
||||
|
@ -97,7 +97,7 @@ enum UCL_MEMOPT {
|
|||
UCL_NOT_SPECIFIED
|
||||
};
|
||||
|
||||
enum UCL_DEVICE_TYPE {
|
||||
enum UCL_DEVICE_TYPE {
|
||||
UCL_DEFAULT, ///< Unknown device type
|
||||
UCL_CPU, ///< Device is a CPU
|
||||
UCL_GPU, ///< Device is a GPU
|
||||
|
@ -111,7 +111,7 @@ enum UCL_ERROR_FLAG {
|
|||
UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found
|
||||
UCL_COMPILE_ERROR, ///< Error compiling kernel
|
||||
UCL_MEMORY_ERROR
|
||||
};
|
||||
};
|
||||
|
||||
template <class numtyp>
|
||||
const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }
|
||||
|
|
|
@ -34,25 +34,25 @@ class UCL_Vector {
|
|||
ROW_MAJOR = 1,
|
||||
VECTOR = 1
|
||||
};
|
||||
typedef hosttype data_type;
|
||||
typedef hosttype data_type;
|
||||
|
||||
/// Host Allocation
|
||||
UCL_H_Vec<hosttype> host;
|
||||
|
||||
|
||||
/// Device Allocation
|
||||
UCL_D_Vec<devtype> device;
|
||||
|
||||
|
||||
UCL_Vector() { }
|
||||
~UCL_Vector() { }
|
||||
|
||||
/// Construct with n columns
|
||||
/** \sa alloc() **/
|
||||
UCL_Vector(const size_t cols, UCL_Device &acc,
|
||||
UCL_Vector(const size_t cols, UCL_Device &acc,
|
||||
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
|
||||
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
|
||||
|
||||
|
||||
/// Set up the vector with 'cols' columns and reserve memory
|
||||
/** The kind1 parameter controls memory access from the host
|
||||
* - UCL_READ_WRITE - Specify that you will read and write from host
|
||||
|
@ -89,12 +89,12 @@ class UCL_Vector {
|
|||
* \return UCL_SUCCESS if the memory allocation is successful **/
|
||||
inline int alloc(const size_t cols, UCL_Device &acc,
|
||||
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
|
||||
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
|
||||
|
||||
|
||||
/// Free memory and set size to 0
|
||||
inline void clear()
|
||||
inline void clear()
|
||||
{ host.clear(); device.clear(); }
|
||||
|
||||
/// Resize the allocation to contain cols elements
|
||||
|
@ -106,7 +106,7 @@ class UCL_Vector {
|
|||
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
dev_resize(device,host,_buffer,cols);
|
||||
}
|
||||
|
||||
|
||||
/// Resize (only if bigger) the allocation to contain cols elements
|
||||
inline int resize_ib(const int new_cols)
|
||||
{ if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
|
||||
|
@ -117,14 +117,14 @@ class UCL_Vector {
|
|||
inline void zero(const int n) { zero(n,cq()); }
|
||||
/// Set each element to zero (asynchronously on device)
|
||||
inline void zero(command_queue &cq) {
|
||||
host.zero();
|
||||
host.zero();
|
||||
if (device.kind()!=UCL_VIEW) device.zero(cq);
|
||||
else if (_buffer.numel()>0) _buffer.zero();
|
||||
}
|
||||
/// Set first n elements to zero (asynchronously on device)
|
||||
inline void zero(const int n, command_queue &cq) {
|
||||
host.zero(n);
|
||||
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
|
||||
inline void zero(const int n, command_queue &cq) {
|
||||
host.zero(n);
|
||||
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
|
||||
else if (_buffer.numel()>0) _buffer.zero();
|
||||
}
|
||||
|
||||
|
@ -135,27 +135,27 @@ class UCL_Vector {
|
|||
/// Get the number of columns
|
||||
inline size_t cols() const { return host.cols(); }
|
||||
/// Get the memory usage (bytes) of the s-object (including any buffers)
|
||||
inline size_t host_mem_usage()
|
||||
inline size_t host_mem_usage()
|
||||
{ return host.row_bytes()+_buffer.row_bytes(); }
|
||||
/// Get the device memory usage (bytes) of the s-object
|
||||
inline size_t device_mem_usage()
|
||||
inline size_t device_mem_usage()
|
||||
{ return device.row_bytes(); }
|
||||
|
||||
|
||||
|
||||
|
||||
/// Get element at index i
|
||||
inline hosttype & operator[](const int i) { return host[i]; }
|
||||
/// Get element at index i
|
||||
inline const hosttype & operator[](const int i) const { return host[i]; }
|
||||
/// 2D access (row should always be 0)
|
||||
inline hosttype & operator()(const int row, const int col)
|
||||
/// 2D access (row should always be 0)
|
||||
inline hosttype & operator()(const int row, const int col)
|
||||
{ return host[col]; }
|
||||
/// 2D access (row should always be 0)
|
||||
/// 2D access (row should always be 0)
|
||||
inline const hosttype & operator()(const int row, const int col) const
|
||||
{ return host[col]; }
|
||||
|
||||
|
||||
/// Returns pointer to memory pointer for allocation on host
|
||||
inline hosttype ** host_ptr() { return host.host_ptr(); }
|
||||
|
||||
|
||||
/// Return the default command queue/stream associated with this data
|
||||
inline command_queue & cq() { return host.cq(); }
|
||||
/// Change the default command queue associated with this data
|
||||
|
@ -172,7 +172,7 @@ class UCL_Vector {
|
|||
|
||||
|
||||
/// Update the allocation on the host asynchronously
|
||||
inline void update_host()
|
||||
inline void update_host()
|
||||
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
copy(host,device,_buffer,true); }
|
||||
/// Update the allocation on the host (true for asynchronous copy)
|
||||
|
@ -202,7 +202,7 @@ class UCL_Vector {
|
|||
|
||||
|
||||
/// Update the allocation on the device asynchronously
|
||||
inline void update_device()
|
||||
inline void update_device()
|
||||
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||
copy(device,host,_buffer,true); }
|
||||
/// Update the allocation on the device (true for asynchronous copy)
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -24,7 +24,7 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AnswerT::bytes_per_atom() const {
|
||||
int AnswerT::bytes_per_atom() const {
|
||||
int bytes=11*sizeof(acctyp);
|
||||
if (_rot)
|
||||
bytes+=4*sizeof(acctyp);
|
||||
|
@ -38,19 +38,19 @@ bool AnswerT::alloc(const int inum) {
|
|||
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
|
||||
|
||||
bool success=true;
|
||||
|
||||
|
||||
_ans_fields=4;
|
||||
if (_rot)
|
||||
_ans_fields+=4;
|
||||
|
||||
|
||||
// --------------------------- Device allocations
|
||||
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
|
||||
UCL_READ_WRITE)==UCL_SUCCESS);
|
||||
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY,
|
||||
UCL_READ_WRITE)==UCL_SUCCESS);
|
||||
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
|
||||
|
||||
_allocated=true;
|
||||
|
||||
_allocated=true;
|
||||
return success;
|
||||
}
|
||||
|
||||
|
@ -69,21 +69,21 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot,
|
|||
if (_charge)
|
||||
_e_fields++;
|
||||
_ev_fields=6+_e_fields;
|
||||
|
||||
|
||||
// Initialize atom and nbor data
|
||||
int ef_inum=inum;
|
||||
if (ef_inum==0)
|
||||
ef_inum=1000;
|
||||
|
||||
|
||||
// Initialize timers for the selected device
|
||||
time_answer.init(*dev);
|
||||
time_answer.zero();
|
||||
_time_cast=0.0;
|
||||
_time_cpu_idle=0.0;
|
||||
|
||||
|
||||
return success && alloc(ef_inum);
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool AnswerT::add_fields(const bool charge, const bool rot) {
|
||||
bool realloc=false;
|
||||
|
@ -127,15 +127,15 @@ void AnswerT::clear() {
|
|||
template <class numtyp, class acctyp>
|
||||
double AnswerT::host_memory_usage() const {
|
||||
int atom_bytes=4;
|
||||
if (_charge)
|
||||
if (_charge)
|
||||
atom_bytes+=1;
|
||||
if (_rot)
|
||||
if (_rot)
|
||||
atom_bytes+=4;
|
||||
int ans_bytes=atom_bytes+_ev_fields;
|
||||
return ans_bytes*(_max_local)*sizeof(acctyp)+
|
||||
sizeof(Answer<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom) {
|
||||
|
@ -144,8 +144,8 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
|||
_vflag=vflag;
|
||||
_ef_atom=ef_atom;
|
||||
_vf_atom=vf_atom;
|
||||
|
||||
int csize=_ev_fields;
|
||||
|
||||
int csize=_ev_fields;
|
||||
if (!eflag)
|
||||
csize-=_e_fields;
|
||||
if (!vflag)
|
||||
|
@ -180,7 +180,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
|||
for (int i=0; i<_inum; i++)
|
||||
evdwl+=engv[i];
|
||||
if (_ef_atom)
|
||||
if (_ilist==NULL)
|
||||
if (_ilist==NULL)
|
||||
for (int i=0; i<_inum; i++)
|
||||
eatom[i]+=engv[i];
|
||||
else
|
||||
|
@ -196,18 +196,18 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
|||
if (_vf_atom)
|
||||
if (_ilist==NULL) {
|
||||
int ii=0;
|
||||
for (int i=vstart; i<iend; i++)
|
||||
for (int i=vstart; i<iend; i++)
|
||||
vatom[ii++][j]+=engv[i];
|
||||
} else {
|
||||
int ii=0;
|
||||
for (int i=vstart; i<iend; i++)
|
||||
for (int i=vstart; i<iend; i++)
|
||||
vatom[_ilist[ii++]][j]+=engv[i];
|
||||
}
|
||||
vstart+=_inum;
|
||||
iend+=_inum;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return evdwl;
|
||||
}
|
||||
|
||||
|
@ -242,8 +242,8 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
|||
}
|
||||
vstart=iend;
|
||||
iend+=_inum;
|
||||
}
|
||||
if (_vflag) {
|
||||
}
|
||||
if (_vflag) {
|
||||
for (int j=0; j<6; j++) {
|
||||
for (int i=vstart; i<iend; i++)
|
||||
virial[j]+=engv[i];
|
||||
|
@ -254,12 +254,12 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
|||
} else {
|
||||
for (int i=vstart, ii=0; i<iend; i++)
|
||||
vatom[_ilist[ii++]][j]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart+=_inum;
|
||||
iend+=_inum;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return evdwl;
|
||||
}
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -30,7 +30,7 @@ AtomT::Atom() : _compiled(false),_allocated(false),
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AtomT::bytes_per_atom() const {
|
||||
int AtomT::bytes_per_atom() const {
|
||||
int id_space=0;
|
||||
if (_gpu_nbor==1)
|
||||
id_space=2;
|
||||
|
@ -51,7 +51,7 @@ bool AtomT::alloc(const int nall) {
|
|||
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
|
||||
bool success=true;
|
||||
|
||||
|
||||
// Ignore host/device transfers?
|
||||
_host_view=false;
|
||||
if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) {
|
||||
|
@ -60,11 +60,11 @@ bool AtomT::alloc(const int nall) {
|
|||
assert(0==1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Allocate storage for CUDPP sort
|
||||
#ifdef USE_CUDPP
|
||||
if (_gpu_nbor==1) {
|
||||
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
|
||||
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
|
||||
if (CUDPP_SUCCESS != result)
|
||||
return false;
|
||||
}
|
||||
|
@ -110,7 +110,7 @@ bool AtomT::alloc(const int nall) {
|
|||
} else {
|
||||
success=success && (host_particle_id.alloc(_max_atoms,*dev,
|
||||
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
success=success &&
|
||||
success=success &&
|
||||
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
|
||||
}
|
||||
if (_gpu_nbor==2 && _host_view)
|
||||
|
@ -124,8 +124,8 @@ bool AtomT::alloc(const int nall) {
|
|||
gpu_bytes+=x.device.row_bytes();
|
||||
if (gpu_bytes>_max_gpu_bytes)
|
||||
_max_gpu_bytes=gpu_bytes;
|
||||
|
||||
_allocated=true;
|
||||
|
||||
_allocated=true;
|
||||
return success;
|
||||
}
|
||||
|
||||
|
@ -135,7 +135,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
|||
bool success=true;
|
||||
// Ignore host/device transfers?
|
||||
int gpu_bytes=0;
|
||||
|
||||
|
||||
if (charge && _charge==false) {
|
||||
_charge=true;
|
||||
_other=true;
|
||||
|
@ -179,7 +179,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
|||
_gpu_nbor=gpu_nbor;
|
||||
#ifdef USE_CUDPP
|
||||
if (_gpu_nbor==1) {
|
||||
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
|
||||
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
|
||||
if (CUDPP_SUCCESS != result)
|
||||
return false;
|
||||
}
|
||||
|
@ -198,9 +198,9 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
|||
} else {
|
||||
success=success && (host_particle_id.alloc(_max_atoms,*dev,
|
||||
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
success=success &&
|
||||
success=success &&
|
||||
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return success;
|
||||
|
@ -230,7 +230,7 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
|
|||
int ef_nall=nall;
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
|
||||
|
||||
// Initialize timers for the selected device
|
||||
time_pos.init(*dev);
|
||||
time_q.init(*dev);
|
||||
|
@ -241,14 +241,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
|
|||
time_quat.zero();
|
||||
time_vel.zero();
|
||||
_time_cast=0.0;
|
||||
|
||||
|
||||
#ifdef GPU_CAST
|
||||
compile_kernels(*dev);
|
||||
#endif
|
||||
|
||||
|
||||
return success && alloc(ef_nall);
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AtomT::clear_resize() {
|
||||
if (!_allocated)
|
||||
|
@ -274,7 +274,7 @@ void AtomT::clear_resize() {
|
|||
#ifdef USE_CUDPP
|
||||
if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
|
||||
#endif
|
||||
|
||||
|
||||
if (_gpu_nbor==2) {
|
||||
host_particle_id.clear();
|
||||
host_cell_id.clear();
|
||||
|
@ -305,21 +305,21 @@ void AtomT::clear() {
|
|||
template <class numtyp, class acctyp>
|
||||
double AtomT::host_memory_usage() const {
|
||||
int atom_bytes=4;
|
||||
if (_charge)
|
||||
if (_charge)
|
||||
atom_bytes+=1;
|
||||
if (_rot)
|
||||
if (_rot)
|
||||
atom_bytes+=4;
|
||||
if (_vel)
|
||||
if (_vel)
|
||||
atom_bytes+=4;
|
||||
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
|
||||
// Sort arrays for neighbor list calculation
|
||||
template <class numtyp, class acctyp>
|
||||
void AtomT::sort_neighbor(const int num_atoms) {
|
||||
#ifdef USE_CUDPP
|
||||
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
|
||||
(int *)dev_particle_id.begin(),
|
||||
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
|
||||
(int *)dev_particle_id.begin(),
|
||||
8*sizeof(unsigned), num_atoms);
|
||||
if (CUDPP_SUCCESS != result) {
|
||||
printf("Error in cudppSort\n");
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -17,9 +17,9 @@
|
|||
#include "lal_preprocessor.h"
|
||||
#endif
|
||||
|
||||
__kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
|
||||
__kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
|
||||
const __global double *restrict x,
|
||||
const __global int *restrict type,
|
||||
const __global int *restrict type,
|
||||
const int nall) {
|
||||
int ii=GLOBAL_ID_X;
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -57,19 +57,19 @@ class Atom {
|
|||
|
||||
/// Set number of local+ghost atoms for future copy operations
|
||||
inline void nall(const int n) { _nall=n; }
|
||||
|
||||
|
||||
/// Memory usage per atom in this class
|
||||
int bytes_per_atom() const;
|
||||
int bytes_per_atom() const;
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param rot True if atom storage needs quaternions
|
||||
* \param gpu_nbor 0 if neighboring will be performed on host
|
||||
* gpu_nbor 1 if neighboring will be performed on device
|
||||
* gpu_nbor 2 if binning on host and neighboring on device **/
|
||||
bool init(const int nall, const bool charge, const bool rot,
|
||||
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
|
||||
bool init(const int nall, const bool charge, const bool rot,
|
||||
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
|
||||
const bool vel=false);
|
||||
|
||||
|
||||
/// Check if we have enough device storage and realloc if not
|
||||
/** Returns true if resized with any call during this timestep **/
|
||||
inline bool resize(const int nall, bool &success) {
|
||||
|
@ -81,7 +81,7 @@ class Atom {
|
|||
}
|
||||
return _resized;
|
||||
}
|
||||
|
||||
|
||||
/// If already initialized by another LAMMPS style, add fields as necessary
|
||||
/** \param rot True if atom storage needs quaternions
|
||||
* \param gpu_nbor 0 if neighboring will be performed on host
|
||||
|
@ -89,28 +89,28 @@ class Atom {
|
|||
* gpu_nbor 2 if binning on host and neighboring on device **/
|
||||
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
|
||||
const bool bonds, const bool vel=false);
|
||||
|
||||
|
||||
/// Returns true if GPU is using charges
|
||||
bool charge() { return _charge; }
|
||||
|
||||
|
||||
/// Returns true if GPU is using quaternions
|
||||
bool quaternion() { return _rot; }
|
||||
|
||||
|
||||
/// Returns true if GPU is using velocities
|
||||
bool velocity() { return _vel; }
|
||||
|
||||
/// Only free matrices of length inum or nall for resizing
|
||||
void clear_resize();
|
||||
|
||||
|
||||
/// Free all memory on host and device
|
||||
void clear();
|
||||
|
||||
|
||||
/// Return the total amount of host memory used by class in bytes
|
||||
double host_memory_usage() const;
|
||||
|
||||
/// Sort arrays for neighbor list calculation on device
|
||||
void sort_neighbor(const int num_atoms);
|
||||
|
||||
|
||||
/// Add copy times to timers
|
||||
inline void acc_timers() {
|
||||
time_pos.add_to_total();
|
||||
|
@ -150,18 +150,18 @@ class Atom {
|
|||
total+=time_vel.total_seconds();
|
||||
time_vel.zero_total();
|
||||
}
|
||||
|
||||
|
||||
return total+_time_transfer/1000.0;
|
||||
}
|
||||
|
||||
|
||||
/// Return the total time for data cast/pack
|
||||
/** Zeros the time so that atom times are only included once **/
|
||||
inline double cast_time()
|
||||
inline double cast_time()
|
||||
{ double t=_time_cast; _time_cast=0.0; return t; }
|
||||
|
||||
/// Pack LAMMPS atom type constants into matrix and copy to device
|
||||
template <class dev_typ, class t1>
|
||||
inline void type_pack1(const int n, const int m_size,
|
||||
inline void type_pack1(const int n, const int m_size,
|
||||
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
|
||||
t1 **one) {
|
||||
int ii=0;
|
||||
|
@ -215,7 +215,7 @@ class Atom {
|
|||
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
|
||||
/// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
|
||||
template <class dev_typ, class t1, class t2, class t3, class t4>
|
||||
inline void type_pack4(const int n, const int m_size,
|
||||
|
@ -239,7 +239,7 @@ class Atom {
|
|||
|
||||
/// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
|
||||
template <class dev_typ, class t1, class t2>
|
||||
inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
|
||||
inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
|
||||
UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
|
||||
for (int i=0; i<n; i++) {
|
||||
buffer[i*2]=static_cast<numtyp>(one[i][i]);
|
||||
|
@ -279,7 +279,7 @@ class Atom {
|
|||
|
||||
/// Copy positions and types to device asynchronously
|
||||
/** Copies nall() elements **/
|
||||
inline void add_x_data(double **host_ptr, int *host_type) {
|
||||
inline void add_x_data(double **host_ptr, int *host_type) {
|
||||
time_pos.start();
|
||||
if (_x_avail==false) {
|
||||
#ifdef GPU_CAST
|
||||
|
@ -376,7 +376,7 @@ class Atom {
|
|||
|
||||
/// Copy velocities and tags to device asynchronously
|
||||
/** Copies nall() elements **/
|
||||
inline void add_v_data(double **host_ptr, tagint *host_tag) {
|
||||
inline void add_v_data(double **host_ptr, tagint *host_tag) {
|
||||
time_vel.start();
|
||||
if (_v_avail==false) {
|
||||
#ifdef GPU_CAST
|
||||
|
@ -407,8 +407,8 @@ class Atom {
|
|||
inline void add_transfer_time(double t) { _time_transfer+=t; }
|
||||
|
||||
/// Return number of bytes used on device
|
||||
inline double max_gpu_bytes()
|
||||
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
|
||||
inline double max_gpu_bytes()
|
||||
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
|
||||
|
||||
/// Returns true if the device is addressing memory on the host
|
||||
inline bool host_view() { return _host_view; }
|
||||
|
@ -422,7 +422,7 @@ class Atom {
|
|||
/// Quaterions
|
||||
UCL_Vector<numtyp,numtyp> quat;
|
||||
/// Velocities
|
||||
UCL_Vector<numtyp,numtyp> v;
|
||||
UCL_Vector<numtyp,numtyp> v;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
UCL_Vector<double,double> x_cast;
|
||||
|
@ -436,7 +436,7 @@ class Atom {
|
|||
|
||||
/// Atom tag information for device nbor builds
|
||||
UCL_D_Vec<tagint> dev_tag;
|
||||
|
||||
|
||||
/// Cell list identifiers for hybrid nbor builds
|
||||
UCL_H_Vec<int> host_cell_id;
|
||||
/// Cell list identifiers for hybrid nbor builds
|
||||
|
@ -444,7 +444,7 @@ class Atom {
|
|||
|
||||
/// Device timers
|
||||
UCL_Timer time_pos, time_q, time_quat, time_vel;
|
||||
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *dev;
|
||||
|
||||
|
@ -456,19 +456,19 @@ class Atom {
|
|||
#endif
|
||||
|
||||
bool _compiled;
|
||||
|
||||
|
||||
// True if data has been copied to device already
|
||||
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
|
||||
|
||||
bool alloc(const int nall);
|
||||
|
||||
|
||||
bool _allocated, _rot, _charge, _bonds, _vel, _other;
|
||||
int _max_atoms, _nall, _gpu_nbor;
|
||||
bool _host_view;
|
||||
double _time_cast, _time_transfer;
|
||||
|
||||
|
||||
double _max_gpu_bytes;
|
||||
|
||||
|
||||
#ifdef USE_CUDPP
|
||||
CUDPPConfiguration sort_config;
|
||||
CUDPPHandle sort_plan;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -44,7 +44,7 @@ class Balance {
|
|||
_init_done=false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Return the timestep since initialization
|
||||
inline int timestep() { return _timestep; }
|
||||
|
||||
|
@ -96,7 +96,7 @@ class Balance {
|
|||
inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } }
|
||||
|
||||
/// Calculate the new host/device split based on the cpu and device times
|
||||
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
|
||||
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
|
||||
(and first 10) **/
|
||||
inline void balance(const double cpu_time);
|
||||
|
||||
|
@ -105,13 +105,13 @@ class Balance {
|
|||
balance(cpu_time);
|
||||
return get_gpu_count(ago,inum_full);
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
Device<numtyp,acctyp> *_device;
|
||||
UCL_Timer _device_time;
|
||||
bool _init_done;
|
||||
int _gpu_nbor;
|
||||
|
||||
|
||||
bool _load_balance;
|
||||
double _actual_split, _avg_split, _desired_split, _max_split;
|
||||
int _avg_count;
|
||||
|
@ -123,15 +123,15 @@ class Balance {
|
|||
#define BalanceT Balance<numtyp,acctyp>
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BalanceT::init(Device<numtyp, acctyp> *gpu,
|
||||
void BalanceT::init(Device<numtyp, acctyp> *gpu,
|
||||
const int gpu_nbor, const double split) {
|
||||
clear();
|
||||
_gpu_nbor=gpu_nbor;
|
||||
_init_done=true;
|
||||
|
||||
|
||||
_device=gpu;
|
||||
_device_time.init(*gpu->gpu);
|
||||
|
||||
|
||||
if (split<0.0) {
|
||||
_load_balance=true;
|
||||
_desired_split=0.90;
|
||||
|
@ -163,7 +163,7 @@ int BalanceT::get_gpu_count(const int ago, const int inum_full) {
|
|||
_timestep++;
|
||||
return _inum;
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BalanceT::balance(const double cpu_time) {
|
||||
if (_measure_this_step) {
|
||||
|
|
|
@ -9,10 +9,10 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#include "lal_base_atomic.h"
|
||||
using namespace LAMMPS_AL;
|
||||
#define BaseAtomicT BaseAtomic<numtyp, acctyp>
|
||||
|
@ -63,13 +63,13 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
|||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
|
||||
|
@ -139,7 +139,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist,
|
|||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
|
||||
|
||||
return ilist;
|
||||
}
|
||||
|
||||
|
@ -188,7 +188,7 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full,
|
|||
zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
|
@ -217,7 +217,7 @@ template <class numtyp, class acctyp>
|
|||
int ** BaseAtomicT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
|
@ -230,12 +230,12 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full,
|
|||
zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
hd_balancer.balance(cpu_time);
|
||||
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
|
@ -255,7 +255,7 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full,
|
|||
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -41,7 +41,7 @@ class BaseAtomic {
|
|||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -49,8 +49,8 @@ class BaseAtomic {
|
|||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const void *pair_program, const char *k_name);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
|
@ -80,7 +80,7 @@ class BaseAtomic {
|
|||
* \note host_inum is 0 if the host is performing neighboring
|
||||
* \note nlocal+host_inum=total number local particles
|
||||
* \note olist_size=0 **/
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,host_inum,max_nbors,success);
|
||||
}
|
||||
|
@ -119,7 +119,7 @@ class BaseAtomic {
|
|||
/// Build neighbor list on device
|
||||
void build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, bool &success);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
|
@ -133,19 +133,19 @@ class BaseAtomic {
|
|||
int * compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
Device<numtyp,acctyp> *device;
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -64,7 +64,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
|||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
|
||||
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom);
|
||||
|
@ -153,7 +153,7 @@ template <class numtyp, class acctyp>
|
|||
inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
bool &success) {
|
||||
success=true;
|
||||
|
@ -192,7 +192,7 @@ void BaseChargeT::compute(const int f_ago, const int inum_full,
|
|||
zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
|
@ -226,7 +226,7 @@ template <class numtyp, class acctyp>
|
|||
int** BaseChargeT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
|
@ -240,12 +240,12 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
|
|||
zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
hd_balancer.balance(cpu_time);
|
||||
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
|
@ -271,7 +271,7 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
|
|||
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -42,7 +42,7 @@ class BaseCharge {
|
|||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -83,7 +83,7 @@ class BaseCharge {
|
|||
* \note host_inum is 0 if the host is performing neighboring
|
||||
* \note nlocal+host_inum=total number local particles
|
||||
* \note olist_size=0 **/
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,host_inum,max_nbors,success);
|
||||
}
|
||||
|
@ -137,12 +137,12 @@ class BaseCharge {
|
|||
int** compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||
double *charge, double *boxlo, double *prd);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
Device<numtyp,acctyp> *device;
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -65,7 +65,7 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
|
|||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
|
||||
int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom);
|
||||
|
@ -155,7 +155,7 @@ template <class numtyp, class acctyp>
|
|||
inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
bool &success) {
|
||||
success=true;
|
||||
|
@ -194,7 +194,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,
|
|||
zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
|
@ -230,12 +230,12 @@ template <class numtyp, class acctyp>
|
|||
int** BaseDipoleT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success,
|
||||
double *host_q, double **host_mu,
|
||||
double *host_q, double **host_mu,
|
||||
double *boxlo, double *prd) {
|
||||
acc_timers();
|
||||
if (inum_full==0) {
|
||||
|
@ -245,12 +245,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
|
|||
zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
hd_balancer.balance(cpu_time);
|
||||
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
|
@ -279,7 +279,7 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
|
|||
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -40,7 +40,7 @@ class BaseDipole {
|
|||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -82,7 +82,7 @@ class BaseDipole {
|
|||
* \note host_inum is 0 if the host is performing neighboring
|
||||
* \note nlocal+host_inum=total number local particles
|
||||
* \note olist_size=0 **/
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,host_inum,max_nbors,success);
|
||||
}
|
||||
|
@ -136,12 +136,12 @@ class BaseDipole {
|
|||
int** compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||
double *charge, double **mu, double *boxlo, double *prd);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
Device<numtyp,acctyp> *device;
|
||||
|
|
|
@ -64,7 +64,7 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
|
|||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom,true);
|
||||
|
@ -153,7 +153,7 @@ template <class numtyp, class acctyp>
|
|||
inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
bool &success) {
|
||||
success=true;
|
||||
|
@ -182,7 +182,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full,
|
|||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, tagint *tag, double **host_v,
|
||||
bool &success, tagint *tag, double **host_v,
|
||||
const double dtinvsqrt, const int seed, const int timestep,
|
||||
const int nlocal, double *boxlo, double *prd) {
|
||||
acc_timers();
|
||||
|
@ -193,7 +193,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full,
|
|||
zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
|
@ -228,12 +228,12 @@ template <class numtyp, class acctyp>
|
|||
int** BaseDPDT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_v, const double dtinvsqrt,
|
||||
double **host_v, const double dtinvsqrt,
|
||||
const int seed, const int timestep,
|
||||
double *boxlo, double *prd) {
|
||||
acc_timers();
|
||||
|
@ -244,12 +244,12 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
|
|||
zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
hd_balancer.balance(cpu_time);
|
||||
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||
ans->inum(inum);
|
||||
host_start=inum;
|
||||
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
|
@ -276,7 +276,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
|
|||
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ class BaseDPD {
|
|||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -81,7 +81,7 @@ class BaseDPD {
|
|||
* \note host_inum is 0 if the host is performing neighboring
|
||||
* \note nlocal+host_inum=total number local particles
|
||||
* \note olist_size=0 **/
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,host_inum,max_nbors,success);
|
||||
}
|
||||
|
@ -129,20 +129,20 @@ class BaseDPD {
|
|||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success, tagint *tag,
|
||||
double **v, const double dtinvsqrt, const int seed,
|
||||
double **v, const double dtinvsqrt, const int seed,
|
||||
const int timestep, const int nlocal, double *boxlo, double *prd);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int** compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||
double **v, const double dtinvsqrt, const int seed,
|
||||
const int timestep, double *boxlo, double *prd);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
Device<numtyp,acctyp> *device;
|
||||
|
|
|
@ -70,7 +70,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
|
|||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
|
||||
|
||||
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,true,
|
||||
1);
|
||||
|
@ -113,7 +113,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
|
|||
return -8;
|
||||
if (_multiple_forms && gpu_nbor!=0)
|
||||
return -9;
|
||||
|
||||
|
||||
if (_multiple_forms)
|
||||
ans->force.zero();
|
||||
|
||||
|
@ -142,7 +142,7 @@ void BaseEllipsoidT::clear_base() {
|
|||
// Output any timing information
|
||||
output_times();
|
||||
host_olist.clear();
|
||||
|
||||
|
||||
if (_compiled) {
|
||||
k_nbor_fast.clear();
|
||||
k_nbor.clear();
|
||||
|
@ -156,7 +156,7 @@ void BaseEllipsoidT::clear_base() {
|
|||
delete lj_program;
|
||||
_compiled=false;
|
||||
}
|
||||
|
||||
|
||||
time_nbor1.clear();
|
||||
time_ellipsoid.clear();
|
||||
time_nbor2.clear();
|
||||
|
@ -230,7 +230,7 @@ void BaseEllipsoidT::output_times() {
|
|||
if (times[6]>0)
|
||||
fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
|
||||
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
||||
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
|
||||
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
|
||||
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
||||
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
|
||||
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
|
||||
|
@ -241,10 +241,10 @@ void BaseEllipsoidT::output_times() {
|
|||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pack neighbors to limit thread divergence for lj-lj and ellipse
|
||||
// Pack neighbors to limit thread divergence for lj-lj and ellipse
|
||||
// ---------------------------------------------------------------------------
|
||||
template<class numtyp, class acctyp>
|
||||
void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
|
||||
void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
|
||||
const int inum, const int form_low,
|
||||
const int form_high, const bool shared_types,
|
||||
int ntypes) {
|
||||
|
@ -264,18 +264,18 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
|
|||
// Copy neighbor list from host
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
|
||||
void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
|
||||
const int osize, int *ilist,
|
||||
int *numj, int *type, int **firstneigh,
|
||||
bool &success) {
|
||||
success=true;
|
||||
|
||||
|
||||
int mn=nbor->max_nbor_loop(osize,numj,ilist);
|
||||
resize_atom(nall,success);
|
||||
resize_local(inum,0,mn,osize,success);
|
||||
if (!success)
|
||||
return;
|
||||
|
||||
|
||||
if (_multiple_forms) {
|
||||
int p=0;
|
||||
for (int i=0; i<osize; i++) {
|
||||
|
@ -315,7 +315,7 @@ template <class numtyp, class acctyp>
|
|||
inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
bool &success) {
|
||||
success=true;
|
||||
|
@ -354,7 +354,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
|
|||
zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
|
@ -394,7 +394,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
|
|||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_quat) {
|
||||
|
@ -410,7 +410,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
|
|||
ans->inum(inum);
|
||||
_last_ellipse=std::min(inum,_max_last_ellipse);
|
||||
host_start=inum;
|
||||
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
|
@ -419,7 +419,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
|
|||
return NULL;
|
||||
atom->cast_quat_data(host_quat[0]);
|
||||
hd_balancer.start_timer();
|
||||
} else {
|
||||
} else {
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
atom->cast_quat_data(host_quat[0]);
|
||||
hd_balancer.start_timer();
|
||||
|
@ -444,9 +444,9 @@ double BaseEllipsoidT::host_memory_usage_base() const {
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
|
||||
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
|
||||
const void *ellipsoid_string,
|
||||
const void *lj_string,
|
||||
const void *lj_string,
|
||||
const char *kname, const bool e_s) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
|
|
@ -42,7 +42,7 @@ class BaseEllipsoid {
|
|||
* \param gpu_split fraction of particles handled by device
|
||||
* \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -68,7 +68,7 @@ class BaseEllipsoid {
|
|||
quat_tex.bind_float(atom->quat,4);
|
||||
lj_pos_tex.bind_float(atom->x,4);
|
||||
lj_quat_tex.bind_float(atom->quat,4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
|
@ -78,7 +78,7 @@ class BaseEllipsoid {
|
|||
* \param olist_size size of list of particles from CPU neighboring
|
||||
* \note host_inum is 0 if the host is performing neighboring
|
||||
* \note if GPU is neighboring nlocal+host_inum=total number local particles
|
||||
* \note if CPU is neighboring olist_size=total number of local particles
|
||||
* \note if CPU is neighboring olist_size=total number of local particles
|
||||
* \note if GPU is neighboring olist_size=0 **/
|
||||
inline void resize_local(const int nlocal, const int host_inum,
|
||||
const int max_nbors, const int olist_size,
|
||||
|
@ -101,7 +101,7 @@ class BaseEllipsoid {
|
|||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear_base();
|
||||
|
||||
|
||||
/// Output any timing information
|
||||
void output_times();
|
||||
|
||||
|
@ -130,7 +130,7 @@ class BaseEllipsoid {
|
|||
ans->acc_timers();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Zero timers
|
||||
inline void zero_timers() {
|
||||
time_nbor1.zero();
|
||||
|
@ -148,9 +148,9 @@ class BaseEllipsoid {
|
|||
ans->zero_timers();
|
||||
}
|
||||
|
||||
/// Pack neighbors to limit thread divergence for lj-lj and ellipse
|
||||
/// Pack neighbors to limit thread divergence for lj-lj and ellipse
|
||||
void pack_nbors(const int GX, const int BX, const int start, const int inum,
|
||||
const int form_low, const int form_high,
|
||||
const int form_low, const int form_high,
|
||||
const bool shared_types, int ntypes);
|
||||
|
||||
/// Copy neighbor list from host
|
||||
|
@ -174,17 +174,17 @@ class BaseEllipsoid {
|
|||
int** compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||
double **host_quat);
|
||||
|
||||
/// Build neighbor list on accelerator
|
||||
void build_nbor_list(const int inum, const int host_inum, const int nall,
|
||||
void build_nbor_list(const int inum, const int host_inum, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, bool &success);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
Device<numtyp,acctyp> *device;
|
||||
|
@ -207,7 +207,7 @@ class BaseEllipsoid {
|
|||
/// Atom Data
|
||||
Atom<numtyp,acctyp> *atom;
|
||||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// cut_form.x = cutsq, cut_form.y = form
|
||||
UCL_D_Vec<numtyp2> cut_form;
|
||||
|
@ -240,7 +240,7 @@ class BaseEllipsoid {
|
|||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
// True if we want to use fast GB-sphere or sphere-sphere calculations
|
||||
// True if we want to use fast GB-sphere or sphere-sphere calculations
|
||||
bool _multiple_forms;
|
||||
int **_host_form;
|
||||
int _last_ellipse, _max_last_ellipse;
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
begin : Tue April 2, 2013
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
||||
#include "lal_base_three.h"
|
||||
using namespace LAMMPS_AL;
|
||||
#define BaseThreeT BaseThree<numtyp, acctyp>
|
||||
|
@ -45,7 +45,7 @@ int BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const {
|
|||
#ifdef THREE_CONCURRENT
|
||||
b+=ans2->bytes_per_atom();
|
||||
#endif
|
||||
return b;
|
||||
return b;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||
gpu_nbor=1;
|
||||
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
_gpu_nbor=gpu_nbor;
|
||||
|
||||
int _gpu_host=0;
|
||||
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
||||
|
@ -76,7 +77,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||
_nbor_data=&(nbor->dev_nbor);
|
||||
if (_threads_per_atom*_threads_per_atom>device->warp_size())
|
||||
return -10;
|
||||
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom);
|
||||
|
@ -93,7 +94,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||
return -3;
|
||||
ans2->cq(_end_command_queue);
|
||||
#endif
|
||||
|
||||
|
||||
_block_pair=device->pair_block_size();
|
||||
_block_size=device->block_ellipse();
|
||||
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
|
||||
|
@ -111,7 +112,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||
#ifdef THREE_CONCURRENT
|
||||
_max_an_bytes+=ans2->gpu_bytes();
|
||||
#endif
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -158,7 +159,7 @@ void BaseThreeT::clear_atomic() {
|
|||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
bool &success) {
|
||||
success=true;
|
||||
|
||||
|
@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
|
|||
if (!success)
|
||||
return NULL;
|
||||
|
||||
nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
|
||||
// originally the requirement that nall == nlist was enforced
|
||||
// to allow direct indexing neighbors of neighbors after re-arrangement
|
||||
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
|
||||
|
||||
// now the requirement is removed, allowing to work within pair hybrid
|
||||
nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
#ifdef THREE_CONCURRENT
|
||||
|
@ -176,7 +182,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
|
|||
#endif
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
|
||||
|
||||
return ilist;
|
||||
}
|
||||
|
||||
|
@ -185,11 +191,11 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
|
|||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
bool &success) {
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
bool &success) {
|
||||
success=true;
|
||||
resize_atom(inum,nall,success);
|
||||
resize_local(nall,host_inum,nbor->max_nbors(),success);
|
||||
|
@ -214,11 +220,11 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
|
|||
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success) {
|
||||
acc_timers();
|
||||
if (nlist==0) {
|
||||
|
@ -228,9 +234,9 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
|
|||
zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,nlocal,cpu_time);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
ans2->inum(inum);
|
||||
|
@ -270,7 +276,7 @@ template <class numtyp, class acctyp>
|
|||
int ** BaseThreeT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
|
@ -283,7 +289,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
|
|||
zero_timers();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
hd_balancer.balance(cpu_time);
|
||||
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||
ans->inum(inum);
|
||||
|
@ -291,7 +297,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
|
|||
ans2->inum(inum);
|
||||
#endif
|
||||
host_start=inum;
|
||||
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
|
@ -321,7 +327,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
|
|||
device->add_ans_object(ans2);
|
||||
#endif
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
|
@ -352,7 +358,7 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
|||
k_three_end.cq(ucl_device->cq(_end_command_queue));
|
||||
k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
|
||||
#endif
|
||||
|
||||
|
||||
_compiled=true;
|
||||
}
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ class BaseThree {
|
|||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_two name for the kernel for 2-body force calculation
|
||||
* \param k_three name for the kernel for 3-body force calculation
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -53,8 +53,8 @@ class BaseThree {
|
|||
* - -5 Double precision is not supported on card
|
||||
* - -10 if invalid thread_per_atom setting **/
|
||||
int init_three(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const void *pair_program, const char *k_two,
|
||||
const char *k_three_center, const char *k_three_end);
|
||||
|
||||
|
@ -88,7 +88,7 @@ class BaseThree {
|
|||
* \note host_inum is 0 if the host is performing neighboring
|
||||
* \note nlocal+host_inum=total number local particles
|
||||
* \note olist_size=0 **/
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
inline void resize_local(const int inum, const int host_inum,
|
||||
const int max_nbors, bool &success) {
|
||||
nbor->resize(inum,host_inum,max_nbors,success);
|
||||
}
|
||||
|
@ -133,33 +133,33 @@ class BaseThree {
|
|||
/// Build neighbor list on device
|
||||
int build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, bool &success);
|
||||
|
||||
/// Pair loop with host neighboring
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
void compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
||||
const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int * compute(const int ago, const int inum_full, const int nall,
|
||||
int * compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success);
|
||||
|
||||
/// Pair loop with device neighboring
|
||||
int ** compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
Device<numtyp,acctyp> *device;
|
||||
|
@ -186,7 +186,7 @@ class BaseThree {
|
|||
Answer<numtyp,acctyp> *ans;
|
||||
#ifdef THREE_CONCURRENT
|
||||
Answer<numtyp,acctyp> *ans2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// --------------------------- NBOR DATA ----------------------------
|
||||
|
||||
|
@ -205,15 +205,16 @@ class BaseThree {
|
|||
protected:
|
||||
bool _compiled;
|
||||
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
|
||||
int _gpu_nbor;
|
||||
double _max_bytes, _max_an_bytes;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||
const char *k_two, const char *k_three_center,
|
||||
const char *k_three_end);
|
||||
|
||||
virtual void loop(const bool _eflag, const bool _vflag,
|
||||
virtual void loop(const bool _eflag, const bool _vflag,
|
||||
const int evatom) = 0;
|
||||
};
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -33,17 +33,17 @@ BeckT::Beck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BeckT::~Beck() {
|
||||
BeckT::~Beck() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BeckT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BeckT::init(const int ntypes,
|
||||
int BeckT::init(const int ntypes,
|
||||
double **host_cutsq, double **host_aa,
|
||||
double **host_alpha, double **host_beta,
|
||||
double **host_AA, double **host_BB,
|
||||
|
@ -126,7 +126,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -24,7 +24,7 @@ texture<int4,1> pos_tex;
|
|||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void k_beck(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_beck(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict beck1,
|
||||
const __global numtyp4 *restrict beck2,
|
||||
const int lj_types,
|
||||
|
@ -50,20 +50,20 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -76,7 +76,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (rsq<beck2[mtype].z) {
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
|
@ -103,7 +103,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
|
|||
numtyp term1inv = ucl_recip(term1);
|
||||
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
|
||||
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
|
||||
energy+=factor_lj*e;
|
||||
energy+=factor_lj*e;
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
@ -133,7 +133,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
|
|||
const int nbor_pitch, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
|
||||
__local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp sp_lj[4];
|
||||
|
@ -143,7 +143,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
|
|||
beck1[tid]=beck1_in[tid];
|
||||
beck2[tid]=beck2_in[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
|
@ -152,7 +152,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
|
|||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
|
@ -166,7 +166,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
|
|||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -179,7 +179,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
if (rsq<beck2[mtype].z) {
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
numtyp r5 = rsq*rsq*r;
|
||||
|
@ -205,7 +205,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
|
|||
numtyp term1inv = ucl_recip(term1);
|
||||
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
|
||||
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
|
||||
energy+=factor_lj*e;
|
||||
energy+=factor_lj*e;
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
|
|||
class Beck : public BaseAtomic<numtyp, acctyp> {
|
||||
public:
|
||||
Beck();
|
||||
~Beck();
|
||||
~Beck();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -41,8 +41,8 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
|
|||
double **host_aa, double **host_alpha,
|
||||
double **host_beta, double **host_AA,
|
||||
double **host_BB, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen);
|
||||
|
||||
/// Clear all host and device data
|
||||
|
@ -67,7 +67,7 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
private:
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -77,7 +77,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
|
|||
cell_size, gpu_split, screen);
|
||||
|
||||
BLMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -102,8 +102,8 @@ int ** beck_gpu_compute_n(const int ago, const int inum_full,
|
|||
return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void beck_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -33,10 +33,10 @@ BornT::Born() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BornT::~Born() {
|
||||
BornT::~Born() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BornT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
|
@ -44,12 +44,12 @@ int BornT::bytes_per_atom(const int max_nbors) const {
|
|||
|
||||
template <class numtyp, class acctyp>
|
||||
int BornT::init(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_born1, double **host_born2,
|
||||
double **host_rhoinv, double **host_born1, double **host_born2,
|
||||
double **host_born3, double **host_a, double **host_c,
|
||||
double **host_d, double **host_sigma,
|
||||
double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
|
@ -80,7 +80,7 @@ int BornT::init(const int ntypes, double **host_cutsq,
|
|||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_d,host_offset);
|
||||
host_d,host_offset);
|
||||
|
||||
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
|
||||
|
@ -102,18 +102,18 @@ void BornT::reinit(const int ntypes, double **host_rhoinv,
|
|||
double **host_born1, double **host_born2,
|
||||
double **host_born3, double **host_a, double **host_c,
|
||||
double **host_d, double **host_offset) {
|
||||
|
||||
|
||||
// Allocate a host write buffer for data initialization
|
||||
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
|
||||
|
||||
for (int i=0; i<_lj_types*_lj_types; i++)
|
||||
host_write[i]=0.0;
|
||||
|
||||
|
||||
this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
|
||||
host_born1,host_born2,host_born3);
|
||||
this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_d,host_offset);
|
||||
host_d,host_offset);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
@ -151,7 +151,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -169,7 +169,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
|
|||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2,
|
||||
&cutsq_sigma, &_lj_types, &sp_lj,
|
||||
&cutsq_sigma, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->ans->force,
|
||||
&this->ans->engv, &eflag, &vflag, &ainum,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -24,16 +24,16 @@ texture<int4,1> pos_tex;
|
|||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void k_born(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_born(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const __global numtyp2 *restrict cutsq_sigma,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
|
@ -51,20 +51,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -77,17 +77,17 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (r2inv<cutsq_sigma[mtype].x) {
|
||||
numtyp r=ucl_sqrt(r2inv);
|
||||
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
|
||||
r2inv=ucl_recip(r2inv);
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
|
||||
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
|
||||
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
|
||||
force*=factor_lj;
|
||||
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
@ -95,7 +95,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
|
|||
if (eflag>0) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
|
||||
+ coeff2[mtype].z*r2inv*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].w);
|
||||
energy+=factor_lj*(e-coeff2[mtype].w);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
@ -113,20 +113,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
|
|||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1_in,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp2 *restrict cutsq_sigma,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
|
||||
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp sp_lj[4];
|
||||
|
@ -137,7 +137,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
|||
if (eflag>0)
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
|
@ -146,7 +146,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
|||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
|
@ -160,7 +160,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
|||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -173,13 +173,13 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
if (r2inv<cutsq_sigma[mtype].x) {
|
||||
numtyp r=ucl_sqrt(r2inv);
|
||||
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
|
||||
r2inv=ucl_recip(r2inv);
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
|
||||
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
|
||||
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
|
||||
force*=factor_lj;
|
||||
|
||||
|
@ -190,7 +190,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
|
|||
if (eflag>0) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
|
||||
+ coeff2[mtype].z*r2inv*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].w);
|
||||
energy+=factor_lj*(e-coeff2[mtype].w);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
|
|||
class Born : public BaseAtomic<numtyp, acctyp> {
|
||||
public:
|
||||
Born();
|
||||
~Born();
|
||||
~Born();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -38,20 +38,20 @@ class Born : public BaseAtomic<numtyp, acctyp> {
|
|||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_born1, double **host_born2,
|
||||
double **host_rhoinv, double **host_born1, double **host_born2,
|
||||
double **host_born3, double **host_a, double **host_c,
|
||||
double **host_d, double **host_sigma,
|
||||
double **host_d, double **host_sigma,
|
||||
double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen);
|
||||
|
||||
|
||||
/// Send updated coeffs from host to device (to be compatible with fix adapt)
|
||||
void reinit(const int ntypes, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2,
|
||||
double **host_born3, double **host_a, double **host_c,
|
||||
double **host_d, double **host_offset);
|
||||
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
@ -77,7 +77,7 @@ class Born : public BaseAtomic<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
private:
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -37,17 +37,17 @@ template <class numtyp, class acctyp>
|
|||
BornCoulLongT::~BornCoulLongT() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BornCoulLongT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
double **host_sigma, double **host_offset,
|
||||
int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
double **host_sigma, double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
|
@ -84,12 +84,12 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
|
|||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_d,host_offset);
|
||||
|
||||
host_d,host_offset);
|
||||
|
||||
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
|
||||
host_cut_ljsq,host_sigma);
|
||||
|
||||
|
||||
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<4; i++) {
|
||||
host_write[i]=host_special_lj[i];
|
||||
|
@ -142,7 +142,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -157,15 +157,15 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
|||
&this->ans->force,
|
||||
&this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
|
||||
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
|
||||
&_g_ewald, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->q,
|
||||
&nbor_pitch, &this->atom->q,
|
||||
&cutsq_sigma, &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -29,19 +29,19 @@ texture<int2> q_tex;
|
|||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_born_long(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_born_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp4 *restrict cutsq_sigma,
|
||||
const __global numtyp4 *restrict cutsq_sigma,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp g_ewald, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
|
@ -64,14 +64,14 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
@ -114,129 +114,129 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
|
|||
numtyp r = ucl_sqrt(rsq);
|
||||
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
|
||||
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
|
||||
} else forceborn = (numtyp)0.0;
|
||||
|
||||
force = (forceborn + forcecoul) * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
if (rsq < cut_coulsq)
|
||||
e_coul += prefactor*(_erfc-factor_coul);
|
||||
if (rsq < coeff1[mtype].w) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
|
||||
+ coeff2[mtype].z*r2inv*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].w);
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
|
||||
vflag,ans,engv);
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1_in,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp4 *restrict cutsq_sigma,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp g_ewald, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp sp_lj[8];
|
||||
if (tid<8)
|
||||
sp_lj[tid]=sp_lj_in[tid];
|
||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
coeff1[tid]=coeff1_in[tid];
|
||||
if (eflag>0)
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq_sigma[mtype].x) {
|
||||
numtyp r2inv=ucl_recip(rsq);
|
||||
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
|
||||
numtyp rexp = (numtyp)0.0;
|
||||
|
||||
if (rsq < cut_coulsq) {
|
||||
numtyp r=ucl_rsqrt(r2inv);
|
||||
numtyp grij = g_ewald * r;
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else forcecoul = (numtyp)0.0;
|
||||
|
||||
if (rsq < cutsq_sigma[mtype].y) {
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
|
||||
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
|
||||
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
|
||||
} else forceborn = (numtyp)0.0;
|
||||
|
||||
force = (forceborn + forcecoul) * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
if (rsq < cut_coulsq)
|
||||
e_coul += prefactor*(_erfc-factor_coul);
|
||||
if (rsq < coeff1[mtype].w) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
|
||||
+ coeff2[mtype].z*r2inv*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].w);
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
|
||||
vflag,ans,engv);
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1_in,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp4 *restrict cutsq_sigma,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp g_ewald, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp sp_lj[8];
|
||||
if (tid<8)
|
||||
sp_lj[tid]=sp_lj_in[tid];
|
||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
coeff1[tid]=coeff1_in[tid];
|
||||
if (eflag>0)
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq_sigma[mtype].x) {
|
||||
numtyp r2inv=ucl_recip(rsq);
|
||||
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
|
||||
numtyp rexp = (numtyp)0.0;
|
||||
|
||||
if (rsq < cut_coulsq) {
|
||||
numtyp r=ucl_rsqrt(r2inv);
|
||||
numtyp grij = g_ewald * r;
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else forcecoul = (numtyp)0.0;
|
||||
|
||||
if (rsq < cutsq_sigma[mtype].y) {
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
|
||||
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
|
||||
} else forceborn = (numtyp)0.0;
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -30,19 +30,19 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
|
|||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
double **host_sigma, double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||
const double host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e, const double g_ewald);
|
||||
|
@ -59,12 +59,12 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
|
|||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
|
||||
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
|
||||
/// coeff1.w = born3
|
||||
UCL_D_Vec<numtyp4> coeff1;
|
||||
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
|
||||
UCL_D_Vec<numtyp4> coeff2;
|
||||
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
|
||||
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
|
||||
/// cutsq_sigma.z = sigma
|
||||
UCL_D_Vec<numtyp4> cutsq_sigma;
|
||||
/// Special LJ values [0-3] and Special Coul values [4-7]
|
||||
|
@ -73,7 +73,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -30,9 +30,9 @@ static BornCoulLong<PRECISION,ACC_PRECISION> BORNCLMF;
|
|||
int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
double **sigma, double **offset, double *special_lj,
|
||||
const int inum, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size, int &gpu_mode,
|
||||
double **sigma, double **offset, double *special_lj,
|
||||
const int inum, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size, int &gpu_mode,
|
||||
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
|
||||
double *host_special_coul, const double qqrd2e,
|
||||
const double g_ewald) {
|
||||
|
@ -58,10 +58,10 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma, offset,
|
||||
special_lj, inum, nall, 300, maxspecial, cell_size,
|
||||
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
|
||||
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma, offset,
|
||||
special_lj, inum, nall, 300, maxspecial, cell_size,
|
||||
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
|
||||
host_special_coul, qqrd2e, g_ewald);
|
||||
|
||||
BORNCLMF.device->world_barrier();
|
||||
|
@ -78,14 +78,14 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma, offset,
|
||||
special_lj, inum, nall, 300, maxspecial, cell_size,
|
||||
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
|
||||
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma, offset,
|
||||
special_lj, inum, nall, 300, maxspecial, cell_size,
|
||||
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
|
||||
host_special_coul, qqrd2e, g_ewald);
|
||||
|
||||
BORNCLMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -102,7 +102,7 @@ void borncl_gpu_clear() {
|
|||
|
||||
int** borncl_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
|
@ -112,8 +112,8 @@ int** borncl_gpu_compute_n(const int ago, const int inum_full,
|
|||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q, boxlo, prd);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void borncl_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -37,17 +37,17 @@ template <class numtyp, class acctyp>
|
|||
BornCoulWolfT::~BornCoulWolfT() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BornCoulWolfT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
double **host_sigma, double **host_offset,
|
||||
int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
double **host_sigma, double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
|
@ -84,12 +84,12 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
|
|||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_d,host_offset);
|
||||
|
||||
host_d,host_offset);
|
||||
|
||||
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
|
||||
host_cut_ljsq,host_sigma);
|
||||
|
||||
|
||||
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<4; i++) {
|
||||
host_write[i]=host_special_lj[i];
|
||||
|
@ -144,7 +144,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -157,17 +157,17 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
|
|||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
|
||||
&_alf, &_e_shift, &_f_shift,
|
||||
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
|
||||
&_alf, &_e_shift, &_f_shift,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->q,
|
||||
&cutsq_sigma, &_cut_coulsq,
|
||||
&_qqrd2e, &_alf, &_e_shift, &_f_shift,
|
||||
&_qqrd2e, &_alf, &_e_shift, &_f_shift,
|
||||
&this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -31,21 +31,21 @@ texture<int2> q_tex;
|
|||
|
||||
#define MY_PIS (acctyp)1.77245385090551602729
|
||||
|
||||
__kernel void k_born_wolf(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_born_wolf(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp4 *restrict cutsq_sigma,
|
||||
const __global numtyp4 *restrict cutsq_sigma,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp alf, const numtyp e_shift,
|
||||
const numtyp alf, const numtyp e_shift,
|
||||
const numtyp f_shift, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
@ -67,20 +67,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
if (eflag>0) {
|
||||
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
|
||||
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
|
||||
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
|
||||
e_coul += (acctyp)2.0*e_self;
|
||||
}
|
||||
|
@ -108,12 +108,12 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
|
|||
numtyp forcecoul, forceborn, force, r6inv, prefactor;
|
||||
numtyp v_sh = (numtyp)0.0;
|
||||
numtyp rexp = (numtyp)0.0;
|
||||
|
||||
|
||||
if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
|
||||
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
|
||||
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
|
||||
} else forceborn = (numtyp)0.0;
|
||||
|
||||
|
@ -147,7 +147,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
|
|||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
|
||||
+ coeff2[mtype].z*r2inv*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].w);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
@ -165,20 +165,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
|
|||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1_in,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp4 *restrict cutsq_sigma,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp alf, const numtyp e_shift,
|
||||
const numtyp alf, const numtyp e_shift,
|
||||
const numtyp f_shift, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
@ -193,7 +193,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
|
|||
if (eflag>0)
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
|
@ -201,23 +201,23 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
if (eflag>0) {
|
||||
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
|
||||
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
|
||||
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
|
||||
e_coul += (acctyp)2.0*e_self;
|
||||
}
|
||||
|
@ -244,12 +244,12 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
|
|||
numtyp forcecoul, forceborn, force, r6inv, prefactor;
|
||||
numtyp v_sh = (numtyp)0.0;
|
||||
numtyp rexp = (numtyp)0.0;
|
||||
|
||||
|
||||
if (rsq < cutsq_sigma[mtype].y) {
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
|
||||
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
|
||||
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
|
||||
} else forceborn = (numtyp)0.0;
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -30,19 +30,19 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
|
|||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
double **host_sigma, double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||
const double host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e, const double alf, const double e_shift,
|
||||
|
@ -60,12 +60,12 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
|
|||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
|
||||
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
|
||||
/// coeff1.w = born3
|
||||
UCL_D_Vec<numtyp4> coeff1;
|
||||
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
|
||||
UCL_D_Vec<numtyp4> coeff2;
|
||||
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
|
||||
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
|
||||
/// cutsq_sigma.z = sigma
|
||||
UCL_D_Vec<numtyp4> cutsq_sigma;
|
||||
/// Special LJ values [0-3] and Special Coul values [4-7]
|
||||
|
@ -74,7 +74,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -28,7 +28,7 @@ static BornCoulWolf<PRECISION,ACC_PRECISION> BORNCWMF;
|
|||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_born1, double **host_born2, double **host_born3,
|
||||
double **host_a, double **host_c, double **host_d,
|
||||
double **sigma, double **offset, double *special_lj, const int inum,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
|
@ -60,9 +60,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
if (world_me==0)
|
||||
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma,
|
||||
offset, special_lj, inum, nall, 300,
|
||||
offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
|
||||
host_cut_coulsq, host_special_coul, qqrd2e,
|
||||
host_cut_coulsq, host_special_coul, qqrd2e,
|
||||
alf, e_shift, f_shift);
|
||||
|
||||
BORNCWMF.device->world_barrier();
|
||||
|
@ -79,15 +79,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma,
|
||||
offset, special_lj, inum, nall, 300,
|
||||
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma,
|
||||
offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
|
||||
host_cut_coulsq, host_special_coul, qqrd2e,
|
||||
host_cut_coulsq, host_special_coul, qqrd2e,
|
||||
alf, e_shift, f_shift);
|
||||
|
||||
BORNCWMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -104,7 +104,7 @@ void borncw_gpu_clear() {
|
|||
|
||||
int** borncw_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
|
@ -114,8 +114,8 @@ int** borncw_gpu_compute_n(const int ago, const int inum_full,
|
|||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q, boxlo, prd);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void borncw_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -28,9 +28,9 @@ static Born<PRECISION,ACC_PRECISION> BORNMF;
|
|||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
double **host_born1, double **host_born2,
|
||||
double **host_born3, double **host_a, double **host_c,
|
||||
double **host_d, double **sigma,
|
||||
double **host_born1, double **host_born2,
|
||||
double **host_born3, double **host_a, double **host_c,
|
||||
double **host_d, double **sigma,
|
||||
double **offset, double *special_lj, const int inum,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
const double cell_size, int &gpu_mode, FILE *screen) {
|
||||
|
@ -56,7 +56,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma,
|
||||
offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen);
|
||||
|
@ -75,13 +75,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma,
|
||||
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, sigma,
|
||||
offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen);
|
||||
|
||||
BORNMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -102,24 +102,24 @@ void born_gpu_reinit(const int ntypes, double **host_rhoinv,
|
|||
int world_me=BORNMF.device->world_me();
|
||||
int gpu_rank=BORNMF.device->gpu_rank();
|
||||
int procs_per_gpu=BORNMF.device->procs_per_gpu();
|
||||
|
||||
|
||||
if (world_me==0)
|
||||
BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, offset);
|
||||
|
||||
|
||||
BORNMF.device->world_barrier();
|
||||
|
||||
|
||||
for (int i=0; i<procs_per_gpu; i++) {
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
|
||||
host_born3, host_a, host_c, host_d, offset);
|
||||
|
||||
|
||||
BORNMF.device->gpu_barrier();
|
||||
}
|
||||
}
|
||||
|
||||
void born_gpu_clear() {
|
||||
BORNMF.clear();
|
||||
BORNMF.clear();
|
||||
}
|
||||
|
||||
int ** born_gpu_compute_n(const int ago, const int inum_full,
|
||||
|
@ -132,8 +132,8 @@ int ** born_gpu_compute_n(const int ago, const int inum_full,
|
|||
return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void born_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -33,10 +33,10 @@ BuckT::Buck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BuckT::~Buck() {
|
||||
BuckT::~Buck() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BuckT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
|
@ -44,11 +44,11 @@ int BuckT::bytes_per_atom(const int max_nbors) const {
|
|||
|
||||
template <class numtyp, class acctyp>
|
||||
int BuckT::init(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
|
@ -79,7 +79,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
|
|||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
@ -95,14 +95,14 @@ template <class numtyp, class acctyp>
|
|||
void BuckT::reinit(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c, double **host_offset) {
|
||||
|
||||
|
||||
// Allocate a host write buffer for data initialization
|
||||
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
|
||||
|
||||
for (int i=0; i<_lj_types*_lj_types; i++)
|
||||
host_write[i]=0.0;
|
||||
|
||||
|
||||
this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
|
||||
host_buck1,host_buck2,host_cutsq);
|
||||
this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
|
||||
|
@ -143,7 +143,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -154,13 +154,13 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
|
|||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -24,15 +24,15 @@ texture<int4,1> pos_tex;
|
|||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void k_buck(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_buck(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const int lj_types,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
|
@ -50,20 +50,20 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -76,24 +76,24 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (r2inv<coeff1[mtype].w) {
|
||||
numtyp r=ucl_sqrt(r2inv);
|
||||
numtyp rexp = ucl_exp(-r*coeff1[mtype].x);
|
||||
r2inv=ucl_recip(r2inv);
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
|
||||
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
|
||||
- coeff1[mtype].z*r6inv);
|
||||
force*=factor_lj;
|
||||
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].z);
|
||||
energy+=factor_lj*(e-coeff2[mtype].z);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
@ -111,19 +111,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
|
|||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1_in,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
|
||||
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp sp_lj[4];
|
||||
|
@ -134,7 +134,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
|||
if (eflag>0)
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
|
@ -143,7 +143,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
|||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
|
@ -157,7 +157,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
|||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -170,13 +170,13 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
if (r2inv<coeff1[mtype].w) {
|
||||
numtyp r=ucl_sqrt(r2inv);
|
||||
numtyp rexp = ucl_exp(-r*coeff1[mtype].x);
|
||||
r2inv=ucl_recip(r2inv);
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
|
||||
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
|
||||
- coeff1[mtype].z*r6inv);
|
||||
force*=factor_lj;
|
||||
|
||||
|
@ -186,7 +186,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
|
|||
|
||||
if (eflag>0) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].z);
|
||||
energy+=factor_lj*(e-coeff2[mtype].z);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
|
|||
class Buck : public BaseAtomic<numtyp, acctyp> {
|
||||
public:
|
||||
Buck();
|
||||
~Buck();
|
||||
~Buck();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -38,18 +38,18 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
|
|||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen);
|
||||
|
||||
/// Send updated coeffs from host to device (to be compatible with fix adapt)
|
||||
void reinit(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c, double **host_offset);
|
||||
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
@ -72,7 +72,7 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
private:
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -33,10 +33,10 @@ BuckCoulT::BuckCoul() : BaseCharge<numtyp,acctyp>(), _allocated(false) {
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BuckCoulT::~BuckCoul() {
|
||||
BuckCoulT::~BuckCoul() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BuckCoulT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
|
@ -44,11 +44,11 @@ int BuckCoulT::bytes_per_atom(const int max_nbors) const {
|
|||
|
||||
template <class numtyp, class acctyp>
|
||||
int BuckCoulT::init(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *_screen, double **host_cut_ljsq,
|
||||
double **host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e) {
|
||||
|
@ -81,21 +81,21 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
|
|||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_offset);
|
||||
|
||||
host_offset);
|
||||
|
||||
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq,
|
||||
host_cut_ljsq, host_cut_coulsq);
|
||||
|
||||
|
||||
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<4; i++) {
|
||||
host_write[i]=host_special_lj[i];
|
||||
host_write[i+4]=host_special_coul[i];
|
||||
}
|
||||
ucl_copy(sp_lj,host_write,8,false);
|
||||
|
||||
|
||||
_qqrd2e = qqrd2e;
|
||||
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes();
|
||||
return 0;
|
||||
|
@ -135,7 +135,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -147,12 +147,12 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
|
|||
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_qqrd2e, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_qqrd2e, &this->_threads_per_atom);
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -29,19 +29,19 @@ texture<int2> q_tex;
|
|||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_buck_coul(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_buck_coul(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_ ,
|
||||
const __global numtyp4 *restrict cutsq,
|
||||
const __global numtyp4 *restrict cutsq,
|
||||
const numtyp qqrd2e, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
@ -63,21 +63,21 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
factor_coul = sp_lj[sbmask(j)+4];
|
||||
|
@ -91,30 +91,30 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (rsq<cutsq[mtype].x) {
|
||||
numtyp r2inv=ucl_recip(rsq);
|
||||
numtyp forcecoul, forcebuck, force, r6inv;
|
||||
numtyp rexp = (numtyp)0.0;
|
||||
|
||||
|
||||
if (rsq < cutsq[mtype].y) { // buckingham
|
||||
numtyp r=ucl_sqrt(rsq);
|
||||
rexp = ucl_exp(-r*coeff1[mtype].x);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
forcebuck = (coeff1[mtype].y*r*rexp
|
||||
forcebuck = (coeff1[mtype].y*r*rexp
|
||||
- coeff1[mtype].z*r6inv)*factor_lj;
|
||||
} else
|
||||
forcebuck = (numtyp)0.0;
|
||||
|
||||
|
||||
if (rsq < coeff2[mtype].z) {
|
||||
fetch(forcecoul,j,q_tex);
|
||||
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
|
||||
|
||||
force = (forcebuck + forcecoul) * r2inv;
|
||||
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
@ -142,22 +142,22 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
|
|||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1_in,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp4 *restrict _cutsq,
|
||||
const __global numtyp4 *restrict _cutsq,
|
||||
const numtyp qqrd2e, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
|
||||
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
|
@ -170,7 +170,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
|||
if (eflag>0)
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
|
@ -180,7 +180,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
|||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
|
@ -195,7 +195,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
|||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
factor_coul = sp_lj[sbmask(j)+4];
|
||||
|
@ -209,27 +209,27 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
if (rsq<cutsq[mtype].x) {
|
||||
numtyp r2inv=ucl_recip(rsq);
|
||||
numtyp forcecoul, forcebuck, force, r6inv;
|
||||
numtyp rexp = (numtyp)0.0;
|
||||
|
||||
|
||||
if (rsq < cutsq[mtype].y) { // buckingham
|
||||
numtyp r=ucl_sqrt(rsq);
|
||||
rexp = ucl_exp(-r*coeff1[mtype].x);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
forcebuck = (coeff1[mtype].y*r*rexp
|
||||
forcebuck = (coeff1[mtype].y*r*rexp
|
||||
- coeff1[mtype].z*r6inv)*factor_lj;
|
||||
} else
|
||||
forcebuck = (numtyp)0.0;
|
||||
|
||||
|
||||
if (rsq < cutsq[mtype].z) {
|
||||
fetch(forcecoul,j,q_tex);
|
||||
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
|
||||
|
||||
force = (forcebuck + forcecoul) * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
|
@ -241,7 +241,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
|||
if (rsq < cutsq[mtype].y) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].z);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
|
|||
class BuckCoul : public BaseCharge<numtyp, acctyp> {
|
||||
public:
|
||||
BuckCoul();
|
||||
~BuckCoul();
|
||||
~BuckCoul();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -38,11 +38,11 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
|
|||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||
double **host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e);
|
||||
|
@ -71,11 +71,11 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
|
||||
numtyp _qqrd2e;
|
||||
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
void loop(const bool _eflag, const bool _vflag);
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -28,8 +28,8 @@ static BuckCoul<PRECISION,ACC_PRECISION> BUCKCMF;
|
|||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **offset, double *special_lj, const int inum,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
const double cell_size, int &gpu_mode, FILE *screen,
|
||||
|
@ -57,9 +57,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen,
|
||||
maxspecial, cell_size, gpu_split, screen,
|
||||
host_cut_ljsq, host_cut_coulsq,
|
||||
host_special_coul, qqrd2e);
|
||||
|
||||
|
@ -77,14 +77,14 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen,
|
||||
maxspecial, cell_size, gpu_split, screen,
|
||||
host_cut_ljsq, host_cut_coulsq,
|
||||
host_special_coul, qqrd2e);
|
||||
|
||||
BUCKCMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -96,12 +96,12 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
}
|
||||
|
||||
// Library entry point: calls clear() once on the file-level BUCKCMF instance.
void buckc_gpu_clear() {
  BUCKCMF.clear();
}
|
||||
|
||||
int ** buckc_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
|
@ -111,8 +111,8 @@ int ** buckc_gpu_compute_n(const int ago, const int inum_full,
|
|||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q, boxlo, prd);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void buckc_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
|
|||
BuckCoulLongT::~BuckCoulLongT() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BuckCoulLongT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
|
@ -45,8 +45,8 @@ int BuckCoulLongT::bytes_per_atom(const int max_nbors) const {
|
|||
|
||||
template <class numtyp, class acctyp>
|
||||
int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c, double **host_offset,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c, double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
|
@ -83,11 +83,11 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
|
|||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_offset);
|
||||
|
||||
host_offset);
|
||||
|
||||
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
|
||||
|
||||
|
||||
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<4; i++) {
|
||||
host_write[i]=host_special_lj[i];
|
||||
|
@ -139,7 +139,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -150,16 +150,16 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
|||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_cut_coulsq, &_qqrd2e,
|
||||
&cutsq, &_cut_coulsq, &_qqrd2e,
|
||||
&_g_ewald, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
|
||||
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -29,19 +29,19 @@ texture<int2> q_tex;
|
|||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global numtyp4 *restrict coeff2,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp g_ewald, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
|
@ -64,14 +64,14 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
@ -98,136 +98,136 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
|
|||
numtyp r2inv=ucl_recip(rsq);
|
||||
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
|
||||
numtyp rexp = (numtyp)0.0;
|
||||
|
||||
|
||||
if (rsq < coeff1[mtype].w) { // cut_ljsq
|
||||
numtyp r=ucl_sqrt(rsq);
|
||||
rexp = ucl_exp(-r*coeff1[mtype].x);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
force_lj = (coeff1[mtype].y*r*rexp
|
||||
- coeff1[mtype].z*r6inv)*factor_lj;
|
||||
} else
|
||||
force_lj = (numtyp)0.0;
|
||||
|
||||
if (rsq < cut_coulsq) {
|
||||
numtyp r = ucl_rsqrt(r2inv);
|
||||
numtyp grij = g_ewald * r;
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
|
||||
force = (force_lj + forcecoul) * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
if (rsq < cut_coulsq)
|
||||
e_coul += prefactor*(_erfc-factor_coul);
|
||||
if (rsq < coeff1[mtype].w) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].z);
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
|
||||
vflag,ans,engv);
|
||||
} // if ii
|
||||
}
|
||||
|
||||
// Buckingham + long-range (Ewald real-space) Coulomb pair kernel, "fast"
// variant: the per-type-pair coefficient tables and the special-bond scaling
// factors are first copied into __local arrays, then each work-item
// accumulates force, energies and virial over its share of the neighbor list
// of atom i and hands the partial sums to store_answers_q().
//
//   x_          positions (read through fetch4/pos_tex); .w holds the type
//   coeff1_in   per type pair: .x multiplies r inside the exponential,
//               .y / .z weight the exponential and r^-6 force terms,
//               .w is the squared cutoff of the Buckingham term
//   coeff2_in   per type pair: .x / .y weight the exponential and r^-6 energy
//               terms, .z is subtracted from the pair energy; only copied
//               (and only read) when eflag>0
//   sp_lj_in    8 scaling factors indexed by sbmask(j): [0..3] scale the
//               Buckingham term, [4..7] the Coulomb term
//   dev_nbor, dev_packed, nbor_pitch
//               neighbor-list data consumed by nbor_info(); dev_packed[nbor]
//               is neighbor j with extra bits that sbmask() reads and
//               NEIGHMASK strips
//   ans, engv   output buffers, written only by store_answers_q()
//   eflag,vflag >0 turns on energy / virial accumulation
//   inum        work-items with ii>=inum do no pair work
//   q_          per-atom charges (read through fetch/q_tex)
//   cutsq       per type pair squared overall cutoff
//   cut_coulsq  squared Coulomb cutoff
//   qqrd2e      factor applied to the charge product
//   g_ewald     multiplies r to form the erfc argument
//   t_per_atom  forwarded to atom_info()/nbor_info()
//
// NOTE(review): the table copy uses tid as the index, so it presumably relies
// on the work-group having at least MAX_SHARED_TYPES*MAX_SHARED_TYPES
// work-items -- confirm against the host-side launch configuration.
__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
                                    const __global numtyp4 *restrict coeff1_in,
                                    const __global numtyp4 *restrict coeff2_in,
                                    const __global numtyp *restrict sp_lj_in,
                                    const __global int *dev_nbor,
                                    const __global int *dev_packed,
                                    __global acctyp4 *restrict ans,
                                    __global acctyp *restrict engv,
                                    const int eflag, const int vflag,
                                    const int inum, const int nbor_pitch,
                                    const __global numtyp *restrict q_,
                                    const __global numtyp *restrict cutsq,
                                    const numtyp cut_coulsq,
                                    const numtyp qqrd2e, const numtyp g_ewald,
                                    const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Stage the coefficient tables and special-bond factors in __local memory.
  __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    coeff1[tid]=coeff1_in[tid];
    if (eflag>0)
      coeff2[tid]=coeff2_in[tid];
  }

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // The __local tables must be fully written before any work-item reads them.
  __syncthreads();

  if (ii<inum) {
    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    // Row offset of type i in the MAX_SHARED_TYPES-wide coefficient tables.
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
      // Stored as (1 - special_coul) so it can be subtracted from erfc below.
      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<cutsq[mtype]) {
        numtyp r2inv=ucl_recip(rsq);
        // r6inv, prefactor and _erfc are only assigned inside the cutoff
        // branches below and only read under the same cutoff conditions.
        numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
        numtyp rexp = (numtyp)0.0;

        if (rsq < coeff1[mtype].w) {
          numtyp r=ucl_sqrt(rsq);
          rexp = ucl_exp(-r*coeff1[mtype].x);
          r6inv = r2inv*r2inv*r2inv;
          force_lj = (coeff1[mtype].y*r*rexp
                      - coeff1[mtype].z*r6inv)*factor_lj;
        } else
          force_lj = (numtyp)0.0;

        if (rsq < cut_coulsq) {
          // r2inv is 1/rsq, so its reciprocal square root is r itself.
          numtyp r = ucl_rsqrt(r2inv);
          numtyp grij = g_ewald * r;
          numtyp expm2 = ucl_exp(-grij*grij);
          // Polynomial evaluation of erfc(grij) using EWALD_P and A1..A5.
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          fetch(prefactor,j,q_tex);
          prefactor *= qqrd2e * qtmp/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
        } else
          forcecoul = (numtyp)0.0;

        force = (force_lj + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          if (rsq < cut_coulsq)
            e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < coeff1[mtype].w) {
            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
            energy+=factor_lj*(e-coeff2[mtype].z);
          }
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||
|
||||
__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict coeff1_in,
|
||||
const __global numtyp4 *restrict coeff2_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const numtyp cut_coulsq,
|
||||
const numtyp qqrd2e, const numtyp g_ewald,
|
||||
const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp sp_lj[8];
|
||||
if (tid<8)
|
||||
sp_lj[tid]=sp_lj_in[tid];
|
||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
coeff1[tid]=coeff1_in[tid];
|
||||
if (eflag>0)
|
||||
coeff2[tid]=coeff2_in[tid];
|
||||
}
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int j=dev_packed[nbor];
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[mtype]) {
|
||||
numtyp r2inv=ucl_recip(rsq);
|
||||
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
|
||||
numtyp rexp = (numtyp)0.0;
|
||||
|
||||
if (rsq < coeff1[mtype].w) {
|
||||
numtyp r=ucl_sqrt(rsq);
|
||||
rexp = ucl_exp(-r*coeff1[mtype].x);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
force_lj = (coeff1[mtype].y*r*rexp
|
||||
- coeff1[mtype].z*r6inv)*factor_lj;
|
||||
} else
|
||||
force_lj = (numtyp)0.0;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -30,7 +30,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
|
|||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -38,11 +38,11 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
|
|||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_cutsq,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_rhoinv, double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||
const double host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e, const double g_ewald);
|
||||
|
@ -71,7 +71,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -28,7 +28,7 @@ static BuckCoulLong<PRECISION,ACC_PRECISION> BUCKCLMF;
|
|||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
double **host_buck1, double **host_buck2,
|
||||
double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **offset, double *special_lj, const int inum,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
|
@ -58,8 +58,8 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
|
||||
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
|
||||
|
||||
|
@ -77,13 +77,13 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
|
||||
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
|
||||
|
||||
BUCKCLMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -100,7 +100,7 @@ void buckcl_gpu_clear() {
|
|||
|
||||
int** buckcl_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
|
@ -110,8 +110,8 @@ int** buckcl_gpu_compute_n(const int ago, const int inum_full,
|
|||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q, boxlo, prd);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void buckcl_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -28,8 +28,8 @@ static Buck<PRECISION,ACC_PRECISION> BUCKMF;
|
|||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **host_buck1, double **host_buck2,
|
||||
double **host_a, double **host_c,
|
||||
double **offset, double *special_lj, const int inum,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
const double cell_size, int &gpu_mode, FILE *screen) {
|
||||
|
@ -55,7 +55,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen);
|
||||
|
||||
|
@ -73,12 +73,12 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen);
|
||||
|
||||
BUCKMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -98,24 +98,24 @@ void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv,
|
|||
int world_me=BUCKMF.device->world_me();
|
||||
int gpu_rank=BUCKMF.device->gpu_rank();
|
||||
int procs_per_gpu=BUCKMF.device->procs_per_gpu();
|
||||
|
||||
|
||||
if (world_me==0)
|
||||
BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset);
|
||||
|
||||
|
||||
BUCKMF.device->world_barrier();
|
||||
|
||||
for (int i=0; i<procs_per_gpu; i++) {
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset);
|
||||
|
||||
|
||||
BUCKMF.device->gpu_barrier();
|
||||
}
|
||||
}
|
||||
|
||||
void buck_gpu_clear() {
|
||||
BUCKMF.clear();
|
||||
BUCKMF.clear();
|
||||
}
|
||||
|
||||
int ** buck_gpu_compute_n(const int ago, const int inum_full,
|
||||
|
@ -128,8 +128,8 @@ int ** buck_gpu_compute_n(const int ago, const int inum_full,
|
|||
return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void buck_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -33,23 +33,23 @@ CGCMMT::CGCMM() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
CGCMMT::~CGCMM() {
|
||||
CGCMMT::~CGCMM() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CGCMMT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CGCMMT::init(const int ntypes, double **host_cutsq,
|
||||
int **host_cg_type, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
int CGCMMT::init(const int ntypes, double **host_cutsq,
|
||||
int **host_cg_type, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
|
@ -75,12 +75,12 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
|
|||
host_write[i]=0.0;
|
||||
|
||||
lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
|
||||
this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
|
||||
host_cg_type,host_lj1,host_lj2);
|
||||
|
||||
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
@ -126,7 +126,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -138,7 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
|
|||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -24,15 +24,15 @@ texture<int4,1> pos_tex;
|
|||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict lj1,
|
||||
const __global numtyp4 *restrict lj3,
|
||||
const int lj_types,
|
||||
const __global numtyp4 *restrict lj3,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
|
@ -50,20 +50,20 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -76,12 +76,12 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (r2inv<lj1[mtype].x) {
|
||||
r2inv=ucl_recip(r2inv);
|
||||
numtyp inv1,inv2;
|
||||
|
||||
|
||||
if (lj1[mtype].y == 2) {
|
||||
inv1=r2inv*r2inv;
|
||||
inv2=inv1*inv1;
|
||||
|
@ -93,7 +93,7 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
|
|||
inv2=inv1;
|
||||
}
|
||||
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
|
||||
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
@ -116,9 +116,9 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
|
|||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict lj1_in,
|
||||
const __global numtyp4 *restrict lj3_in,
|
||||
const __global numtyp4 *restrict lj3_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
|
@ -139,30 +139,30 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
|
|||
if (eflag>0)
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -175,11 +175,11 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
if (r2inv<lj1[mtype].x) {
|
||||
r2inv=ucl_recip(r2inv);
|
||||
numtyp inv1,inv2;
|
||||
|
||||
|
||||
if (lj1[mtype].y == (numtyp)2) {
|
||||
inv1=r2inv*r2inv;
|
||||
inv2=inv1*inv1;
|
||||
|
@ -191,7 +191,7 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
|
|||
inv2=inv1;
|
||||
}
|
||||
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
|
||||
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
|
|||
class CGCMM : public BaseAtomic<numtyp, acctyp> {
|
||||
public:
|
||||
CGCMM();
|
||||
~CGCMM();
|
||||
~CGCMM();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -40,7 +40,7 @@ class CGCMM : public BaseAtomic<numtyp, acctyp> {
|
|||
int init(const int ntypes, double **host_cutsq, int **host_cg_type,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen);
|
||||
|
||||
|
@ -66,7 +66,7 @@ class CGCMM : public BaseAtomic<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _cmm_types;
|
||||
|
||||
private:
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -28,9 +28,9 @@ static CGCMM<PRECISION,ACC_PRECISION> CMMMF;
|
|||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **offset, double *special_lj,
|
||||
const int inum, const int nall, const int max_nbors,
|
||||
const int inum, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size, int &gpu_mode,
|
||||
FILE *screen) {
|
||||
CMMMF.clear();
|
||||
|
@ -55,7 +55,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
|
|||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
|
||||
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
|
||||
host_lj4, offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen);
|
||||
|
||||
|
@ -78,7 +78,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
|
|||
maxspecial, cell_size, gpu_split, screen);
|
||||
|
||||
CMMMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -103,8 +103,8 @@ int** cmm_gpu_compute_n(const int ago, const int inum_full,
|
|||
return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -37,22 +37,22 @@ template <class numtyp, class acctyp>
|
|||
// Destructor: performs no work of its own beyond invoking clear().
CGCMMLongT::~CGCMMLong() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CGCMMLongT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CGCMMLongT::init(const int ntypes, double **host_cutsq,
|
||||
int **host_cg_type, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
int CGCMMLongT::init(const int ntypes, double **host_cutsq,
|
||||
int **host_cg_type, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *_screen,
|
||||
double **host_cut_ljsq,
|
||||
double **host_cut_ljsq,
|
||||
const double host_cut_coulsq,
|
||||
double *host_special_coul, const double qqrd2e,
|
||||
const double g_ewald) {
|
||||
|
@ -137,7 +137,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -149,13 +149,13 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
|
|||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -29,12 +29,12 @@ texture<int2> q_tex;
|
|||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict lj1,
|
||||
const __global numtyp4 *restrict lj3,
|
||||
const int lj_types,
|
||||
const __global numtyp4 *restrict lj3,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
|
@ -70,7 +70,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
|
|||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
@ -136,7 +136,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
|
|||
if (rsq < lj1[mtype].y) {
|
||||
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
|
||||
lj3[mtype].w;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
@ -154,17 +154,17 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
|
|||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict lj1_in,
|
||||
const __global numtyp4 *restrict lj3_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global numtyp4 *restrict lj3_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp *restrict q_,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp g_ewald, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
|
@ -179,7 +179,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
|
|||
lj1[tid]=lj1_in[tid];
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
|
@ -187,16 +187,16 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
|
@ -262,7 +262,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
|
|||
if (rsq < lj1[mtype].y) {
|
||||
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
|
||||
lj3[mtype].w;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -30,7 +30,7 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
|
|||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -40,8 +40,8 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
|
|||
int init(const int ntypes, double **host_cutsq, int ** cg_type,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||
const double host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e, const double g_ewald);
|
||||
|
@ -58,7 +58,7 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
|
|||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2,
|
||||
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2,
|
||||
UCL_D_Vec<numtyp4> lj1;
|
||||
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
|
||||
UCL_D_Vec<numtyp4> lj3;
|
||||
|
@ -68,7 +68,7 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -28,9 +28,9 @@ static CGCMMLong<PRECISION,ACC_PRECISION> CMMLMF;
|
|||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **offset, double *special_lj,
|
||||
const int inum, const int nall, const int max_nbors,
|
||||
const int inum, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size, int &gpu_mode,
|
||||
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
|
||||
double *host_special_coul, const double qqrd2e,
|
||||
|
@ -58,7 +58,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
|
|||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
|
||||
host_lj4, offset, special_lj, inum, nall, 300,
|
||||
host_lj4, offset, special_lj, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
|
||||
host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
|
||||
|
||||
|
@ -82,7 +82,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
|
|||
host_cut_ljsq, host_cut_coulsq, host_special_coul,
|
||||
qqrd2e, g_ewald);
|
||||
CMMLMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -99,7 +99,7 @@ void cmml_gpu_clear() {
|
|||
|
||||
int** cmml_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
|
@ -109,8 +109,8 @@ int** cmml_gpu_compute_n(const int ago, const int inum_full,
|
|||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q,boxlo,prd);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
|
|||
CHARMMLongT::~CHARMMLong() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
|
@ -45,9 +45,9 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
|
|||
|
||||
template <class numtyp, class acctyp>
|
||||
int CHARMMLongT::init(const int ntypes,
|
||||
double host_cut_bothsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double host_cut_bothsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
|
@ -144,7 +144,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -153,17 +153,17 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
|
|||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj,
|
||||
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
|
||||
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
|
||||
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -31,14 +31,14 @@ texture<int2> q_tex;
|
|||
|
||||
__kernel void k_charmm_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict lj1,
|
||||
const int lj_types,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp g_ewald, const numtyp denom_lj,
|
||||
|
@ -61,7 +61,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
|
|||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
@ -93,7 +93,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
|
|||
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
|
||||
if (rsq > cut_lj_innersq) {
|
||||
switch1 = (cut_ljsq-rsq);
|
||||
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
|
||||
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
|
||||
denom_lj;
|
||||
switch1 *= switch1;
|
||||
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
|
||||
|
@ -130,7 +130,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
|
|||
if (rsq > cut_lj_innersq)
|
||||
e *= switch1;
|
||||
energy+=factor_lj*e;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
@ -148,19 +148,19 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
|
|||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp2 *restrict ljd_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp g_ewald, const numtyp denom_lj,
|
||||
const numtyp cut_bothsq, const numtyp cut_ljsq,
|
||||
const numtyp cut_bothsq, const numtyp cut_ljsq,
|
||||
const numtyp cut_lj_innersq,
|
||||
const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
|
@ -174,7 +174,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
|
|||
ljd[tid]=ljd_in[tid];
|
||||
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
|
||||
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
|
@ -182,16 +182,16 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
@ -229,7 +229,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
|
|||
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
|
||||
if (rsq > cut_lj_innersq) {
|
||||
switch1 = (cut_ljsq-rsq);
|
||||
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
|
||||
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
|
||||
denom_lj;
|
||||
switch1 *= switch1;
|
||||
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -30,7 +30,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
|
|||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -40,12 +40,12 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
|
|||
int init(const int ntypes, double host_cut_bothsq,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen, double host_cut_ljsq,
|
||||
const double host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e, const double g_ewald,
|
||||
const double cut_lj_innersq, const double denom_lj,
|
||||
const double cut_lj_innersq, const double denom_lj,
|
||||
double **epsilon, double **sigma, const bool mix_arithmetic);
|
||||
|
||||
/// Clear all host and device data
|
||||
|
@ -70,7 +70,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _qqrd2e, _g_ewald, _denom_lj;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -87,7 +87,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
|
|||
sigma, mix_arithmetic);
|
||||
|
||||
CRMLMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -104,7 +104,7 @@ void crml_gpu_clear() {
|
|||
|
||||
int** crml_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
|
@ -114,14 +114,14 @@ int** crml_gpu_compute_n(const int ago, const int inum_full,
|
|||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q, boxlo, prd);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void crml_gpu_compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start, const double cpu_time,
|
||||
bool &success, double *host_q, const int nlocal,
|
||||
bool &success, double *host_q, const int nlocal,
|
||||
double *boxlo, double *prd) {
|
||||
CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
|
||||
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -33,23 +33,23 @@ ColloidT::Colloid() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
|
|||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
ColloidT::~Colloid() {
|
||||
ColloidT::~Colloid() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int ColloidT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int ColloidT::init(const int ntypes,
|
||||
double **host_cutsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, double **host_a12,
|
||||
double **host_a1, double **host_a2,
|
||||
double **host_d1, double **host_d2,
|
||||
int ColloidT::init(const int ntypes,
|
||||
double **host_cutsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, double **host_a12,
|
||||
double **host_a1, double **host_a2,
|
||||
double **host_d1, double **host_d2,
|
||||
double **host_sigma3, double **host_sigma6,
|
||||
int **host_form, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
|
@ -97,7 +97,7 @@ int ColloidT::init(const int ntypes,
|
|||
UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
|
||||
|
||||
|
||||
form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<ntypes; i++)
|
||||
for (int j=0; j<ntypes; j++) {
|
||||
|
@ -153,7 +153,7 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -170,9 +170,9 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
|
|||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||
&colloid1, &colloid2, &form,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&colloid1, &colloid2, &form,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : nguyentd@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -24,18 +24,18 @@ texture<int4,1> pos_tex;
|
|||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void k_colloid(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_colloid(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict lj1,
|
||||
const __global numtyp4 *restrict lj3,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global numtyp4 *restrict colloid1,
|
||||
const __global numtyp4 *restrict lj3,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global numtyp4 *restrict colloid1,
|
||||
const __global numtyp4 *restrict colloid2,
|
||||
const __global int *form,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *form,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
|
@ -53,20 +53,20 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -79,21 +79,21 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (rsq<lj1[mtype].z) {
|
||||
if (rsq<lj1[mtype].z) {
|
||||
numtyp r,r2inv,r6inv;
|
||||
numtyp c1,c2,fR,evdwl;
|
||||
numtyp K[9],h[4],g[4];
|
||||
numtyp force = (numtyp)0;
|
||||
|
||||
|
||||
if (form[mtype]==0) { // SMALL_SMALL
|
||||
r2inv=ucl_recip(rsq);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
|
||||
force*=factor_lj;
|
||||
} else if (form[mtype]==1) { // SMALL_LARGE
|
||||
c2 = colloid1[mtype].z;
|
||||
c2 = colloid1[mtype].z;
|
||||
K[1] = c2*c2;
|
||||
K[2] = rsq;
|
||||
K[0] = K[1] - rsq;
|
||||
|
@ -102,15 +102,15 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
|
|||
K[3] *= K[3]*K[3];
|
||||
K[6] = K[3]*K[3];
|
||||
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
|
||||
force = (numtyp)4.0/(numtyp)15.0*fR *
|
||||
((numtyp)2.0*(K[1]+K[2]) *
|
||||
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
|
||||
force = (numtyp)4.0/(numtyp)15.0*fR *
|
||||
((numtyp)2.0*(K[1]+K[2]) *
|
||||
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
|
||||
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
|
||||
force*=factor_lj;
|
||||
} else if (form[mtype]==2) { // LARGE_LARGE
|
||||
r = ucl_sqrt(rsq);
|
||||
c1 = colloid1[mtype].y;
|
||||
c2 = colloid1[mtype].z;
|
||||
c1 = colloid1[mtype].y;
|
||||
c2 = colloid1[mtype].z;
|
||||
K[0] = c1*c2;
|
||||
K[1] = c1+c2;
|
||||
K[2] = c1-c2;
|
||||
|
@ -132,16 +132,16 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
|
|||
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
|
||||
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
|
||||
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
|
||||
|
||||
|
||||
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
|
||||
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
|
||||
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
|
||||
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
|
||||
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
|
||||
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
|
||||
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
|
||||
force = factor_lj * (dUR+dUA)/r;
|
||||
}
|
||||
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
@ -151,14 +151,14 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
|
|||
if (form[mtype]==0) {
|
||||
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
|
||||
} else if (form[mtype]==1) {
|
||||
e=(numtyp)2.0/(numtyp)9.0*fR *
|
||||
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +
|
||||
e=(numtyp)2.0/(numtyp)9.0*fR *
|
||||
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +
|
||||
(numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]);
|
||||
} else if (form[mtype]==2) {
|
||||
e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
|
||||
e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
|
||||
((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
|
||||
}
|
||||
energy+=factor_lj*(e-lj3[mtype].z);
|
||||
}
|
||||
energy+=factor_lj*(e-lj3[mtype].z);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
@ -176,22 +176,22 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
|
|||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict lj1_in,
|
||||
const __global numtyp4 *restrict lj3_in,
|
||||
const __global numtyp4 *restrict lj3_in,
|
||||
const __global numtyp *restrict sp_lj_in,
|
||||
const __global numtyp4 *restrict colloid1_in,
|
||||
const __global numtyp4 *restrict colloid1_in,
|
||||
const __global numtyp4 *restrict colloid2_in,
|
||||
const __global int *form_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const __global int *form_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
|
||||
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
|
@ -208,7 +208,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
|||
if (eflag>0)
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
|
@ -217,7 +217,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
|||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
|
@ -231,7 +231,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
|||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
@ -244,20 +244,20 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
|||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
|
||||
if (rsq<lj1[mtype].z) {
|
||||
numtyp r,r2inv,r6inv;
|
||||
numtyp c1,c2,fR,evdwl;
|
||||
numtyp K[9],h[4],g[4];
|
||||
numtyp force = (numtyp)0;
|
||||
|
||||
|
||||
if (form[mtype]==0) { // SMALL_SMALL
|
||||
r2inv=ucl_recip(rsq);
|
||||
r6inv = r2inv*r2inv*r2inv;
|
||||
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
|
||||
force*=factor_lj;
|
||||
} else if (form[mtype]==1) { // SMALL_LARGE
|
||||
c2 = colloid1[mtype].z;
|
||||
c2 = colloid1[mtype].z;
|
||||
K[1] = c2*c2;
|
||||
K[2] = rsq;
|
||||
K[0] = K[1] - rsq;
|
||||
|
@ -266,15 +266,15 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
|||
K[3] *= K[3]*K[3];
|
||||
K[6] = K[3]*K[3];
|
||||
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
|
||||
force = (numtyp)4.0/(numtyp)15.0*fR *
|
||||
((numtyp)2.0*(K[1]+K[2]) *
|
||||
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
|
||||
force = (numtyp)4.0/(numtyp)15.0*fR *
|
||||
((numtyp)2.0*(K[1]+K[2]) *
|
||||
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
|
||||
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
|
||||
force*=factor_lj;
|
||||
} else if (form[mtype]==2) { // LARGE_LARGE
|
||||
r = ucl_sqrt(rsq);
|
||||
c1 = colloid1[mtype].y;
|
||||
c2 = colloid1[mtype].z;
|
||||
c1 = colloid1[mtype].y;
|
||||
c2 = colloid1[mtype].z;
|
||||
K[0] = c1*c2;
|
||||
K[1] = c1+c2;
|
||||
K[2] = c1-c2;
|
||||
|
@ -296,16 +296,16 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
|||
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
|
||||
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
|
||||
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
|
||||
|
||||
|
||||
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
|
||||
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
|
||||
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
|
||||
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
|
||||
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
|
||||
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
|
||||
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
|
||||
force = factor_lj * (dUR+dUA)/r;
|
||||
} else force = (numtyp)0.0;
|
||||
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
@ -315,15 +315,15 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
|
|||
if (form[mtype]==0) {
|
||||
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
|
||||
} else if (form[mtype]==1) {
|
||||
e=(numtyp)2.0/(numtyp)9.0*fR *
|
||||
e=(numtyp)2.0/(numtyp)9.0*fR *
|
||||
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+
|
||||
(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
|
||||
colloid2[mtype].w/K[6]);
|
||||
} else if (form[mtype]==2) {
|
||||
e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
|
||||
e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
|
||||
((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
|
||||
}
|
||||
energy+=factor_lj*(e-lj3[mtype].z);
|
||||
}
|
||||
energy+=factor_lj*(e-lj3[mtype].z);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
|
|||
class Colloid : public BaseAtomic<numtyp, acctyp> {
|
||||
public:
|
||||
Colloid();
|
||||
~Colloid();
|
||||
~Colloid();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -40,11 +40,11 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
|
|||
int init(const int ntypes, double **host_cutsq,
|
||||
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset, double *host_special_lj,
|
||||
double **host_a12, double **host_a1, double **host_a2,
|
||||
double **host_d1, double **host_d2, double **host_sigma3,
|
||||
double **host_sigma6, int **host_form,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
double **host_a12, double **host_a1, double **host_a2,
|
||||
double **host_d1, double **host_d2, double **host_sigma3,
|
||||
double **host_sigma6, int **host_form,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen);
|
||||
|
||||
/// Clear all host and device data
|
||||
|
@ -65,7 +65,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
|
|||
UCL_D_Vec<numtyp4> lj3;
|
||||
/// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2
|
||||
UCL_D_Vec<numtyp4> colloid1;
|
||||
/// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3,
|
||||
/// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3,
|
||||
/// colloid2.w = sigma6
|
||||
UCL_D_Vec<numtyp4> colloid2;
|
||||
/// form
|
||||
|
@ -76,7 +76,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
private:
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -29,9 +29,9 @@ static Colloid<PRECISION,ACC_PRECISION> COLLMF;
|
|||
// ---------------------------------------------------------------------------
|
||||
int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||
double **offset, double *special_lj,
|
||||
double **host_a12, double **host_a1, double **host_a2,
|
||||
double **host_d1, double **host_d2, double **host_sigma3,
|
||||
double **offset, double *special_lj,
|
||||
double **host_a12, double **host_a1, double **host_a2,
|
||||
double **host_d1, double **host_d2, double **host_sigma3,
|
||||
double **host_sigma6, int **host_form, const int inum,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
const double cell_size, int &gpu_mode, FILE *screen) {
|
||||
|
@ -57,9 +57,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
|
||||
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
|
||||
host_lj4, offset, special_lj, host_a12, host_a1,
|
||||
host_a2, host_d1, host_d2, host_sigma3,
|
||||
host_a2, host_d1, host_d2, host_sigma3,
|
||||
host_sigma6, host_form, inum, nall, 300,
|
||||
maxspecial, cell_size, gpu_split, screen);
|
||||
|
||||
|
@ -78,13 +78,13 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
|
||||
offset, special_lj, host_a12, host_a1, host_a2,
|
||||
host_d1, host_d2, host_sigma3, host_sigma6, host_form,
|
||||
offset, special_lj, host_a12, host_a1, host_a2,
|
||||
host_d1, host_d2, host_sigma3, host_sigma6, host_form,
|
||||
inum, nall, 300, maxspecial,
|
||||
cell_size, gpu_split, screen);
|
||||
|
||||
COLLMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -109,8 +109,8 @@ int ** colloid_gpu_compute_n(const int ago, const int inum_full,
|
|||
return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void colloid_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : ndtrung@umich.edu
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
|
|||
CoulT::~Coul() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CoulT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
|
@ -75,7 +75,7 @@ int CoulT::init(const int ntypes, double **host_scale, double **host_cutsq,
|
|||
|
||||
scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale);
|
||||
|
||||
|
||||
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
|
||||
|
||||
|
@ -97,10 +97,10 @@ void CoulT::reinit(const int ntypes, double **host_scale) {
|
|||
// Allocate a host write buffer for data initialization
|
||||
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
|
||||
|
||||
for (int i=0; i<_lj_types*_lj_types; i++)
|
||||
host_write[i]=0.0;
|
||||
|
||||
|
||||
this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale);
|
||||
}
|
||||
|
||||
|
@ -138,7 +138,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -149,14 +149,14 @@ void CoulT::loop(const bool _eflag, const bool _vflag) {
|
|||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_qqrd2e, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_qqrd2e, &this->_threads_per_atom);
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : ndtrung@umich.edu
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -33,14 +33,14 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
|
|||
const __global numtyp *restrict scale,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const numtyp qqrd2e, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
@ -50,7 +50,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
|
|||
sp_cl[1]=sp_cl_in[1];
|
||||
sp_cl[2]=sp_cl_in[2];
|
||||
sp_cl[3]=sp_cl_in[3];
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
|
@ -58,13 +58,13 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor, nbor_end;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
@ -120,14 +120,14 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
|
|||
__kernel void k_coul_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict scale,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_,
|
||||
const __global numtyp *restrict _cutsq,
|
||||
const __global numtyp *restrict _cutsq,
|
||||
const numtyp qqrd2e, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
@ -139,7 +139,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
|
|||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
cutsq[tid]=_cutsq[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
|
@ -147,15 +147,15 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor, nbor_end;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : ndtrung@umich.edu
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -30,7 +30,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
|
|||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -39,13 +39,13 @@ class Coul : public BaseCharge<numtyp, acctyp> {
|
|||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_scale,
|
||||
double **host_cutsq, double *host_special_coul,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen, const double qqrd2e);
|
||||
|
||||
|
||||
/// Send updated coeffs from host to device (to be compatible with fix adapt)
|
||||
void reinit(const int ntypes, double **host_scale);
|
||||
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
@ -68,7 +68,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _qqrd2e;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : ndtrung@umich.edu
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
|
|||
CoulDebyeT::~CoulDebye() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CoulDebyeT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
|
@ -87,7 +87,7 @@ int CoulDebyeT::init(const int ntypes, double **host_scale,
|
|||
|
||||
_qqrd2e=qqrd2e;
|
||||
_kappa=kappa;
|
||||
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=cutsq.row_bytes()+scale.row_bytes()+sp_cl.row_bytes();
|
||||
return 0;
|
||||
|
@ -98,10 +98,10 @@ void CoulDebyeT::reinit(const int ntypes, double **host_scale) {
|
|||
// Allocate a host write buffer for data initialization
|
||||
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
|
||||
|
||||
for (int i=0; i<_lj_types*_lj_types; i++)
|
||||
host_write[i]=0.0;
|
||||
|
||||
|
||||
this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale);
|
||||
}
|
||||
|
||||
|
@ -139,7 +139,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -156,9 +156,9 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) {
|
|||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
|
||||
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
|
||||
&_qqrd2e, &_kappa, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// begin :
|
||||
// email : ndtrung@umich.edu
|
||||
// ***************************************************************************/
|
||||
|
||||
|
@ -31,16 +31,16 @@ texture<int2> q_tex;
|
|||
|
||||
__kernel void k_coul_debye(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict scale,
|
||||
const int lj_types,
|
||||
const int lj_types,
|
||||
const __global numtyp *restrict sp_cl_in,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch,
|
||||
const __global numtyp *restrict q_ ,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const numtyp qqrd2e, const numtyp kappa,
|
||||
const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
|
@ -59,27 +59,27 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor, nbor_end;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_coul;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
factor_coul = sp_cl[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
|
@ -146,7 +146,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
|
|||
scale[tid]=scale_in[tid];
|
||||
cutsq[tid]=_cutsq[tid];
|
||||
}
|
||||
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
|
@ -154,15 +154,15 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
|
|||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (ii<inum) {
|
||||
int i, numj, nbor, nbor_end;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : ndtrung@umich.edu
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -30,7 +30,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
|
|||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
|
@ -39,14 +39,14 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
|
|||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_scale,
|
||||
double **host_cutsq, double *host_special_coul,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const double qqrd2e, const double kappa);
|
||||
|
||||
|
||||
/// Send updated coeffs from host to device (to be compatible with fix adapt)
|
||||
void reinit(const int ntypes, double **host_scale);
|
||||
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
@ -69,7 +69,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
|
|||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _qqrd2e,_kappa;
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
begin :
|
||||
email : ndtrung@umich.edu
|
||||
***************************************************************************/
|
||||
|
||||
|
@ -75,7 +75,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq,
|
|||
maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa);
|
||||
|
||||
CDEMF.device->gpu_barrier();
|
||||
if (message)
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
|
@ -93,16 +93,16 @@ void cdebye_gpu_reinit(const int ntypes, double **host_scale) {
|
|||
int world_me=CDEMF.device->world_me();
|
||||
int gpu_rank=CDEMF.device->gpu_rank();
|
||||
int procs_per_gpu=CDEMF.device->procs_per_gpu();
|
||||
|
||||
|
||||
if (world_me==0)
|
||||
CDEMF.reinit(ntypes, host_scale);
|
||||
|
||||
|
||||
CDEMF.device->world_barrier();
|
||||
|
||||
|
||||
for (int i=0; i<procs_per_gpu; i++) {
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
CDEMF.reinit(ntypes, host_scale);
|
||||
|
||||
|
||||
CDEMF.device->gpu_barrier();
|
||||
}
|
||||
}
|
||||
|
@ -123,8 +123,8 @@ int** cdebye_gpu_compute_n(const int ago, const int inum_full,
|
|||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q, boxlo, prd);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void cdebye_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
|
|
|
@ -37,18 +37,18 @@ template <class numtyp, class acctyp>
|
|||
CoulDSFT::~CoulDSF() {
|
||||
clear();
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CoulDSFT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split, FILE *_screen,
|
||||
const double host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e, const double e_shift, const double f_shift,
|
||||
const double qqrd2e, const double e_shift, const double f_shift,
|
||||
const double alpha) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
|
@ -123,7 +123,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
|
|||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
|
@ -134,15 +134,15 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
|
|||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->x, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv,
|
||||
this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
|
||||
&this->_threads_per_atom);
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue