/***************************************************************************
                                ocl_memory.h
                             -------------------
                               W. Michael Brown

  OpenCL Specific Memory Management and Vector/Matrix Containers

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Wed Jan 13 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */
#ifndef OCL_MEMORY_H
#define OCL_MEMORY_H

#include <iostream>
#include <cassert>
#include <cstring>
#include "ucl_types.h"

namespace ucl_opencl {
// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
struct ocl_kernel_dim {
  size_t x,y,z;
  ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) :
    x(_x), y(_y), z(_z) {}

  operator size_t * () { return (size_t *)this; }
  operator const size_t * () const { return (const size_t *)this; }
};
typedef ocl_kernel_dim ucl_kernel_dim;
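
// Illustrative sketch (not part of the library): the conversion operators
// let a kernel dimension object be passed directly wherever the OpenCL C API
// expects a const size_t* work-size array, e.g.
//
//   ucl_kernel_dim global(1024,8);  // a 1024x8x1 grid of work-items
//   // clEnqueueNDRangeKernel(cq,kernel,2,NULL,global,NULL,0,NULL,NULL);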
// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
typedef cl_mem device_ptr;
// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,NULL));

  cl_mem_flags buffer_perm;
  cl_map_flags map_perm;
  if (kind2==UCL_NOT_SPECIFIED) {
    if (kind==UCL_READ_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
      #else
      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
      #endif
      map_perm=CL_MAP_READ;
    } else if (kind==UCL_WRITE_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
      #else
      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
      #endif
      map_perm=CL_MAP_WRITE;
    } else {
      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
      map_perm=CL_MAP_READ | CL_MAP_WRITE;
    }
  } else {
    if (kind2==UCL_READ_ONLY)
      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
    else if (kind2==UCL_WRITE_ONLY)
      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
    else
      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;

    if (kind==UCL_READ_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
      #endif
      map_perm=CL_MAP_READ;
    } else if (kind==UCL_WRITE_ONLY) {
      #ifdef CL_VERSION_1_2
      buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY;
      #endif
      map_perm=CL_MAP_WRITE;
    } else
      map_perm=CL_MAP_READ | CL_MAP_WRITE;
  }

  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,NULL,NULL,NULL);
  mat.cq()=cm.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}
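
// Illustrative sketch (hypothetical names): allocate n bytes of mapped,
// host-accessible memory on the same context/queue as an existing device
// container "dev_mat":
//
//   // if (_host_alloc(host_mat,dev_mat,n,UCL_READ_WRITE,UCL_NOT_SPECIFIED)
//   //     != UCL_SUCCESS)
//   //   ... handle UCL_MEMORY_ERROR ...
//
// On success, *host_mat.host_ptr() points at the mapped region and the
// container shares dev_mat's command queue.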
template <class mat_type, class copy_type>
inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,NULL));
  cl_mem_flags orig_flags;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
                                  &orig_flags,NULL));
  orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;

  mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
                              *mat.host_ptr(), &error_flag);
  CL_CHECK_ERR(error_flag);
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  cl_mem_flags buffer_perm;
  cl_map_flags map_perm;
  if (kind==UCL_READ_ONLY) {
    #ifdef CL_VERSION_1_2
    buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
    #else
    buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
    #endif
    map_perm=CL_MAP_READ;
  } else if (kind==UCL_WRITE_ONLY) {
    #ifdef CL_VERSION_1_2
    buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
    #else
    buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
    #endif
    map_perm=CL_MAP_WRITE;
  } else {
    buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
    map_perm=CL_MAP_READ | CL_MAP_WRITE;
  }

  cl_int error_flag;
  mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,NULL,NULL,NULL);
  mat.cq()=dev.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}
template <class mat_type>
inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
  cl_int error_flag;
  mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
                              n,*mat.host_ptr(),&error_flag);
  CL_CHECK_ERR(error_flag);
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat) {
  if (mat.cols()>0) {
    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
  }
}
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,NULL));
  cl_mem_flags buffer_perm;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,
                                  sizeof(buffer_perm),&buffer_perm,NULL));
  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_map_flags map_perm;
  if (mat.kind()==UCL_READ_ONLY)
    map_perm=CL_MAP_READ;
  else if (mat.kind()==UCL_WRITE_ONLY)
    map_perm=CL_MAP_WRITE;
  else
    map_perm=CL_MAP_READ | CL_MAP_WRITE;

  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,NULL,NULL,NULL);
  return UCL_SUCCESS;
}
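
// Note that resizing releases the old buffer and maps a fresh one, so the
// existing contents are not preserved; callers must re-copy any data needed.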
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
                         const enum UCL_MEMOPT kind) {
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,NULL));
  cl_mem_flags flag;
  if (kind==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (kind==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (kind==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=cm.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                         const enum UCL_MEMOPT kind) {
  cl_int error_flag;
  cl_mem_flags flag;
  if (kind==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (kind==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (kind==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
                              &error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=dev.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
  return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  size_t padded_cols=cols;
  if (cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);
  return _device_alloc(mat,cm,pitch*rows,kind);
}

template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  size_t padded_cols=cols;
  if (dev.device_type()!=UCL_CPU && cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);
  return _device_alloc(mat,dev,pitch*rows,kind);
}
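
// Pitch arithmetic example: for a 100x1000 matrix of floats on a GPU,
// 1000%256==232, so padded_cols=1000+(256-232)=1024 and
// pitch=1024*sizeof(float)=4096 bytes; the linear allocation is then
// rows*pitch=409600 bytes.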
template <class mat_type>
inline void _device_free(mat_type &mat) {
  if (mat.cols()>0) {
    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
  }
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,NULL));
  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_mem_flags flag;
  if (mat.kind()==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (mat.kind()==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (mat.kind()==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
                          const size_t cols, size_t &pitch) {
  size_t padded_cols=cols;
  if (cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);

  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,NULL));
  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_mem_flags flag;
  if (mat.kind()==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (mat.kind()==UCL_READ_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
    #else
    flag=CL_MEM_READ_ONLY;
    #endif
  else if (mat.kind()==UCL_WRITE_ONLY)
    #ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
    #else
    flag=CL_MEM_WRITE_ONLY;
    #endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}
// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
inline void _host_zero(void *ptr, const size_t n) {
  memset(ptr,0,n);
}
inline void _ocl_build(cl_program &program, cl_device_id &device,
                       const char* options = "") {
  clBuildProgram(program,1,&device,options,NULL,NULL);

  cl_build_status build_status;
  CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
                                     sizeof(cl_build_status),&build_status,
                                     NULL));
  if (build_status == CL_SUCCESS)
    return;

  // Variable-length arrays are a compiler extension, not standard C++, so
  // the build log is heap-allocated instead.
  size_t ms;
  CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
                                     NULL, &ms));
  char *build_log = new char[ms];
  CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms,
                                     build_log, NULL));

  std::cerr << std::endl
            << "----------------------------------------------------------\n"
            << " Error compiling OpenCL Program...\n"
            << "----------------------------------------------------------\n";
  std::cerr << build_log << std::endl;
  delete[] build_log;
}
inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
                                    const char **source, const size_t lines,
                                    cl_kernel &kernel, const char *function,
                                    const char *options="") {
  cl_int error_flag;

  cl_program program=clCreateProgramWithSource(context,lines,source,
                                               NULL,&error_flag);
  CL_CHECK_ERR(error_flag);
  _ocl_build(program,device,options);
  kernel=clCreateKernel(program,function,&error_flag);
  CL_CHECK_ERR(error_flag);
}
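
// Illustrative sketch (hypothetical kernel): compile a one-line program and
// fetch a kernel handle from it:
//
//   // const char *src[1]=
//   //   {"__kernel void fill1(__global float *a)"
//   //    " { a[get_global_id(0)]=1.0f; }"};
//   // cl_kernel k;
//   // _ocl_kernel_from_source(context,device,src,1,k,"fill1");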
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
  #ifdef CL_VERSION_1_2
  #ifndef __APPLE__
  #define UCL_CL_ZERO
  #endif
  #endif

  #ifdef UCL_CL_ZERO
  cl_int zeroint=0;
  CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int),
                                   mat.byteoff(),n,0,NULL,NULL));
  #else
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,NULL));
  cl_device_id device;
  CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES,
                                sizeof(cl_device_id),&device,NULL));

  const char * szero[3]={
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
    "__kernel void _device_zero(__global NUMTYP *a, const int offset)",
    "  { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
  };

  cl_kernel kzero;
  _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
                    _UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());

  cl_int offset=mat.offset();
  CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
  CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
  size_t kn=n/sizeof(typename mat_type::data_type);
  CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0));
  #endif
}
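
// Sketch of a typical call (illustrative container name): zero the first n
// bytes of device container "dev_mat" on queue cq:
//
//   // _device_zero(dev_mat,n,cq);
//
// Note that without clEnqueueFillBuffer the fallback path compiles and
// enqueues a small fill kernel on every call.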
// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------
template<int mem1, int mem2> struct _ucl_memcpy;
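// The integer template parameters encode the memory type of the destination
// and source containers, respectively: 0 for device memory, 1 for host
// memory, and 2 for texture/image memory. Image copies are unsupported in
// the OpenCL backend, hence the assertions in the specializations below.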
// Both are images
template<> struct _ucl_memcpy<2,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Destination is texture, source on device
template<> struct _ucl_memcpy<2,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Destination is texture, source on host
template<> struct _ucl_memcpy<2,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Source is texture, dest on device
template<> struct _ucl_memcpy<0,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};

// Source is texture, dest on host
template<> struct _ucl_memcpy<1,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    assert(0==1);
  }
};
// Neither are textures, destination on host
template <> struct _ucl_memcpy<1,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 1S\n";
      #endif
      if (block) ucl_sync(cq);
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 1NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
                                     dst.begin(),0,NULL,NULL));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 2S\n";
      #endif
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 2NS\n";
    #endif
    if (spitch==dpitch && dst.cols()==src.cols() &&
        src.cols()==cols/src.element_size())
      CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
                                       spitch*rows,
                                       (char *)dst.begin()+dst_offset,0,NULL,
                                       NULL));
    else
      for (size_t i=0; i<rows; i++) {
        CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,cols,
                                         (char *)dst.begin()+dst_offset,0,NULL,
                                         NULL));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
  }
};
// Neither are textures, source on host
template <> struct _ucl_memcpy<0,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 3S\n";
      #endif
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 3NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
                                      src.begin(),0,NULL,NULL));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 4S\n";
      #endif
      return;
    }
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 4NS\n";
    #endif
    if (spitch==dpitch && dst.cols()==src.cols() &&
        src.cols()==cols/src.element_size())
      CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
                                        spitch*rows,
                                        (char *)src.begin()+src_offset,0,NULL,
                                        NULL));
    else
      for (size_t i=0; i<rows; i++) {
        CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
                                          cols,(char *)src.begin()+src_offset,
                                          0,NULL,NULL));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
  }
};
// Neither are textures, both on device
template <int mem1, int mem2> struct _ucl_memcpy {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
      CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
                                       dst_offset,n,0,NULL,NULL));
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 6NS\n";
      #endif
    }
    #ifdef UCL_DBG_MEM_TRACE
    else std::cerr << "UCL_COPY 6S\n";
    #endif
    if (block==CL_TRUE) ucl_sync(cq);
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, cl_command_queue &cq,
                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 7NS\n";
      #endif
      if (spitch==dpitch && dst.cols()==src.cols() &&
          src.cols()==cols/src.element_size())
        CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),
                                         src_offset,dst_offset,spitch*rows,
                                         0,NULL,NULL));
      else
        for (size_t i=0; i<rows; i++) {
          CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),
                                           src_offset,dst_offset,cols,0,
                                           NULL,NULL));
          src_offset+=spitch;
          dst_offset+=dpitch;
        }
    }
    #ifdef UCL_DBG_MEM_TRACE
    else std::cerr << "UCL_COPY 7S\n";
    #endif
    if (block==CL_TRUE) ucl_sync(cq);
  }
};
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,dst.cq(),CL_TRUE,
                                                 dst.byteoff(),src.byteoff());
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
                       cl_command_queue &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,cq,CL_FALSE,
                                                 dst.byteoff(),src.byteoff());
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                 rows,dst.cq(),CL_TRUE,
                                                 dst.byteoff(),src.byteoff());
}

template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows, cl_command_queue &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                 rows,cq,CL_FALSE,
                                                 dst.byteoff(),src.byteoff());
}
} // namespace ucl_opencl
#endif