forked from lijiext/lammps
761 lines
27 KiB
C++
761 lines
27 KiB
C++
/***************************************************************************
|
|
ocl_memory.h
|
|
-------------------
|
|
W. Michael Brown
|
|
|
|
OpenCL Specific Memory Management and Vector/Matrix Containers
|
|
|
|
__________________________________________________________________________
|
|
This file is part of the Geryon Unified Coprocessor Library (UCL)
|
|
__________________________________________________________________________
|
|
|
|
begin : Wed Jan 13 2010
|
|
copyright : (C) 2010 by W. Michael Brown
|
|
email : brownw@ornl.gov
|
|
***************************************************************************/
|
|
|
|
/* -----------------------------------------------------------------------
|
|
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the Simplified BSD License.
|
|
----------------------------------------------------------------------- */
|
|
|
|
#ifndef OCL_MEMORY_H
|
|
#define OCL_MEMORY_H
|
|
|
|
#include <iostream>
|
|
#include <cassert>
|
|
#include <cstring>
|
|
#include "ucl_types.h"
|
|
|
|
namespace ucl_opencl {
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - API Specific Types
|
|
// --------------------------------------------------------------------------
|
|
struct ocl_kernel_dim {
|
|
size_t x,y,z;
|
|
ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) :
|
|
x(_x), y(_y), z(_z) {}
|
|
operator size_t * () { return (size_t *)this; }
|
|
operator const size_t * () const { return (const size_t *)this; }
|
|
};
|
|
typedef ocl_kernel_dim ucl_kernel_dim;
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - API SPECIFIC DEVICE POINTERS
|
|
// --------------------------------------------------------------------------
|
|
typedef cl_mem device_ptr;
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - HOST MEMORY ALLOCATION ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
|
|
template <class mat_type, class copy_type>
|
|
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
|
cl_int error_flag;
|
|
cl_context context;
|
|
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
|
&context,NULL));
|
|
|
|
cl_mem_flags buffer_perm;
|
|
cl_map_flags map_perm;
|
|
if (kind2==UCL_NOT_SPECIFIED) {
|
|
if (kind==UCL_READ_ONLY) {
|
|
#ifdef CL_VERSION_1_2
|
|
buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
|
|
#else
|
|
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
|
|
#endif
|
|
map_perm=CL_MAP_READ;
|
|
} else if (kind==UCL_WRITE_ONLY) {
|
|
#ifdef CL_VERSION_1_2
|
|
buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
|
|
#else
|
|
buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
|
|
#endif
|
|
map_perm=CL_MAP_WRITE;
|
|
} else {
|
|
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
|
|
map_perm=CL_MAP_READ | CL_MAP_WRITE;
|
|
}
|
|
} else {
|
|
if (kind2==UCL_READ_ONLY)
|
|
buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
|
|
else if (kind2==UCL_WRITE_ONLY)
|
|
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
|
|
else
|
|
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
|
|
|
|
if (kind==UCL_READ_ONLY) {
|
|
#ifdef CL_VERSION_1_2
|
|
buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
|
|
#endif
|
|
map_perm=CL_MAP_READ;
|
|
} else if (kind==UCL_WRITE_ONLY) {
|
|
#ifdef CL_VERSION_1_2
|
|
buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY;
|
|
#endif
|
|
map_perm=CL_MAP_WRITE;
|
|
} else
|
|
map_perm=CL_MAP_READ | CL_MAP_WRITE;
|
|
}
|
|
|
|
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
|
|
if (error_flag != CL_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
*mat.host_ptr() = (typename mat_type::data_type*)
|
|
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
|
|
map_perm,0,n,0,NULL,NULL,NULL);
|
|
|
|
mat.cq()=cm.cq();
|
|
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type, class copy_type>
|
|
inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
|
|
cl_int error_flag;
|
|
cl_context context;
|
|
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
|
&context,NULL));
|
|
cl_mem_flags orig_flags;
|
|
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
|
|
&orig_flags,NULL));
|
|
orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
|
|
|
|
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
|
|
*mat.host_ptr(), &error_flag);
|
|
|
|
CL_CHECK_ERR(error_flag);
|
|
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
|
cl_mem_flags buffer_perm;
|
|
cl_map_flags map_perm;
|
|
if (kind==UCL_READ_ONLY) {
|
|
#ifdef CL_VERSION_1_2
|
|
buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
|
|
#else
|
|
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
|
|
#endif
|
|
map_perm=CL_MAP_READ;
|
|
} else if (kind==UCL_WRITE_ONLY) {
|
|
#ifdef CL_VERSION_1_2
|
|
buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
|
|
#else
|
|
buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
|
|
#endif
|
|
map_perm=CL_MAP_WRITE;
|
|
} else {
|
|
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
|
|
map_perm=CL_MAP_READ | CL_MAP_WRITE;
|
|
}
|
|
|
|
cl_int error_flag;
|
|
mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
|
|
if (error_flag != CL_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
|
|
*mat.host_ptr() = (typename mat_type::data_type*)
|
|
clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
|
|
map_perm,0,n,0,NULL,NULL,NULL);
|
|
mat.cq()=dev.cq();
|
|
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
|
|
cl_int error_flag;
|
|
mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
|
|
n,*mat.host_ptr(),&error_flag);
|
|
CL_CHECK_ERR(error_flag);
|
|
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline void _host_free(mat_type &mat) {
|
|
if (mat.cols()>0) {
|
|
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
|
|
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
|
|
}
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline int _host_resize(mat_type &mat, const size_t n) {
|
|
cl_int error_flag;
|
|
cl_context context;
|
|
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
|
&context,NULL));
|
|
cl_mem_flags buffer_perm;
|
|
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,sizeof(buffer_perm),
|
|
&buffer_perm,NULL));
|
|
|
|
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
|
|
|
|
cl_map_flags map_perm;
|
|
if (mat.kind()==UCL_READ_ONLY)
|
|
map_perm=CL_MAP_READ;
|
|
else if (mat.kind()==UCL_WRITE_ONLY)
|
|
map_perm=CL_MAP_WRITE;
|
|
else
|
|
map_perm=CL_MAP_READ | CL_MAP_WRITE;
|
|
|
|
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
|
|
if (error_flag != CL_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
*mat.host_ptr() = (typename mat_type::data_type*)
|
|
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
|
|
map_perm,0,n,0,NULL,NULL,NULL);
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - DEVICE MEMORY ALLOCATION ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
|
|
template <class mat_type, class copy_type>
|
|
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|
const enum UCL_MEMOPT kind) {
|
|
cl_int error_flag;
|
|
|
|
cl_context context;
|
|
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
|
&context,NULL));
|
|
cl_mem_flags flag;
|
|
if (kind==UCL_READ_WRITE)
|
|
flag=CL_MEM_READ_WRITE;
|
|
else if (kind==UCL_READ_ONLY)
|
|
#ifdef CL_VERSION_1_2
|
|
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
|
|
#else
|
|
flag=CL_MEM_READ_ONLY;
|
|
#endif
|
|
else if (kind==UCL_WRITE_ONLY)
|
|
#ifdef CL_VERSION_1_2
|
|
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
|
|
#else
|
|
flag=CL_MEM_WRITE_ONLY;
|
|
#endif
|
|
else
|
|
assert(0==1);
|
|
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
|
|
if (error_flag != CL_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
mat.cq()=cm.cq();
|
|
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|
const enum UCL_MEMOPT kind) {
|
|
cl_int error_flag;
|
|
cl_mem_flags flag;
|
|
if (kind==UCL_READ_WRITE)
|
|
flag=CL_MEM_READ_WRITE;
|
|
else if (kind==UCL_READ_ONLY)
|
|
#ifdef CL_VERSION_1_2
|
|
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
|
|
#else
|
|
flag=CL_MEM_READ_ONLY;
|
|
#endif
|
|
else if (kind==UCL_WRITE_ONLY)
|
|
#ifdef CL_VERSION_1_2
|
|
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
|
|
#else
|
|
flag=CL_MEM_WRITE_ONLY;
|
|
#endif
|
|
else
|
|
assert(0==1);
|
|
mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
|
|
&error_flag);
|
|
if (error_flag != CL_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
mat.cq()=dev.cq();
|
|
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type, class copy_type>
|
|
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
|
|
const size_t cols, size_t &pitch,
|
|
const enum UCL_MEMOPT kind) {
|
|
size_t padded_cols=cols;
|
|
if (cols%256!=0)
|
|
padded_cols+=256-cols%256;
|
|
pitch=padded_cols*sizeof(typename mat_type::data_type);
|
|
return _device_alloc(mat,cm,pitch*rows,kind);
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
|
|
const size_t cols, size_t &pitch,
|
|
const enum UCL_MEMOPT kind) {
|
|
size_t padded_cols=cols;
|
|
if (dev.device_type()!=UCL_CPU && cols%256!=0)
|
|
padded_cols+=256-cols%256;
|
|
pitch=padded_cols*sizeof(typename mat_type::data_type);
|
|
return _device_alloc(mat,dev,pitch*rows,kind);
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline void _device_free(mat_type &mat) {
|
|
if (mat.cols()>0) {
|
|
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
|
|
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
|
|
}
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline int _device_resize(mat_type &mat, const size_t n) {
|
|
cl_int error_flag;
|
|
|
|
cl_context context;
|
|
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
|
&context,NULL));
|
|
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
|
|
|
|
cl_mem_flags flag;
|
|
if (mat.kind()==UCL_READ_WRITE)
|
|
flag=CL_MEM_READ_WRITE;
|
|
else if (mat.kind()==UCL_READ_ONLY)
|
|
#ifdef CL_VERSION_1_2
|
|
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
|
|
#else
|
|
flag=CL_MEM_READ_ONLY;
|
|
#endif
|
|
else if (mat.kind()==UCL_WRITE_ONLY)
|
|
#ifdef CL_VERSION_1_2
|
|
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
|
|
#else
|
|
flag=CL_MEM_WRITE_ONLY;
|
|
#endif
|
|
else
|
|
assert(0==1);
|
|
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
|
|
if (error_flag != CL_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline int _device_resize(mat_type &mat, const size_t rows,
|
|
const size_t cols, size_t &pitch) {
|
|
size_t padded_cols=cols;
|
|
if (cols%256!=0)
|
|
padded_cols+=256-cols%256;
|
|
pitch=padded_cols*sizeof(typename mat_type::data_type);
|
|
|
|
cl_int error_flag;
|
|
|
|
cl_context context;
|
|
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
|
&context,NULL));
|
|
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
|
|
|
|
cl_mem_flags flag;
|
|
if (mat.kind()==UCL_READ_WRITE)
|
|
flag=CL_MEM_READ_WRITE;
|
|
else if (mat.kind()==UCL_READ_ONLY)
|
|
#ifdef CL_VERSION_1_2
|
|
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
|
|
#else
|
|
flag=CL_MEM_READ_ONLY;
|
|
#endif
|
|
else if (mat.kind()==UCL_WRITE_ONLY)
|
|
#ifdef CL_VERSION_1_2
|
|
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
|
|
#else
|
|
flag=CL_MEM_WRITE_ONLY;
|
|
#endif
|
|
else
|
|
assert(0==1);
|
|
mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
|
|
if (error_flag != CL_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - ZERO ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
inline void _host_zero(void *ptr, const size_t n) {
|
|
memset(ptr,0,n);
|
|
}
|
|
|
|
inline void _ocl_build(cl_program &program, cl_device_id &device,
|
|
const char* options = "") {
|
|
clBuildProgram(program,1,&device,options,NULL,NULL);
|
|
|
|
cl_build_status build_status;
|
|
CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
|
|
sizeof(cl_build_status),&build_status,
|
|
NULL));
|
|
if (build_status == CL_SUCCESS)
|
|
return;
|
|
|
|
size_t ms;
|
|
CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
|
|
NULL, &ms));
|
|
char *build_log = new char[ms];
|
|
CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms,
|
|
build_log, NULL));
|
|
|
|
std::cerr << std::endl
|
|
<< "----------------------------------------------------------\n"
|
|
<< " Error compiling OpenCL Program...\n"
|
|
<< "----------------------------------------------------------\n";
|
|
std::cerr << build_log << std::endl;
|
|
delete[] build_log;
|
|
}
|
|
|
|
inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
|
|
const char **source, const size_t lines,
|
|
cl_kernel &kernel, const char *function,
|
|
const char *options="") {
|
|
cl_int error_flag;
|
|
|
|
cl_program program=clCreateProgramWithSource(context,lines,source,
|
|
NULL,&error_flag);
|
|
CL_CHECK_ERR(error_flag);
|
|
_ocl_build(program,device,options);
|
|
kernel=clCreateKernel(program,function,&error_flag);
|
|
CL_CHECK_ERR(error_flag);
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
|
|
#ifdef CL_VERSION_1_2
|
|
#ifndef __APPLE__
|
|
#define UCL_CL_ZERO
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef UCL_CL_ZERO
|
|
cl_int zeroint=0;
|
|
CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int),
|
|
mat.byteoff(),n,0,NULL,NULL));
|
|
|
|
#else
|
|
cl_context context;
|
|
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
|
&context,NULL));
|
|
cl_device_id device;
|
|
CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES,
|
|
sizeof(cl_device_id),&device,NULL));
|
|
|
|
const char * szero[3]={
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
|
"__kernel void _device_zero(__global NUMTYP *a, const int offset)",
|
|
" { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
|
|
};
|
|
|
|
cl_kernel kzero;
|
|
_ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
|
|
_UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());
|
|
|
|
cl_int offset=mat.offset();
|
|
CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
|
|
CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
|
|
size_t kn=n/sizeof(typename mat_type::data_type);
|
|
CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0));
|
|
#endif
|
|
}
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - MEMCPY ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
|
|
template<int mem1, int mem2> struct _ucl_memcpy;
|
|
|
|
// Both are images
|
|
template<> struct _ucl_memcpy<2,2> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
cl_command_queue &cq, const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, cl_command_queue &cq,
|
|
const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
};
|
|
|
|
// Destination is texture, source on device
|
|
template<> struct _ucl_memcpy<2,0> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
cl_command_queue &cq, const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, cl_command_queue &cq,
|
|
const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
};
|
|
|
|
// Destination is texture, source on host
|
|
template<> struct _ucl_memcpy<2,1> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
cl_command_queue &cq, const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, cl_command_queue &cq,
|
|
const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
};
|
|
|
|
// Source is texture, dest on device
|
|
template<> struct _ucl_memcpy<0,2> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
cl_command_queue &cq, const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, cl_command_queue &cq,
|
|
const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
};
|
|
|
|
// Source is texture, dest on host
|
|
template<> struct _ucl_memcpy<1,2> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
cl_command_queue &cq, const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, cl_command_queue &cq,
|
|
const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
assert(0==1);
|
|
}
|
|
};
|
|
|
|
// Neither are textures, destination on host
|
|
template <> struct _ucl_memcpy<1,0> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
cl_command_queue &cq, const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
if (src.cbegin()==dst.cbegin()) {
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 1S\n";
|
|
#endif
|
|
if (block) ucl_sync(cq);
|
|
return;
|
|
}
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 1NS\n";
|
|
#endif
|
|
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
|
|
dst.begin(),0,NULL,NULL));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, cl_command_queue &cq,
|
|
const cl_bool block,
|
|
size_t dst_offset, size_t src_offset) {
|
|
if (src.cbegin()==dst.cbegin()) {
|
|
if (block) ucl_sync(cq);
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 2S\n";
|
|
#endif
|
|
return;
|
|
}
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 2NS\n";
|
|
#endif
|
|
if (spitch==dpitch && dst.cols()==src.cols() &&
|
|
src.cols()==cols/src.element_size())
|
|
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
|
|
spitch*rows,
|
|
(char *)dst.begin()+dst_offset,0,NULL,
|
|
NULL));
|
|
else
|
|
for (size_t i=0; i<rows; i++) {
|
|
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,cols,
|
|
(char *)dst.begin()+dst_offset,0,NULL,
|
|
NULL));
|
|
src_offset+=spitch;
|
|
dst_offset+=dpitch;
|
|
}
|
|
}
|
|
};
|
|
|
|
// Neither are textures, source on host
|
|
template <> struct _ucl_memcpy<0,1> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
cl_command_queue &cq, const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
if (src.cbegin()==dst.cbegin()) {
|
|
if (block) ucl_sync(cq);
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 3S\n";
|
|
#endif
|
|
return;
|
|
}
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 3NS\n";
|
|
#endif
|
|
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
|
|
src.begin(),0,NULL,NULL));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, cl_command_queue &cq,
|
|
const cl_bool block,
|
|
size_t dst_offset, size_t src_offset) {
|
|
if (src.cbegin()==dst.cbegin()) {
|
|
if (block) ucl_sync(cq);
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 4S\n";
|
|
#endif
|
|
return;
|
|
}
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 4NS\n";
|
|
#endif
|
|
if (spitch==dpitch && dst.cols()==src.cols() &&
|
|
src.cols()==cols/src.element_size())
|
|
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
|
|
spitch*rows,
|
|
(char *)src.begin()+src_offset,0,NULL,
|
|
NULL));
|
|
else
|
|
for (size_t i=0; i<rows; i++) {
|
|
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,cols,
|
|
(char *)src.begin()+src_offset,0,NULL,
|
|
NULL));
|
|
src_offset+=spitch;
|
|
dst_offset+=dpitch;
|
|
}
|
|
}
|
|
};
|
|
|
|
// Neither are textures, both on device
|
|
template <int mem1, int mem2> struct _ucl_memcpy {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
cl_command_queue &cq, const cl_bool block,
|
|
const size_t dst_offset, const size_t src_offset) {
|
|
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
|
|
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
|
|
dst_offset,n,0,NULL,NULL));
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 6NS\n";
|
|
#endif
|
|
}
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
else std::cerr << "UCL_COPY 6S\n";
|
|
#endif
|
|
|
|
if (block==CL_TRUE) ucl_sync(cq);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, cl_command_queue &cq,
|
|
const cl_bool block,
|
|
size_t dst_offset, size_t src_offset) {
|
|
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
std::cerr << "UCL_COPY 7NS\n";
|
|
#endif
|
|
if (spitch==dpitch && dst.cols()==src.cols() &&
|
|
src.cols()==cols/src.element_size())
|
|
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
|
|
dst_offset,spitch*rows,0,NULL,NULL));
|
|
|
|
else
|
|
for (size_t i=0; i<rows; i++) {
|
|
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),
|
|
src_offset,dst_offset,cols,0,
|
|
NULL,NULL));
|
|
src_offset+=spitch;
|
|
dst_offset+=dpitch;
|
|
}
|
|
}
|
|
#ifdef UCL_DBG_MEM_TRACE
|
|
else std::cerr << "UCL_COPY 7S\n";
|
|
#endif
|
|
|
|
if (block==CL_TRUE) ucl_sync(cq);
|
|
}
|
|
};
|
|
|
|
template<class mat1, class mat2>
|
|
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
|
|
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,dst.cq(),CL_TRUE,
|
|
dst.byteoff(),src.byteoff());
|
|
}
|
|
|
|
template<class mat1, class mat2>
|
|
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
|
|
cl_command_queue &cq) {
|
|
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,cq,CL_FALSE,
|
|
dst.byteoff(),src.byteoff());
|
|
}
|
|
|
|
template<class mat1, class mat2>
|
|
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
|
|
rows,dst.cq(),CL_TRUE,
|
|
dst.byteoff(),src.byteoff());
|
|
}
|
|
|
|
template<class mat1, class mat2>
|
|
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows,cl_command_queue &cq) {
|
|
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
|
|
rows,cq,CL_FALSE,
|
|
dst.byteoff(),src.byteoff());
|
|
}
|
|
|
|
} // namespace ucl_cudart
|
|
|
|
#endif
|
|
|