forked from lijiext/lammps
941 lines
39 KiB
C++
941 lines
39 KiB
C++
/***************************************************************************
|
|
ucl_copy.h
|
|
-------------------
|
|
W. Michael Brown
|
|
|
|
Routines for copying matrix/vector data onto and off coprocessor device
|
|
|
|
__________________________________________________________________________
|
|
This file is part of the Geryon Unified Coprocessor Library (UCL)
|
|
__________________________________________________________________________
|
|
|
|
begin : Mon Jan 4 2010
|
|
copyright : (C) 2010 by W. Michael Brown
|
|
email : brownw@ornl.gov
|
|
***************************************************************************/
|
|
|
|
/* -----------------------------------------------------------------------
|
|
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the Simplified BSD License.
|
|
----------------------------------------------------------------------- */
|
|
|
|
/***************************************************************************
|
|
The ucl_copy and ucl_cast_copy routines provide a general prototype for
|
|
copying data between host and device memory (including texture memory)
|
|
for the matrix and vector types in nvc_memory.
|
|
|
|
For host/host and host/device transfers, typecasting is performed
|
|
automatically as necessary.
|
|
|
|
The routines are written so that all branches can be removed by the
|
|
compiler during template instantiation.
|
|
|
|
The routines currently assume row-major ordering for all types.
|
|
|
|
For asynchronous copy in the default command queue, async is boolean true;
|
|
For asynchronous copy in a specified command queue, async is command queue
|
|
Otherwise, set async to boolean false;
|
|
|
|
When performing frequent data copies that require casting, it is more
|
|
efficient to allocate a casting buffer once and then pass that buffer
|
|
to the copy routine. This can be accomplished with the ucl_cast_copy
|
|
routines.
|
|
|
|
Examples
|
|
(x's represent alignment padding - to maintain alignment)
|
|
(o's represent a larger matrix in memory)
|
|
(vectors represented as single row)
|
|
----------------------------------------------------------------
|
|
dst src command
|
|
----------------------------------------------------------------
|
|
0 1 2 3 4 <-- 0 1 2 3 4 ucl_copy(dst,src,async)
|
|
|
|
0 1 2 3 <-- 0 1 2 3 4 ucl_copy(dst,src,4,async)
|
|
|
|
0 1 2 <-- 0 1 2 3 4 5 ucl_copy(dst,src,async)
|
|
3 4 5
|
|
|
|
0 1 2 3 4 5 <-- 0 1 2 ucl_copy(dst,src,async)
|
|
3 4 5
|
|
|
|
0 1 2 <-- 0 1 2 ucl_copy(dst,src,async)
|
|
3 4 5 3 4 5
|
|
|
|
0 1 2 <-- 0 1 2 ucl_copy(dst,src,6,async)
|
|
3 4 5 3 4 5
|
|
6 7 8
|
|
|
|
0 1 2 <-- 0 1 2 3 ucl_copy(dst,src,2,3,async)
|
|
4 5 6 4 5 6 7
|
|
8 9 10 11
|
|
|
|
0 1 2 x x <-- 0 1 2 ucl_copy(dst,src,async)
|
|
3 4 5 x x 3 4 5
|
|
|
|
0 1 2 <-- 0 1 2 x x ucl_copy(dst,src,async)
|
|
3 4 5 3 4 5 x x
|
|
|
|
0 1 2 o o <-- 0 1 2 ucl_copy(dst,src,2,3,async)
|
|
3 4 5 o o 3 4 5
|
|
o o o o o
|
|
|
|
0 1 2 o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,3,async)
|
|
3 4 5 o o
|
|
o o o o o
|
|
|
|
0 1 o o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,2,async)
|
|
2 3 o o o
|
|
o o o o o
|
|
|
|
0 1 2 o o <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async)
|
|
5 6 7 o o 5 6 7 8 9
|
|
o o o o o 10 11 12 13 14
|
|
|
|
0 1 2 5 6 7 <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async)
|
|
5 6 7 8 9
|
|
10 11 12 13 14
|
|
|
|
***************************************************************************/
|
|
|
|
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
|
|
#ifdef UCL_COPY_ALLOW
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY
|
|
// --------------------------------------------------------------------------
|
|
template <class mat1, class mat2>
|
|
inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
|
|
if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) {
|
|
if (dst.kind()==UCL_READ_ONLY) {
|
|
std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n";
|
|
assert(0==1);
|
|
} else if (src.kind()==UCL_WRITE_ONLY) {
|
|
std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n";
|
|
assert(0==1);
|
|
}
|
|
} else {
|
|
if (dst.kind()==UCL_WRITE_ONLY) {
|
|
std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n";
|
|
assert(0==1);
|
|
} else if (src.kind()==UCL_READ_ONLY) {
|
|
std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n";
|
|
assert(0==1);
|
|
}
|
|
}
|
|
}
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - HOST-HOST COPY ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
|
|
// Dispatch through a class template specialization because some container
// types do not provide operator[]
template <int host_t1, int host_t2> struct _host_host_copy;

// Both containers are on the host
template <> struct _host_host_copy<1,1> {
  /// Copy the first numel elements of src into dst
  /** - Matching, non-zero DATA_TYPE ids: a single memcpy of
    *   numel*sizeof(mat1::data_type) bytes
    * - Otherwise: element-wise static_cast through operator[]
    * - With _OCL_MAT defined, a copy where dst.begin()==src.begin() is
    *   skipped **/
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
    #ifdef UCL_DEBUG
    assert(mat1::PADDED==0 && mat2::PADDED==0);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    typedef typename mat1::data_type dst_t;
    const bool raw_copy=((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE &&
                         mat1::DATA_TYPE!=0);

    if (!raw_copy) {
      for (size_t k=0; k<numel; ++k)
        dst[k]=static_cast<dst_t>(src[k]);
      return;
    }

    #ifdef _OCL_MAT
    if (dst.begin()==src.begin()) {
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 7S\n";
      #endif
      return;
    }
    #endif
    memcpy(dst.begin(),src.begin(),numel*sizeof(dst_t));
    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 7NS\n";
    #endif
  }

  /// Copy a rows x cols tile of src into dst
  /** - A container with VECTOR set is addressed with a row stride of cols;
    *   otherwise its row_size() is the stride
    * - Matching, non-zero DATA_TYPE ids: one memcpy per row
    * - Otherwise: element-wise static_cast through operator[]
    * - With _OCL_MAT defined, a copy where dst.begin()==src.begin() is
    *   skipped **/
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
                         const size_t cols) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    typedef typename mat1::data_type dst_t;

    // Row strides in elements
    size_t dst_stride=cols, src_stride=cols;
    if (!mat1::VECTOR)
      dst_stride=dst.row_size();
    if (!mat2::VECTOR)
      src_stride=src.row_size();

    const bool raw_copy=((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE &&
                         mat1::DATA_TYPE!=0);

    if (!raw_copy) {
      for (size_t r=0; r<rows; ++r) {
        const size_t d0=r*dst_stride, s0=r*src_stride;
        for (size_t c=0; c<cols; ++c)
          dst[d0+c]=static_cast<dst_t>(src[s0+c]);
      }
      return;
    }

    #ifdef _OCL_MAT
    if (dst.begin()==src.begin()) {
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 8S\n";
      #endif
      return;
    }
    #endif

    #ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 8NS\n";
    #endif
    const size_t row_bytes=cols*sizeof(dst_t);
    for (size_t r=0; r<rows; ++r)
      memcpy(dst.begin()+r*dst_stride,src.begin()+r*src_stride,row_bytes);
  }
};
|
|
|
|
// Fallback for every memory-type pair other than <1,1>. The copy routines
// only call hhc when both containers are on the host, so reaching either
// overload at run time is a logic error and trips the assert.
template <int host_t1, int host_t2> struct _host_host_copy {
  /// 1D variant - must never be executed
  template <class mat1, class mat2>
  static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/,
                         const size_t /*numel*/) {
    assert(0==1);
  }

  /// 2D variant - must never be executed
  template <class mat1, class mat2>
  static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/,
                         const size_t /*rows*/, const size_t /*cols*/) {
    assert(0==1);
  }
};
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - TEMPLATE HELPER FUNCTIONS FOR SPECIALIZED CASTING
|
|
// --------------------------------------------------------------------------
|
|
|
|
// Helper functions for ucl_cast_copy
|
|
template <int host_type1, int host_type2> struct _ucl_cast_copy;
|
|
|
|
// Destination is on host
|
|
template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer) {
|
|
ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type));
|
|
for (size_t i=0; i<numel; i++)
|
|
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer,command_queue &cq) {
|
|
ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type),cq);
|
|
cast_buffer.sync();
|
|
for (size_t i=0; i<numel; i++)
|
|
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer) {
|
|
// Asynchronous currently pointless here
|
|
#ifdef UCL_DEBUG
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
|
|
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
|
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
|
#endif
|
|
if (mat1::VECTOR) {
|
|
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
|
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
|
|
for (size_t i=0; i<rows*cols; i++)
|
|
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
|
|
} else {
|
|
if (mat2::VECTOR)
|
|
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
|
cols*sizeof(typename mat2::data_type),
|
|
cols*sizeof(typename mat2::data_type),rows);
|
|
else
|
|
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
|
src.row_bytes(),cols*sizeof(typename mat2::data_type),
|
|
rows);
|
|
size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
|
|
for (size_t i=0; i<rows; i++) {
|
|
for (size_t j=0; j<cols; j++) {
|
|
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
|
|
buff_i++;
|
|
dst_i++;
|
|
}
|
|
dst_i+=doff;
|
|
}
|
|
}
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer,
|
|
command_queue &cq) {
|
|
// Asynchronous currently pointless here
|
|
#ifdef UCL_DEBUG
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
|
|
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
|
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
|
#endif
|
|
if (mat1::VECTOR) {
|
|
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
|
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq);
|
|
cast_buffer.sync();
|
|
for (size_t i=0; i<rows*cols; i++)
|
|
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
|
|
} else {
|
|
if (mat2::VECTOR)
|
|
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
|
cols*sizeof(typename mat2::data_type),
|
|
cols*sizeof(typename mat2::data_type),rows,cq);
|
|
else
|
|
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
|
|
src.row_bytes(),cols*sizeof(typename mat2::data_type),
|
|
rows,cq);
|
|
cast_buffer.sync();
|
|
size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
|
|
for (size_t i=0; i<rows; i++) {
|
|
for (size_t j=0; j<cols; j++) {
|
|
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
|
|
buff_i++;
|
|
dst_i++;
|
|
}
|
|
dst_i+=doff;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
// Source is on host
|
|
template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer) {
|
|
for (size_t i=0; i<numel; i++)
|
|
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
|
|
ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type));
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer, command_queue &cq) {
|
|
for (size_t i=0; i<numel; i++)
|
|
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
|
|
ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type),cq);
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer) {
|
|
#ifdef UCL_DEBUG
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
|
|
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
|
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
|
if (mat3::VECTOR==0) {
|
|
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
|
|
assert(dst.rows()>=rows && dst.cols()>=cols);
|
|
}
|
|
#endif
|
|
if (mat2::VECTOR) {
|
|
if (mat3::VECTOR==0) {
|
|
size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
|
|
for (size_t i=0; i<rows; i++) {
|
|
for (size_t j=0; j<cols; j++) {
|
|
cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
|
|
ci++;
|
|
si++;
|
|
}
|
|
ci+=co;
|
|
si+=so;
|
|
}
|
|
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
|
|
cols*sizeof(typename mat1::data_type),rows);
|
|
} else {
|
|
for (size_t i=0; i<rows*cols; i++)
|
|
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
|
|
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
|
|
cols*sizeof(typename mat1::data_type),
|
|
cols*sizeof(typename mat1::data_type),rows);
|
|
}
|
|
} else if (mat1::VECTOR) {
|
|
size_t src_i=0, buf_i=0, soff=src.cols()-cols;
|
|
for (size_t i=0; i<rows; i++) {
|
|
for (size_t j=0; j<cols; j++) {
|
|
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
|
|
buf_i++;
|
|
src_i++;
|
|
}
|
|
src_i+=soff;
|
|
}
|
|
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
|
|
} else {
|
|
size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
|
|
if (mat3::VECTOR==0) {
|
|
co=cast_buffer.cols()-cols;
|
|
spitch=cast_buffer.row_bytes();
|
|
} else {
|
|
co=0;
|
|
spitch=cols*sizeof(typename mat1::data_type);
|
|
}
|
|
for (size_t i=0; i<rows; i++) {
|
|
for (size_t j=0; j<cols; j++) {
|
|
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
|
|
buf_i++;
|
|
src_i++;
|
|
}
|
|
src_i+=so;
|
|
buf_i+=co;
|
|
}
|
|
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
|
|
cols*sizeof(typename mat1::data_type),rows);
|
|
}
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer,
|
|
command_queue &cq) {
|
|
#ifdef UCL_DEBUG
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
|
|
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
|
|
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
|
|
if (mat3::VECTOR==0) {
|
|
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
|
|
assert(dst.rows()>=rows && dst.cols()>=cols);
|
|
}
|
|
#endif
|
|
if (mat2::VECTOR) {
|
|
if (mat3::VECTOR==0) {
|
|
size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
|
|
for (size_t i=0; i<rows; i++) {
|
|
for (size_t j=0; j<cols; j++) {
|
|
cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
|
|
ci++;
|
|
si++;
|
|
}
|
|
ci+=co;
|
|
si+=so;
|
|
}
|
|
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
|
|
cols*sizeof(typename mat1::data_type),rows);
|
|
} else {
|
|
for (size_t i=0; i<rows*cols; i++)
|
|
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
|
|
ucl_mv_cpy(dst,dst.row_bytes(),
|
|
cast_buffer,cols*sizeof(typename mat1::data_type),
|
|
cols*sizeof(typename mat1::data_type),rows,cq);
|
|
}
|
|
} else if (mat1::VECTOR) {
|
|
size_t src_i=0, buf_i=0, soff=src.cols()-cols;
|
|
for (size_t i=0; i<rows; i++) {
|
|
for (size_t j=0; j<cols; j++) {
|
|
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
|
|
buf_i++;
|
|
src_i++;
|
|
}
|
|
src_i+=soff;
|
|
}
|
|
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
|
|
} else {
|
|
size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
|
|
if (mat3::VECTOR==0) {
|
|
co=cast_buffer.cols()-cols;
|
|
spitch=cast_buffer.row_bytes();
|
|
} else {
|
|
co=0;
|
|
spitch=cols*sizeof(typename mat1::data_type);
|
|
}
|
|
for (size_t i=0; i<rows; i++) {
|
|
for (size_t j=0; j<cols; j++) {
|
|
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
|
|
buf_i++;
|
|
src_i++;
|
|
}
|
|
src_i+=so;
|
|
buf_i+=co;
|
|
}
|
|
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
|
|
cols*sizeof(typename mat1::data_type),rows,cq);
|
|
}
|
|
}
|
|
};
|
|
|
|
// Neither on host or both on host
|
|
template <> struct _ucl_cast_copy<1,1> {
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer, command_queue &cq) {
|
|
assert(0==1);
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer) {
|
|
assert(0==1);
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer) {
|
|
assert(0==1);
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer,
|
|
command_queue &cq) {
|
|
assert(0==1);
|
|
}
|
|
};
|
|
|
|
// Neither on host or both on host
|
|
template <> struct _ucl_cast_copy<0,0> {
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer, command_queue &cq) {
|
|
assert(0==1);
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer) {
|
|
assert(0==1);
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer) {
|
|
assert(0==1);
|
|
}
|
|
template <class mat1, class mat2, class mat3>
|
|
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer,
|
|
command_queue &cq) {
|
|
assert(0==1);
|
|
}
|
|
};
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - 1D COPY - SPECIFIED NUMBER OF BYTES
|
|
// --------------------------------------------------------------------------
|
|
|
|
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
|
|
/** \param numel Number of elements (not bytes) to copy
|
|
* \param cast_buffer Buffer on host with enough storage for casting
|
|
* - If the data types for the two matrices are same, no cast performed
|
|
* - Padding for 2D matrices is not considered in this routine.
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2, class mat3>
|
|
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer, command_queue &cq) {
|
|
#ifdef UCL_DEBUG
|
|
assert(dst.numel()>=numel && src.numel()>=numel);
|
|
assert(cast_buffer.numel()>=numel);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
#endif
|
|
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
|
|
ucl_copy(dst,src,numel,cq);
|
|
else {
|
|
#ifdef UCL_DEBUG
|
|
_check_ucl_copy_perm(dst,src);
|
|
#endif
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
|
|
cast_buffer,cq);
|
|
}
|
|
}
|
|
|
|
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
|
|
/** \param numel Number of elements (not bytes) to copy
|
|
* \param async Perform non-blocking copy on default stream
|
|
* \param cast_buffer Buffer on host with enough storage for casting
|
|
* - If the data types for the two matrices are same, no cast performed
|
|
* - Padding for 2D matrices is not considered in this routine.
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2, class mat3>
|
|
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|
mat3 &cast_buffer, const bool async) {
|
|
#ifdef UCL_DEBUG
|
|
assert(dst.numel()>=numel && src.numel()>=numel);
|
|
assert(cast_buffer.numel()>=numel);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
_check_ucl_copy_perm(dst,src);
|
|
#endif
|
|
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
|
|
ucl_copy(dst,src,numel,async);
|
|
else if (async)
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
|
|
cast_buffer,dst.cq());
|
|
else
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
|
|
cast_buffer);
|
|
}
|
|
|
|
/// Asynchronous copy of matrix/vector (memory already allocated)
|
|
/** \param numel Number of elements (not bytes) to copy
|
|
* - If the data types of the two matrices are not the same,
|
|
* casting will be performed automatically as long as the copy is
|
|
* not device to device. For host/device transfers, a temporary
|
|
* buffer is created for copy. When multiple casts occur, it is
|
|
* more efficient to create a permanent casting buffer that can
|
|
* be passed to an alternative copy routine.
|
|
* - Padding for 2D matrices is not considered in this routine.
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2>
|
|
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|
command_queue &cq) {
|
|
#ifdef UCL_DEBUG
|
|
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
_check_ucl_copy_perm(dst,src);
|
|
#endif
|
|
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
|
|
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
|
|
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
|
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
|
|
if (mat1::MEM_TYPE==1) {
|
|
UCL_H_Vec<typename mat2::data_type> cast_buffer;
|
|
cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
|
|
cast_buffer,cq);
|
|
} else {
|
|
UCL_H_Vec<typename mat1::data_type> cast_buffer;
|
|
cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
|
|
cast_buffer,cq);
|
|
}
|
|
} else
|
|
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
|
|
}
|
|
|
|
/// Copy matrix/vector (memory already allocated)
|
|
/** \param numel Number of elements (not bytes) to copy
|
|
* \param async Perform non-blocking copy (ignored for host to host copy)
|
|
* - If the data types of the two matrices are not the same,
|
|
* casting will be performed automatically as long as the copy is
|
|
* not device to device. For host/device transfers, a temporary
|
|
* buffer is created for copy. When multiple casts occur, it is
|
|
* more efficient to create a permanent casting buffer that can
|
|
* be passed to an alternative copy routine.
|
|
* - Padding for 2D matrices is not considered in this routine.
|
|
* - The default stream is used for asynchronous copy
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2>
|
|
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
|
|
const bool async) {
|
|
#ifdef UCL_DEBUG
|
|
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
_check_ucl_copy_perm(dst,src);
|
|
#endif
|
|
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
|
|
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
|
|
else if (async)
|
|
ucl_copy(dst,src,numel,dst.cq());
|
|
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
|
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
|
|
if (mat1::MEM_TYPE==1) {
|
|
UCL_H_Vec<typename mat2::data_type> cast_buffer;
|
|
cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
|
|
cast_buffer);
|
|
} else {
|
|
UCL_H_Vec<typename mat1::data_type> cast_buffer;
|
|
cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
|
|
cast_buffer);
|
|
}
|
|
} else
|
|
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
|
|
}
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - 2D COPY - SPECIFIED NUMBER OF ROWS/COLS
|
|
// --------------------------------------------------------------------------
|
|
|
|
/// Asynchronous copy subset matrix rows/cols with cast (Device/Host transfer)
|
|
/** \param async Perform non-blocking copy on default stream
|
|
* \param cast_buffer Buffer on host with enough storage for casting
|
|
* - If src is a vector, routine assumes row-major rows by cols copy
|
|
* - If src is a matrix, routine will copy upper left tile of matrix
|
|
* - If dst is a vector, routine assumes row-major rows by cols copy
|
|
* - If dst is a matrix, routine will copy into left tile of matrix
|
|
* - If the data types for the two matrices are same, no cast performed
|
|
* - Padding for 2D matrices is not considered in this routine.
|
|
* - Copy from vector to matrix and vice versa allowed
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2, class mat3>
|
|
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer,
|
|
const bool async) {
|
|
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
|
|
ucl_copy(dst,src,rows,cols,async);
|
|
else if (async)
|
|
ucl_copy(dst,src,rows,cols,dst.cq());
|
|
else {
|
|
#ifdef UCL_DEBUG
|
|
_check_ucl_copy_perm(dst,src);
|
|
#endif
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
|
|
cast_buffer);
|
|
}
|
|
}
|
|
|
|
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
|
|
/** \param cast_buffer Buffer on host with enough storage for casting
|
|
* - If src is a vector, routine assumes row-major rows by cols copy
|
|
* - If src is a matrix, routine will copy upper left tile of matrix
|
|
* - If dst is a vector, routine assumes row-major rows by cols copy
|
|
* - If dst is a matrix, routine will copy into upper left tile of matrix
|
|
* - If the data types for the two matrices are same, no cast performed
|
|
* - Padding for 2D matrices is not considered in this routine.
|
|
* - Copy from vector to matrix and vice versa allowed
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2, class mat3>
|
|
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, mat3 &cast_buffer,
|
|
command_queue &cq) {
|
|
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
|
|
ucl_copy(dst,src,rows,cols,cq);
|
|
else {
|
|
#ifdef UCL_DEBUG
|
|
_check_ucl_copy_perm(dst,src);
|
|
#endif
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
|
|
cast_buffer,cq);
|
|
}
|
|
}
|
|
|
|
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
|
|
/** - If src is a vector, routine assumes row-major rows by cols copy
|
|
* - If src is a matrix, routine will copy upper left tile of matrix
|
|
* - If dst is a vector, routine assumes row-major rows by cols copy
|
|
* - If dst is a matrix, routine will copy into left tile of matrix
|
|
* - If the data types of the two matrices are not the same,
|
|
* casting will be performed automatically as long as the copy is
|
|
* not device to device. For host/device transfers, a temporary
|
|
* buffer is created for copy. When multiple casts occur, it is
|
|
* more efficient to create a permanent casting buffer that can
|
|
* be passed to an alternative copy routine.
|
|
* - The copy should handle padding for 2D alignment correctly
|
|
* - Copy from vector to matrix and vice versa allowed
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2>
|
|
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, command_queue &cq) {
|
|
#ifdef UCL_DEBUG
|
|
_check_ucl_copy_perm(dst,src);
|
|
#endif
|
|
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
|
|
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
|
|
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
|
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
|
|
if (mat1::MEM_TYPE==1) {
|
|
UCL_H_Vec<typename mat2::data_type> cast_buffer;
|
|
cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
|
|
cast_buffer,cq);
|
|
} else {
|
|
UCL_H_Vec<typename mat1::data_type> cast_buffer;
|
|
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
|
|
cast_buffer,cq);
|
|
}
|
|
// If we are here, at least one of the matrices must have VECTOR=0
|
|
} else if (mat1::VECTOR) {
|
|
#ifdef UCL_DEBUG
|
|
assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
#endif
|
|
ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
|
|
cols*sizeof(typename mat1::data_type),rows,
|
|
cq);
|
|
} else if (mat2::VECTOR) {
|
|
#ifdef UCL_DEBUG
|
|
assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
#endif
|
|
ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
|
|
cols*sizeof(typename mat1::data_type),rows,cq);
|
|
} else {
|
|
#ifdef UCL_DEBUG
|
|
assert(src.rows()>=rows && src.cols()>=cols);
|
|
assert(dst.rows()>=rows && dst.cols()>=cols);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
#endif
|
|
ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
|
|
cols*sizeof(typename mat1::data_type),rows,cq);
|
|
}
|
|
}
|
|
|
|
/// Copy subset of matrix rows,cols (memory already allocated)
|
|
/** \param async Perform non-blocking copy (ignored for host to host copy)
|
|
* - If src is a vector, routine assumes row-major rows by cols copy
|
|
* - If src is a matrix, routine will copy upper left tile of matrix
|
|
* - If dst is a vector, routine assumes row-major rows by cols copy
|
|
* - If dst is a matrix, routine will copy into left tile of matrix
|
|
* - If the data types of the two matrices are not the same,
|
|
* casting will be performed automatically as long as the copy is
|
|
* not device to device. For host/device transfers, a temporary
|
|
* buffer is created for copy. When multiple casts occur, it is
|
|
* more efficient to create a permanent casting buffer that can
|
|
* be passed to an alternative copy routine.
|
|
* - The copy should handle padding for 2D alignment correctly
|
|
* - Copy from vector to matrix and vice versa allowed
|
|
* - The default stream is used for asynchronous copy
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2>
|
|
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
|
|
const size_t cols, const bool async) {
|
|
#ifdef UCL_DEBUG
|
|
_check_ucl_copy_perm(dst,src);
|
|
#endif
|
|
if (async)
|
|
ucl_copy(dst,src,rows,cols,dst.cq());
|
|
else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
|
|
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
|
|
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
|
|
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
|
|
if (mat1::MEM_TYPE==1) {
|
|
UCL_H_Vec<typename mat2::data_type> cast_buffer;
|
|
cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
|
|
cast_buffer);
|
|
} else {
|
|
UCL_H_Vec<typename mat1::data_type> cast_buffer;
|
|
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
|
|
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
|
|
cast_buffer);
|
|
}
|
|
// If we are here, at least one of the matrices must have VECTOR=0
|
|
} else if (mat1::VECTOR) {
|
|
#ifdef UCL_DEBUG
|
|
assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
assert(mat2::VECTOR==0);
|
|
#endif
|
|
ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
|
|
cols*sizeof(typename mat1::data_type),rows);
|
|
} else if (mat2::VECTOR) {
|
|
#ifdef UCL_DEBUG
|
|
assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
assert(mat1::VECTOR==0);
|
|
#endif
|
|
ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
|
|
cols*sizeof(typename mat1::data_type),rows);
|
|
} else {
|
|
#ifdef UCL_DEBUG
|
|
assert(src.rows()>=rows && src.cols()>=cols);
|
|
assert(dst.rows()>=rows && dst.cols()>=cols);
|
|
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
|
|
#endif
|
|
ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
|
|
cols*sizeof(typename mat1::data_type),rows);
|
|
}
|
|
}
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - 1D/2D COPY
|
|
// --------------------------------------------------------------------------
|
|
|
|
/// Copy of a whole matrix/vector with cast (Device/Host transfer)
/** \param async Perform non-blocking copy on default stream
  * \param cast_buffer Buffer on host with enough storage for casting
  *
  * - If both containers hold the same data type, no cast is performed
  *   and the call is forwarded to ucl_copy
  * - The amount of data copied is determined by the entire src, except
  *   when only dst is padded and src is a vector, where the extent of
  *   dst (rows by cols) is used
  * - If a padded container is involved, the copy is forwarded to the
  *   rows/cols overload; otherwise src.numel() elements are copied
  * - Copy from vector to matrix and vice versa allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
                          mat3 &cast_buffer, const bool async) {
  // Same element type on both sides: the cast buffer is not needed
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) {
    ucl_copy(dst,src,async);
    return;
  }

  const bool src_padded = (mat2::PADDED==1);
  const bool dst_padded = (mat1::PADDED==1);

  if (src_padded || (dst_padded && mat2::VECTOR==0)) {
    // src is a matrix here, so its own extent defines the tile
    ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,async);
  } else if (dst_padded) {
    // Only dst is padded and src is a vector: use the extent of dst
    ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,async);
  } else {
    // No padding on either side: plain 1D copy of all src elements
    ucl_cast_copy(dst,src,src.numel(),cast_buffer,async);
  }
}
|
|
|
|
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
|
|
/** \param cast_buffer Buffer on host with enough storage for casting
|
|
* - If the data types for the two matrices are same, no cast performed
|
|
* - The number of bytes copied is determined by entire src data
|
|
* - Padding for 2D matrices is not considered in this routine.
|
|
* - Copy from vector to matrix and vice versa allowed
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2, class mat3>
|
|
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
|
|
mat3 &cast_buffer, command_queue &cq) {
|
|
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
|
|
ucl_copy(dst,src,cq);
|
|
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
|
|
ucl_copy(dst,src,src.rows(),src.cols(),cast_buffer,cq);
|
|
else if (mat1::PADDED==1)
|
|
ucl_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,cq);
|
|
else
|
|
ucl_copy(dst,src,src.numel(),cast_buffer,cq);
|
|
}
|
|
|
|
/// Asynchronous copy of matrix/vector (memory already allocated)
|
|
/** - The number of bytes copied is determined by entire src data
|
|
* - If the data types of the two matrices are not the same,
|
|
* casting will be performed automatically as long as the copy is
|
|
* not device to device. For host/device transfers, a temporary
|
|
* buffer is created for copy. When multiple casts occur, it is
|
|
* more efficient to create a permanent casting buffer that can
|
|
* be passed to an alternative copy routine.
|
|
* - The copy should handle padding for 2D alignment correctly
|
|
* - Copy from vector to matrix and vice versa allowed
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2>
|
|
inline void ucl_copy(mat1 &dst, const mat2 &src, command_queue &cq) {
|
|
if (dst.row_bytes()==src.row_bytes() &&
|
|
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
|
|
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
|
|
ucl_copy(dst,src,src.row_size()*src.rows(),cq);
|
|
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
|
|
ucl_copy(dst,src,src.rows(),src.cols(),cq);
|
|
else if (mat1::PADDED==1)
|
|
ucl_copy(dst,src,dst.rows(),dst.cols(),cq);
|
|
else
|
|
ucl_copy(dst,src,src.numel(),cq);
|
|
}
|
|
|
|
/// Copy matrix/vector (memory already allocated)
|
|
/** \param async Perform non-blocking copy (ignored for host to host copy)
|
|
* - The number of bytes copied is determined by entire src data
|
|
* - If the data types of the two matrices are not the same,
|
|
* casting will be performed automatically as long as the copy is
|
|
* not device to device. For host/device transfers, a temporary
|
|
* buffer is created for copy. When multiple casts occur, it is
|
|
* more efficient to create a permanent casting buffer that can
|
|
* be passed to an alternative copy routine.
|
|
* - The copy should handle padding for 2D alignment correctly
|
|
* - Copy from vector to matrix and vice versa allowed
|
|
* - The default stream is used for asynchronous copy
|
|
* - Currently does not handle textures **/
|
|
template <class mat1, class mat2>
|
|
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
|
|
if (async)
|
|
ucl_copy(dst,src,dst.cq());
|
|
else if (dst.row_bytes()==src.row_bytes() &&
|
|
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
|
|
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
|
|
ucl_copy(dst,src,src.row_size()*src.rows(),async);
|
|
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
|
|
ucl_copy(dst,src,src.rows(),src.cols(),async);
|
|
else if (mat1::PADDED==1)
|
|
ucl_copy(dst,src,dst.rows(),dst.cols(),async);
|
|
else
|
|
ucl_copy(dst,src,src.numel(),async);
|
|
}
|
|
|
|
#endif
|
|
|