forked from lijiext/lammps
617 lines
23 KiB
C++
617 lines
23 KiB
C++
/***************************************************************************
|
|
nvd_memory.h
|
|
-------------------
|
|
W. Michael Brown
|
|
|
|
CUDA Driver Specific Memory Management and Vector/Matrix Containers
|
|
|
|
__________________________________________________________________________
|
|
This file is part of the Geryon Unified Coprocessor Library (UCL)
|
|
__________________________________________________________________________
|
|
|
|
begin : Thu Jan 21 2010
|
|
copyright : (C) 2010 by W. Michael Brown
|
|
email : brownw@ornl.gov
|
|
***************************************************************************/
|
|
|
|
/* -----------------------------------------------------------------------
|
|
Copyright (2010) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the Simplified BSD License.
|
|
----------------------------------------------------------------------- */
|
|
|
|
#ifndef NVD_MEMORY_H
|
|
#define NVD_MEMORY_H
|
|
|
|
#include <iostream>
|
|
#include <cassert>
|
|
#include <cstring>
|
|
#include "nvd_macros.h"
|
|
#include "ucl_types.h"
|
|
|
|
namespace ucl_cudadr {
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - API Specific Types
|
|
// --------------------------------------------------------------------------
|
|
//typedef dim3 ucl_kernel_dim;
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - API SPECIFIC DEVICE POINTERS
|
|
// --------------------------------------------------------------------------
|
|
typedef CUdeviceptr device_ptr;
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - HOST MEMORY ALLOCATION ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
template <class mat_type, class copy_type>
|
|
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|
const enum UCL_MEMOPT kind) {
|
|
CUresult err=CUDA_SUCCESS;
|
|
if (kind==UCL_RW_OPTIMIZED)
|
|
err=cuMemAllocHost((void **)mat.host_ptr(),n);
|
|
else if (kind==UCL_WRITE_OPTIMIZED)
|
|
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
|
|
else
|
|
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
|
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
|
return UCL_MEMORY_ERROR;
|
|
mat.cq()=cm.cq();
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|
const enum UCL_MEMOPT kind) {
|
|
CUresult err=CUDA_SUCCESS;
|
|
if (kind==UCL_RW_OPTIMIZED)
|
|
err=cuMemAllocHost((void **)mat.host_ptr(),n);
|
|
else if (kind==UCL_WRITE_OPTIMIZED)
|
|
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
|
|
else
|
|
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
|
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
|
return UCL_MEMORY_ERROR;
|
|
mat.cq()=dev.cq();
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
|
|
if (kind!=UCL_NOT_PINNED)
|
|
CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin()));
|
|
else
|
|
free(mat.begin());
|
|
}
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - DEVICE MEMORY ALLOCATION ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
template <class mat_type, class copy_type>
|
|
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|
const enum UCL_MEMOPT kind) {
|
|
CUresult err=cuMemAlloc(&mat.cbegin(),n);
|
|
if (err!=CUDA_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
mat.cq()=cm.cq();
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|
const enum UCL_MEMOPT kind) {
|
|
CUresult err=cuMemAlloc(&mat.cbegin(),n);
|
|
if (err!=CUDA_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
mat.cq()=dev.cq();
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type, class copy_type>
|
|
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
|
|
const size_t cols, size_t &pitch,
|
|
const enum UCL_MEMOPT kind) {
|
|
CUresult err;
|
|
CUDA_INT_TYPE upitch;
|
|
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
|
|
cols*sizeof(typename mat_type::data_type),rows,16);
|
|
pitch=static_cast<size_t>(upitch);
|
|
if (err!=CUDA_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
mat.cq()=cm.cq();
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type, class copy_type>
|
|
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
|
|
const size_t cols, size_t &pitch,
|
|
const enum UCL_MEMOPT kind) {
|
|
CUresult err;
|
|
unsigned upitch;
|
|
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
|
|
cols*sizeof(typename mat_type::data_type),rows,16);
|
|
pitch=static_cast<size_t>(upitch);
|
|
if (err!=CUDA_SUCCESS)
|
|
return UCL_MEMORY_ERROR;
|
|
mat.cq()=d.cq();
|
|
return UCL_SUCCESS;
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline void _device_free(mat_type &mat) {
|
|
CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
|
|
}
|
|
|
|
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
|
|
*ptr=in;
|
|
}
|
|
|
|
template <class numtyp>
|
|
inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
|
|
*ptr=0;
|
|
}
|
|
|
|
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
|
|
const size_t offset, const size_t numsize) {
|
|
*ptr=in+offset*numsize;
|
|
}
|
|
|
|
template <class numtyp>
|
|
inline void _device_view(CUdeviceptr *ptr, numtyp *in,
|
|
const size_t offset, const size_t numsize) {
|
|
*ptr=0;
|
|
}
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - DEVICE IMAGE ALLOCATION ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
template <class mat_type, class copy_type>
|
|
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
|
|
const size_t cols) {
|
|
assert(0==1);
|
|
}
|
|
|
|
template <class mat_type, class copy_type>
|
|
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
|
|
const size_t cols) {
|
|
assert(0==1);
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline void _device_image_free(mat_type &mat) {
|
|
assert(0==1);
|
|
}
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - ZERO ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
inline void _host_zero(void *ptr, const size_t n) {
|
|
memset(ptr,0,n);
|
|
}
|
|
|
|
template <class mat_type>
|
|
inline void _device_zero(mat_type &mat, const size_t n) {
|
|
if (n%32==0)
|
|
CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4));
|
|
else if (n%16==0)
|
|
CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2));
|
|
else
|
|
CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n));
|
|
}
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - HELPER FUNCTIONS FOR MEMCPY ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
|
|
inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
ins.srcXInBytes=0;
|
|
ins.srcY=0;
|
|
ins.srcPitch=spitch;
|
|
ins.dstXInBytes=0;
|
|
ins.dstY=0;
|
|
ins.dstPitch=dpitch;
|
|
ins.WidthInBytes=cols;
|
|
ins.Height=rows;
|
|
}
|
|
|
|
template <int mem> struct _nvd_set_2D_mem;
|
|
template <> struct _nvd_set_2D_mem<1>
|
|
{ static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
|
|
template <> struct _nvd_set_2D_mem<2>
|
|
{ static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
|
|
template <int mem> struct _nvd_set_2D_mem
|
|
{ static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };
|
|
|
|
|
|
// --------------------------------------------------------------------------
|
|
// - MEMCPY ROUTINES
|
|
// --------------------------------------------------------------------------
|
|
|
|
template<int mem1, int mem2> struct _ucl_memcpy;
|
|
|
|
// Both are images
|
|
template<> struct _ucl_memcpy<2,2> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
CUstream &cq) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstArray=dst.cbegin();
|
|
ins.srcArray=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, CUstream &cq) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstArray=dst.cbegin();
|
|
ins.srcArray=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
|
}
|
|
};
|
|
|
|
// Destination is texture, source on device
|
|
template<> struct _ucl_memcpy<2,0> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
CUstream &cq) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstArray=dst.cbegin();
|
|
ins.srcDevice=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, CUstream &cq) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstArray=dst.cbegin();
|
|
ins.srcDevice=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
|
}
|
|
};
|
|
|
|
// Destination is texture, source on host
|
|
template<> struct _ucl_memcpy<2,1> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
CUstream &cq) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstArray=dst.cbegin();
|
|
ins.srcHost=src.begin();
|
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, CUstream &cq) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstArray=dst.cbegin();
|
|
ins.srcHost=src.begin();
|
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
|
}
|
|
};
|
|
|
|
// Source is texture, dest on device
|
|
template<> struct _ucl_memcpy<0,2> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
CUstream &cq) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstDevice=dst.cbegin();
|
|
ins.srcArray=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, CUstream &cq) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstDevice=dst.cbegin();
|
|
ins.srcArray=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
|
}
|
|
};
|
|
|
|
// Source is texture, dest on host
|
|
template<> struct _ucl_memcpy<1,2> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
CUstream &cq) {
|
|
assert(0==1);
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstHost=dst.begin();
|
|
ins.srcArray=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, CUstream &cq) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstHost=dst.begin();
|
|
ins.srcArray=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
|
}
|
|
};
|
|
|
|
// Neither are textures, destination on host
|
|
template <> struct _ucl_memcpy<1,0> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
|
|
CU_SAFE_CALL(cuMemcpyDtoH(dst.begin(),src.cbegin(),n));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
CUstream &cq) {
|
|
CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstHost=dst.begin();
|
|
ins.srcDevice=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, CUstream &cq) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstHost=dst.begin();
|
|
ins.srcDevice=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
|
}
|
|
};
|
|
|
|
// Neither are textures, source on host
|
|
template <> struct _ucl_memcpy<0,1> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
|
|
CU_SAFE_CALL(cuMemcpyHtoD(dst.cbegin(),src.begin(),n));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
CUstream &cq) {
|
|
CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstDevice=dst.cbegin();
|
|
ins.srcHost=src.begin();
|
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, CUstream &cq) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstDevice=dst.cbegin();
|
|
ins.srcHost=src.begin();
|
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
|
}
|
|
};
|
|
|
|
// Neither are textures, both on host
|
|
template <> struct _ucl_memcpy<1,1> {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n)
|
|
{ memcpy(dst.begin(),src.begin(),n); }
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
CUstream &cq)
|
|
{ memcpy(dst.begin(),src.begin(),n); }
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstHost=dst.begin();
|
|
ins.srcHost=src.begin();
|
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, CUstream &cq) {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstHost=dst.begin();
|
|
ins.srcHost=src.begin();
|
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
|
}
|
|
};
|
|
|
|
// Neither are textures, both on device
|
|
template <int mem1, int mem2> struct _ucl_memcpy {
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
|
|
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
|
CUstream &cq) {
|
|
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
if (p1::PADDED==0 || p2::PADDED==0) {
|
|
size_t src_offset=0, dst_offset=0;
|
|
for (size_t i=0; i<rows; i++) {
|
|
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
|
|
src.cbegin()+src_offset,cols));
|
|
src_offset+=spitch;
|
|
dst_offset+=dpitch;
|
|
}
|
|
} else {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstDevice=dst.cbegin();
|
|
ins.srcDevice=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
|
}
|
|
}
|
|
template <class p1, class p2>
|
|
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows, CUstream &cq) {
|
|
if (p1::PADDED==0 || p2::PADDED==0) {
|
|
size_t src_offset=0, dst_offset=0;
|
|
for (size_t i=0; i<rows; i++) {
|
|
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
|
|
src.cbegin()+src_offset,cols,cq));
|
|
src_offset+=spitch;
|
|
dst_offset+=dpitch;
|
|
}
|
|
} else {
|
|
CUDA_MEMCPY2D ins;
|
|
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
|
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
|
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
|
ins.dstDevice=dst.cbegin();
|
|
ins.srcDevice=src.cbegin();
|
|
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
|
}
|
|
}
|
|
};
|
|
|
|
template<class mat1, class mat2>
|
|
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
|
|
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n);
|
|
}
|
|
|
|
template<class mat1, class mat2>
|
|
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
|
|
CUstream &cq) {
|
|
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,cq);
|
|
}
|
|
|
|
template<class mat1, class mat2>
|
|
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows) {
|
|
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
|
|
rows);
|
|
}
|
|
|
|
template<class mat1, class mat2>
|
|
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
|
|
const size_t spitch, const size_t cols,
|
|
const size_t rows,CUstream &cq) {
|
|
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
|
|
rows,cq);
|
|
}
|
|
|
|
} // namespace ucl_cudart
|
|
|
|
#endif
|
|
|