Fix OpenCL allocator memory leak

This commit is contained in:
wandongdong 2020-09-18 19:53:20 -07:00
parent 3f2650af5b
commit a94bccb43d
2 changed files with 68 additions and 49 deletions

View File

@ -82,6 +82,8 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
void *host_ptr = nullptr;
void *device_ptr = nullptr;
void *image_ptr = nullptr;
cl::Buffer *buffer = nullptr;
cl::Image2D *image = nullptr;
if (svm_capabilities) {
cl_svm_mem_flags flags = (svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0;
@ -90,7 +92,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
host_ptr = clSVMAlloc((*ocl_runtime_->Context())(), flags, size, 0);
} else {
cl_int ret = CL_SUCCESS;
cl::Buffer *buffer = new (std::nothrow)
buffer = new (std::nothrow)
cl::Buffer(*ocl_runtime_->Context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
if (buffer == nullptr || ret != CL_SUCCESS) {
UnLock();
@ -100,6 +102,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
device_ptr = static_cast<void *>(buffer);
host_ptr = ocl_runtime_->MapBuffer(*buffer, CL_MAP_READ | CL_MAP_WRITE, size);
if (host_ptr == nullptr) {
delete buffer;
UnLock();
MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << device_ptr << ", host_ptr=" << host_ptr;
return nullptr;
@ -108,7 +111,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
ocl_runtime_->UnmapBuffer(*mem, host_ptr);
if (!img_size.empty()) {
cl::ImageFormat image_format(CL_RGBA, img_size[2]);
cl::Image2D *image = new (std::nothrow) cl::Image2D(*ocl_runtime_->Context(), image_format, *buffer, img_size[0],
image = new (std::nothrow) cl::Image2D(*ocl_runtime_->Context(), image_format, *buffer, img_size[0],
img_size[1], img_pitch * dtype_size, &ret);
if (image == nullptr || ret != CL_SUCCESS) {
delete buffer;
@ -120,17 +123,22 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
image_ptr = static_cast<void *>(image);
}
}
std::unique_ptr<MemBuf> mem_buf = std::make_unique<MemBuf>();
MemBuf *mem_buf = new (std::nothrow) MemBuf;
if (mem_buf == nullptr) {
delete buffer;
delete image;
return nullptr;
}
mem_buf->size_ = size;
mem_buf->device_ptr_ = device_ptr;
mem_buf->host_ptr_ = host_ptr;
mem_buf->image_ptr_ = image_ptr;
mem_buf->img_size = img_size;
std::string type_name = img_size.empty() ? "buffer" : "Image2D";
allocated_list_[host_ptr] = mem_buf;
UnLock();
MS_LOG(DEBUG) << "Malloc a new " << type_name << ". size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_
<< ", device addr: " << mem_buf->device_ptr_ << ", image_addr: " << image_ptr;
allocated_list_[host_ptr] = mem_buf.release();
UnLock();
return host_ptr;
}
@ -175,22 +183,27 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v
std::vector<size_t> region{img_size[0], img_size[1], 1};
host_ptr = ocl_runtime_->MapBuffer(*image, 0, CL_MAP_READ | CL_MAP_WRITE, region);
if (host_ptr == nullptr) {
MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << device_ptr << ", host_ptr=" << host_ptr;
delete image;
UnLock();
MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << device_ptr << ", host_ptr=" << host_ptr;
return nullptr;
}
cl::Memory *mem = image;
ocl_runtime_->UnmapBuffer(*mem, host_ptr);
std::unique_ptr<MemBuf> mem_buf = std::make_unique<MemBuf>();
MemBuf *mem_buf = new (std::nothrow) MemBuf;
if (mem_buf == nullptr) {
delete image;
return nullptr;
}
mem_buf->size_ = size;
mem_buf->device_ptr_ = device_ptr;
mem_buf->image_ptr_ = image_ptr;
mem_buf->host_ptr_ = host_ptr;
mem_buf->img_size = img_size;
allocated_list_[host_ptr] = mem_buf;
UnLock();
MS_LOG(DEBUG) << "Malloc a new Image2D. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_
<< ", device addr: " << mem_buf->device_ptr_ << ", image addr: " << mem_buf->image_ptr_;
allocated_list_[host_ptr] = mem_buf.release();
UnLock();
return host_ptr;
}
void OpenCLAllocator::Free(void *buf) {
@ -268,6 +281,7 @@ void OpenCLAllocator::Clear() {
it->second->image_ptr_ = nullptr;
}
}
delete it->second;
}
allocated_list_.clear();
@ -289,6 +303,7 @@ void OpenCLAllocator::Clear() {
it->second->image_ptr_ = nullptr;
}
}
delete it->second;
}
free_list_.clear();
UnLock();

View File

@ -36,6 +36,9 @@ void DepthWiseTestMain(ConvParameter *conv_param, T2 *input_data, T1 *weight_dat
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
auto allocator = ocl_runtime->GetAllocator();
if (dtype == kNumberTypeFloat16) {
ocl_runtime->SetFp16Enable(true);
}
// pack input
int IC4 = UP_DIV(conv_param->input_channel_, C4NUM);
@ -101,7 +104,7 @@ void DepthWiseTestMain(ConvParameter *conv_param, T2 *input_data, T1 *weight_dat
pKernel->SetFormatType(format);
pKernel->Init();
std::vector<kernel::LiteKernel *> kernels{pKernel.get()};
std::vector<kernel::LiteKernel *> kernels{pKernel.release()};
std::vector<lite::Tensor *> inputs_{&tensor_a};
auto pGraph = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_, outputs, kernels, kernels, kernels);
if (pGraph.get() == nullptr) {
@ -117,18 +120,18 @@ void DepthWiseTestMain(ConvParameter *conv_param, T2 *input_data, T1 *weight_dat
pGraph->Run();
if (is_compare) {
T2 *packed_output = reinterpret_cast<T2 *>(outputs[0]->data_c());
auto packed_correct_data = std::make_unique<T2>(packed_output_size);
if (packed_correct_data.get() == nullptr) {
auto packed_correct_data = new (std::nothrow) T2[packed_output_size];
if (packed_correct_data == nullptr) {
delete[] packed_input;
return;
}
memset(packed_correct_data.get(), 0, packed_output_size * sizeof(T2));
memset(packed_correct_data, 0, packed_output_size * sizeof(T2));
if (format == schema::Format_NC4HW4) {
kernel::PackNHWCToNC4HW4<T2, T2>(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
kernel::PackNHWCToNC4HW4<T2, T2>(gnd_data, packed_correct_data, conv_param->output_batch_,
conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_,
to_dtype);
} else {
kernel::PackNHWCToNHWC4<T2, T2>(gnd_data, packed_correct_data.get(), conv_param->output_batch_,
kernel::PackNHWCToNHWC4<T2, T2>(gnd_data, packed_correct_data, conv_param->output_batch_,
conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_,
to_dtype);
}
@ -153,22 +156,25 @@ void DepthWiseTestMain(ConvParameter *conv_param, T2 *input_data, T1 *weight_dat
std::cout << std::endl;
printf("==================expected output data=================\n");
for (int i = 0; i < packed_output_size; i++) {
std::cout << packed_correct_data.get()[i] << ", ";
std::cout << packed_correct_data[i] << ", ";
}
std::cout << std::endl;
// compare
CommonTest::CompareOutputData<T2>(packed_output, packed_correct_data.get(), packed_output_size, err_max);
CommonTest::CompareOutputData<T2>(packed_output, packed_correct_data, packed_output_size, err_max);
delete [] packed_correct_data;
}
inputs[1]->SetData(nullptr);
inputs[2]->SetData(nullptr);
delete[] packed_input;
lite::opencl::OpenCLRuntime::DeleteInstance();
inputs[0]->SetData(nullptr);
outputs[0]->SetData(nullptr);
return;
}
TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
auto conv_param = std::make_unique<ConvParameter>();
auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
{
conv_param->input_batch_ = 1;
conv_param->input_h_ = 4;
@ -209,11 +215,11 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686,
2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4);
}
TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
auto conv_param = std::make_unique<ConvParameter>();
auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
{
conv_param->input_batch_ = 1;
conv_param->input_h_ = 3;
@ -281,11 +287,11 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933,
1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203};
DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4);
}
TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
auto conv_param = std::make_unique<ConvParameter>();
auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
{
conv_param->input_batch_ = 1;
conv_param->input_h_ = 4;
@ -326,12 +332,12 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686,
2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, gnd_data, schema::Format_NHWC4);
// delete conv_param;
}
TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
auto conv_param = std::make_unique<ConvParameter>();
auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
{
conv_param->input_batch_ = 1;
conv_param->input_h_ = 3;
@ -399,11 +405,11 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933,
1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203};
DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, gnd_data, schema::Format_NHWC4);
}
TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) {
auto conv_param = std::make_unique<ConvParameter>();
auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
{
conv_param->input_batch_ = 1;
conv_param->input_h_ = 4;
@ -446,12 +452,12 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) {
2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true);
DepthWiseTestMain<float16_t, float16_t>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
DepthWiseTestMain<float16_t, float16_t>(conv_param, input_data, weight_data, gnd_data, schema::Format_NHWC4,
kNumberTypeFloat16, true, 1e-2);
}
TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp16) {
auto conv_param = std::make_unique<ConvParameter>();
auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
{
conv_param->input_batch_ = 1;
conv_param->input_h_ = 3;
@ -519,8 +525,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp16) {
0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933,
1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203};
lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true);
DepthWiseTestMain<float16_t, float16_t>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
DepthWiseTestMain<float16_t, float16_t>(conv_param, input_data, weight_data, gnd_data, schema::Format_NHWC4,
kNumberTypeFloat16, true, 1e-2);
}
@ -565,31 +570,30 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2Fp32) {
printf("========profiling depthwise, in shape(%d,%d,%d,%d), out shape(%d,%d,%d,%d), iter%d========\n",
src_shape[i][0], src_shape[i][1], src_shape[i][2], src_shape[i][3], dst_shape[i][0], dst_shape[i][1],
dst_shape[i][2], dst_shape[i][3], j);
auto conv_param = ConvParameter();
auto conv_param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
{
conv_param.input_batch_ = 1;
conv_param.input_h_ = src_shape[i][2];
conv_param.input_w_ = src_shape[i][3];
conv_param.input_channel_ = src_shape[i][1];
conv_param.output_batch_ = 1;
conv_param.output_h_ = dst_shape[i][2];
conv_param.output_w_ = dst_shape[i][3];
conv_param.output_channel_ = dst_shape[i][1];
conv_param.kernel_h_ = filter_shape[i][1];
conv_param.kernel_w_ = filter_shape[i][2];
conv_param.stride_h_ = conv_param.output_h_ / conv_param.input_h_;
conv_param.stride_w_ = conv_param.output_w_ / conv_param.input_w_;
conv_param.pad_u_ = (conv_param.kernel_h_ - 1) / 2;
conv_param.pad_l_ = (conv_param.kernel_w_ - 1) / 2;
conv_param.dilation_h_ = 1;
conv_param.dilation_w_ = 1;
conv_param->input_batch_ = 1;
conv_param->input_h_ = src_shape[i][2];
conv_param->input_w_ = src_shape[i][3];
conv_param->input_channel_ = src_shape[i][1];
conv_param->output_batch_ = 1;
conv_param->output_h_ = dst_shape[i][2];
conv_param->output_w_ = dst_shape[i][3];
conv_param->output_channel_ = dst_shape[i][1];
conv_param->kernel_h_ = filter_shape[i][1];
conv_param->kernel_w_ = filter_shape[i][2];
conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
conv_param->dilation_h_ = 1;
conv_param->dilation_w_ = 1;
}
DepthWiseTestMain<float, float>(&conv_param, input_data, weight_data, nullptr, schema::Format_NHWC4,
DepthWiseTestMain<float, float>(conv_param, input_data, weight_data, nullptr, schema::Format_NHWC4,
kNumberTypeFloat32, false);
}
}
delete[] input_data;
delete[] weight_data;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore