forked from mindspore-Ecosystem/mindspore
!4141 add opencl convolution testcase and support image2d
Merge pull request !4141 from 王东旭/add_conv_testcase_codegen
This commit is contained in:
commit
ea8b3c5dc2
|
@ -18,109 +18,204 @@
|
|||
#include <set>
|
||||
#include <algorithm>
|
||||
#include "src/runtime/kernel/opencl/kernel/convolution.h"
|
||||
#include "src/runtime/kernel/opencl/cl/fp32/convolution.cl.inc"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "include/errorcode.h"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::lite::RET_OK;
|
||||
using mindspore::schema::PrimitiveType_Conv2D;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
int ConvolutionOpenCLKernel::Init() {
|
||||
MS_LOG(INFO) << "ConvolutionOpenCLKernel::Init()";
|
||||
|
||||
if (inputs_[0]->Batch() != 1 || outputs_[0]->Batch() != 1) {
|
||||
MS_LOG(ERROR) << "ConvolutionOpenCLKernel only support Batch=1!";
|
||||
}
|
||||
|
||||
auto io_NHWC = inputs_[0]->GetFormat() == schema::Format_NHWC && outputs_[0]->GetFormat() == schema::Format_NHWC;
|
||||
auto io_NHWC4 = inputs_[0]->GetFormat() == schema::Format_NHWC4 && outputs_[0]->GetFormat() == schema::Format_NHWC4;
|
||||
if (!io_NHWC && !io_NHWC4) {
|
||||
MS_LOG(ERROR) << "input and output data_format is invalid!";
|
||||
}
|
||||
io_dataformat_ = inputs_[0]->GetFormat();
|
||||
|
||||
if (inputs_[1]->GetFormat() != schema::Format_KHWC) {
|
||||
MS_LOG(ERROR) << "weight data_format is invalid!";
|
||||
}
|
||||
|
||||
std::cout << "ConvolutionOpenCLKernel::Init()\n";
|
||||
std::set<std::string> build_options;
|
||||
std::string source = convolution_source_fp32;
|
||||
std::string source = CodeGen();
|
||||
std::string program_name = "convolution";
|
||||
std::string kernel_name = io_NHWC4 ? "convolution_NHWC4_OHWIIO_float8" : "convolution_NHWC_OHWI";
|
||||
std::string kernel_name = "convolution";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
||||
ocl_runtime->LoadSource(program_name, source);
|
||||
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
this->InitBuffer();
|
||||
|
||||
return 0;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
std::string ConvolutionOpenCLKernel::CodeGen() {
|
||||
auto param = reinterpret_cast<ConvParameter *>(opParameter);
|
||||
|
||||
auto input_tensor = inputs_[0];
|
||||
auto output_tensor = outputs_[0];
|
||||
const size_t CI = input_tensor->Channel();
|
||||
const size_t CI_SLICES = UP_DIV(CI, C4NUM);
|
||||
const size_t CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
|
||||
const size_t IH = input_tensor->Height();
|
||||
const size_t IW = input_tensor->Width();
|
||||
const size_t CO = output_tensor->Channel();
|
||||
const size_t CO_SLICES = UP_DIV(CO, C4NUM);
|
||||
const size_t CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
|
||||
const size_t OH = output_tensor->Height();
|
||||
const size_t OW = output_tensor->Width();
|
||||
const size_t KH = param->kernel_h_;
|
||||
const size_t KW = param->kernel_w_;
|
||||
const size_t strideH = param->stride_h_;
|
||||
const size_t strideW = param->stride_w_;
|
||||
const size_t padTop = param->pad_u_;
|
||||
const size_t padBottom = param->pad_d_;
|
||||
const size_t padLeft = param->pad_l_;
|
||||
const size_t padRight = param->pad_r_;
|
||||
|
||||
std::string code;
|
||||
code += "#define CI_TILE 4\n";
|
||||
code += "#define CO_TILE 4\n\n";
|
||||
code += "#define CI " + std::to_string(CI_ALIGN) + "\n";
|
||||
code += "#define IH " + std::to_string(IH) + "\n";
|
||||
code += "#define IW " + std::to_string(IW) + "\n";
|
||||
code += "#define CO " + std::to_string(CO_ALIGN) + "\n";
|
||||
code += "#define OH " + std::to_string(OH) + "\n";
|
||||
code += "#define OW " + std::to_string(OW) + "\n";
|
||||
code += "#define KH " + std::to_string(KH) + "\n";
|
||||
code += "#define KW " + std::to_string(KW) + "\n";
|
||||
code += "#define strideH " + std::to_string(strideH) + "\n";
|
||||
code += "#define strideW " + std::to_string(strideW) + "\n";
|
||||
code += "#define padTop " + std::to_string(padTop) + "\n";
|
||||
code += "#define padBottom " + std::to_string(padBottom) + "\n";
|
||||
code += "#define padLeft " + std::to_string(padLeft) + "\n";
|
||||
code += "#define padRight " + std::to_string(padRight) + "\n";
|
||||
code += "#define CI_SLICES " + std::to_string(CI_SLICES) + "\n";
|
||||
code += "#define CO_SLICES " + std::to_string(CO_SLICES) + "\n\n";
|
||||
|
||||
#ifdef ENABLE_FP16
|
||||
code +=
|
||||
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
|
||||
"#define FLT4 half4\n"
|
||||
"#define READ_FLT4 read_imageh\n"
|
||||
"#define WRITE_FLT4 write_imageh\n\n";
|
||||
#else
|
||||
code +=
|
||||
"#define FLT4 float4\n"
|
||||
"#define READ_FLT4 read_imagef\n"
|
||||
"#define WRITE_FLT4 write_imagef\n\n";
|
||||
#endif
|
||||
|
||||
code += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n\n";
|
||||
|
||||
code +=
|
||||
"__kernel void convolution(__read_only image2d_t input,\n"
|
||||
" __global FLT4 *weight,\n"
|
||||
" __global FLT4 *bias,\n"
|
||||
" __write_only image2d_t output)\n"
|
||||
"{\n";
|
||||
|
||||
code +=
|
||||
" int oh = get_global_id(0); // [0, OH)\n"
|
||||
" int ow = get_global_id(1); // [0, OW)\n"
|
||||
" int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )\n"
|
||||
"\n"
|
||||
" if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
|
||||
" __global FLT4 *w0_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE;\n";
|
||||
|
||||
code +=
|
||||
" for (int kh = 0; kh < KH; ++kh)\n"
|
||||
" {\n"
|
||||
" int ih = kh + oh * strideH - padTop;\n"
|
||||
" for (int kw = 0; kw < KW; ++kw)\n"
|
||||
" {\n"
|
||||
" int iw = kw + ow * strideW - padLeft;\n"
|
||||
" if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)\n"
|
||||
" {\n"
|
||||
" for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
|
||||
" {\n";
|
||||
|
||||
// NHWC4 NHC4W4 NC4HW4
|
||||
code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n";
|
||||
// code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw, ih * CI_SLICES + ci_slice)); // NHC4W4: HC W\n\n";
|
||||
// code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw, ci_slice * IH + ih)); // NC4HW4: CH W\n\n";
|
||||
|
||||
code +=
|
||||
" out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n"
|
||||
" out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n"
|
||||
" out0_c4 += w0_ic1_oc4[2] * in_c4.z;\n"
|
||||
" out0_c4 += w0_ic1_oc4[3] * in_c4.w;\n"
|
||||
" w0_ic1_oc4 += 4;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" w0_ic1_oc4 += 4 * CI_SLICES;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n\n";
|
||||
code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n";
|
||||
|
||||
// NHWC4 NHC4W4 NC4HW4
|
||||
if (OW * CO_SLICES < 65536) {
|
||||
code += " WRITE_FLT4(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}";
|
||||
} else {
|
||||
code += " WRITE_FLT4(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}";
|
||||
}
|
||||
// code += " WRITE_FLT4(output, (int2)(ow, oh * CO_SLICES + co_slice), out0_c4_bias);// NHC4W4: HC W\n}";
|
||||
// code += " WRITE_FLT4(output, (int2)(ow ,co_slice * OH + oh), out0_c4_bias);// NC4HW4: CH W\n}";
|
||||
|
||||
// std::cout << code << std::endl;
|
||||
return code;
|
||||
}
|
||||
|
||||
int ConvolutionOpenCLKernel::InitBuffer() {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto allocator = ocl_runtime->GetAllocator();
|
||||
|
||||
// weight: OHWI -> OHWIIO
|
||||
auto weight_tensor = inputs_[1];
|
||||
auto bias_tensor = inputs_[2];
|
||||
if (io_dataformat_ == schema::Format_NHWC) {
|
||||
packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(weight_tensor->Size()));
|
||||
packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
|
||||
memcpy(packed_weight_, weight_tensor->Data(), weight_tensor->Size());
|
||||
allocator->UnmapBuffer(packed_weight_);
|
||||
|
||||
packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(bias_tensor->Size()));
|
||||
packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
|
||||
memcpy(packed_bias_, bias_tensor->Data(), bias_tensor->Size());
|
||||
allocator->UnmapBuffer(packed_bias_);
|
||||
} else if (io_dataformat_ == schema::Format_NHWC4) {
|
||||
// OHWI -> OHWIIO
|
||||
auto weight_shape = weight_tensor->shape();
|
||||
size_t CO = weight_shape[0];
|
||||
size_t KH = weight_shape[1];
|
||||
size_t KW = weight_shape[2];
|
||||
size_t CI = weight_shape[3];
|
||||
size_t CI_SLICES = UP_DIV(CI, C4NUM);
|
||||
size_t CO_SLICES = UP_DIV(CO, C4NUM);
|
||||
constexpr size_t CI_TILE = C4NUM;
|
||||
constexpr size_t CO_TILE = C4NUM;
|
||||
size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float);
|
||||
|
||||
packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
|
||||
packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
|
||||
memset(packed_weight_, 0x00, packed_weight_size);
|
||||
auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
|
||||
for (int co = 0; co < CO; ++co) {
|
||||
for (int kh = 0; kh < KH; ++kh) {
|
||||
for (int kw = 0; kw < KW; ++kw) {
|
||||
for (int ci = 0; ci < CI; ++ci) {
|
||||
auto co_outer = co / CO_TILE;
|
||||
auto co_inner = co % CO_TILE;
|
||||
auto ci_outer = ci / CI_TILE;
|
||||
auto ci_inner = ci % CI_TILE;
|
||||
packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE +
|
||||
co_inner] = *(weight_data++);
|
||||
}
|
||||
auto weight_shape = weight_tensor->shape();
|
||||
size_t CO = weight_shape[0];
|
||||
size_t KH = weight_shape[1];
|
||||
size_t KW = weight_shape[2];
|
||||
size_t CI = weight_shape[3];
|
||||
size_t CI_SLICES = UP_DIV(CI, C4NUM);
|
||||
size_t CO_SLICES = UP_DIV(CO, C4NUM);
|
||||
constexpr size_t CI_TILE = C4NUM;
|
||||
constexpr size_t CO_TILE = C4NUM;
|
||||
size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float);
|
||||
packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
|
||||
packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
|
||||
memset(packed_weight_, 0x00, packed_weight_size);
|
||||
auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
|
||||
for (int co = 0; co < CO; ++co) {
|
||||
for (int kh = 0; kh < KH; ++kh) {
|
||||
for (int kw = 0; kw < KW; ++kw) {
|
||||
for (int ci = 0; ci < CI; ++ci) {
|
||||
auto co_outer = co / CO_TILE;
|
||||
auto co_inner = co % CO_TILE;
|
||||
auto ci_outer = ci / CI_TILE;
|
||||
auto ci_inner = ci % CI_TILE;
|
||||
packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE +
|
||||
co_inner] = *(weight_data++);
|
||||
}
|
||||
}
|
||||
}
|
||||
allocator->UnmapBuffer(packed_weight_);
|
||||
size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float);
|
||||
packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size));
|
||||
packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
|
||||
memset(packed_bias_, 0x00, packed_bias_size);
|
||||
auto bias_data = reinterpret_cast<float *>(bias_tensor->Data());
|
||||
for (int co = 0; co < CO; ++co) {
|
||||
packed_bias_[co] = bias_data[co];
|
||||
}
|
||||
allocator->UnmapBuffer(packed_bias_);
|
||||
}
|
||||
allocator->UnmapBuffer(packed_weight_);
|
||||
|
||||
return 0;
|
||||
// align bias
|
||||
auto bias_tensor = inputs_[2];
|
||||
size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float);
|
||||
packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size));
|
||||
packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
|
||||
memset(packed_bias_, 0x00, packed_bias_size);
|
||||
auto bias_data = reinterpret_cast<float *>(bias_tensor->Data());
|
||||
for (int co = 0; co < CO; ++co) {
|
||||
packed_bias_[co] = bias_data[co];
|
||||
}
|
||||
allocator->UnmapBuffer(packed_bias_);
|
||||
|
||||
return RET_OK;
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
int ConvolutionOpenCLKernel::ReSize() { return 0; }
|
||||
|
||||
static int GetBiggestDivider(int x, int y) {
|
||||
for (int i = y; i != 0; i--) {
|
||||
if (x % i == 0) {
|
||||
|
@ -130,18 +225,19 @@ static int GetBiggestDivider(int x, int y) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
static void GetLocalSize(const ConvParameter *param, std::vector<size_t> *global, std::vector<size_t> *local) {
|
||||
int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local) {
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
auto param = reinterpret_cast<ConvParameter *>(opParameter);
|
||||
|
||||
constexpr size_t work_group_size[] = {4, 4, 1};
|
||||
constexpr size_t max_work_item_sizes[] = {512, 512, 512};
|
||||
constexpr size_t max_work_group_size = 512;
|
||||
auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
|
||||
size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime->Device())());
|
||||
const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);
|
||||
|
||||
// 先用OH OW CO_SLICES初始化global,并且441对齐
|
||||
size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0];
|
||||
size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1];
|
||||
size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2];
|
||||
|
||||
// 使用策略计算local
|
||||
size_t local_c = GetBiggestDivider(global_c, max_z_size);
|
||||
size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c;
|
||||
size_t local_w = std::min(global_w, local_hw_size);
|
||||
|
@ -158,62 +254,53 @@ static void GetLocalSize(const ConvParameter *param, std::vector<size_t> *global
|
|||
local->push_back(local_h);
|
||||
local->push_back(local_w);
|
||||
local->push_back(local_c);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
||||
size_t CO_SLICES = UP_DIV(outputs_[0]->Channel(), C4NUM);
|
||||
size_t im_dst_x, im_dst_y;
|
||||
if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
|
||||
if (outputs_[0]->Width() * CO_SLICES < 65536) {
|
||||
{
|
||||
im_dst_x = outputs_[0]->Width() * CO_SLICES;
|
||||
im_dst_y = outputs_[0]->Height();
|
||||
}
|
||||
} else {
|
||||
im_dst_x = outputs_[0]->Height() * CO_SLICES;
|
||||
im_dst_y = outputs_[0]->Width();
|
||||
}
|
||||
} else {
|
||||
im_dst_y = outputs_[0]->Height() * CO_SLICES;
|
||||
im_dst_x = outputs_[0]->Width();
|
||||
}
|
||||
#ifdef ENABLE_FP16
|
||||
size_t img_dtype = CL_HALF_FLOAT;
|
||||
#else
|
||||
size_t img_dtype = CL_FLOAT;
|
||||
#endif
|
||||
img_size->clear();
|
||||
img_size->push_back(im_dst_x);
|
||||
img_size->push_back(im_dst_y);
|
||||
img_size->push_back(img_dtype);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionOpenCLKernel::Run() {
|
||||
MS_LOG(INFO) << "ConvolutionOpenCLKernel::Run()";
|
||||
std::cout << "ConvolutionOpenCLKernel::Run()\n";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
|
||||
auto param = reinterpret_cast<ConvParameter *>(opParameter);
|
||||
auto input0_shape = inputs_[0]->shape(); // NHWC
|
||||
auto input1_shape = inputs_[1]->shape(); // OHWI
|
||||
auto outpu0_shape = outputs_[0]->shape(); // NHWC
|
||||
cl_int N = input0_shape[0];
|
||||
cl_int CI = input0_shape[3];
|
||||
cl_int IH = input0_shape[1];
|
||||
cl_int IW = input0_shape[2];
|
||||
cl_int CO = outpu0_shape[3];
|
||||
cl_int OH = outpu0_shape[1];
|
||||
cl_int OW = outpu0_shape[2];
|
||||
cl_int KH = input1_shape[1];
|
||||
cl_int KW = input1_shape[2];
|
||||
cl_int CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
|
||||
cl_int CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
|
||||
|
||||
cl_int4 input_shape;
|
||||
cl_int4 output_shape;
|
||||
if (io_dataformat_ == schema::Format_NHWC) {
|
||||
input_shape = {N, IH, IW, CI};
|
||||
output_shape = {N, OH, OW, CO};
|
||||
} else if (io_dataformat_ == schema::Format_NHWC4) {
|
||||
input_shape = {N, IH, IW, CI_ALIGN};
|
||||
output_shape = {N, OH, OW, CO_ALIGN};
|
||||
}
|
||||
cl_int4 kernel_stride = {KH, KW, param->stride_h_, param->stride_w_};
|
||||
cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_};
|
||||
|
||||
int arg_cn = 0;
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_weight_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_bias_);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, kernel_stride);
|
||||
ocl_runtime->SetKernelArg(kernel_, arg_cn++, pad);
|
||||
|
||||
std::vector<size_t> global;
|
||||
std::vector<size_t> local;
|
||||
GetLocalSize(reinterpret_cast<ConvParameter *>(this->opParameter), &global, &local);
|
||||
// float8 per thread
|
||||
if (io_dataformat_ == schema::Format_NHWC4) {
|
||||
local[2] = UP_DIV(local[2], 2);
|
||||
global[2] = UP_DIV(global[2], 2);
|
||||
global[2] = UP_DIV(global[2], global[2]) * global[2];
|
||||
}
|
||||
GetGlobalLocal(&global, &local);
|
||||
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
|
||||
|
||||
return 0;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "src/ir/tensor.h"
|
||||
#include "src/runtime/kernel/opencl/opencl_kernel.h"
|
||||
#include "schema/model_generated.h"
|
||||
|
@ -26,23 +27,25 @@
|
|||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
class ConvolutionOpenCLKernel : public LiteKernel {
|
||||
class ConvolutionOpenCLKernel : public OpenCLKernel {
|
||||
public:
|
||||
explicit ConvolutionOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
|
||||
const std::vector<lite::tensor::Tensor *> &outputs)
|
||||
: LiteKernel(parameter, inputs, outputs) {}
|
||||
: OpenCLKernel(parameter, inputs, outputs) {}
|
||||
~ConvolutionOpenCLKernel() override{};
|
||||
|
||||
int Init() override;
|
||||
int ReSize() override;
|
||||
int Run() override;
|
||||
int InitBuffer();
|
||||
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
|
||||
|
||||
private:
|
||||
schema::Format io_dataformat_ = schema::Format_NHWC4;
|
||||
float *packed_weight_ = nullptr;
|
||||
float *packed_bias_ = nullptr;
|
||||
cl::Kernel kernel_;
|
||||
|
||||
std::string CodeGen();
|
||||
int GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local);
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
|
||||
|
|
|
@ -297,6 +297,7 @@ if (SUPPORT_GPU)
|
|||
${TEST_DIR}/ut/src/runtime/kernel/opencl/utils_tests.cc
|
||||
${TEST_DIR}/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
|
||||
${TEST_DIR}/ut/src/runtime/kernel/opencl/transpose_tests.cc
|
||||
${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc
|
||||
)
|
||||
endif()
|
||||
|
||||
|
|
|
@ -0,0 +1,179 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <memory>
|
||||
#include "utils/log_adapter.h"
|
||||
#include "common/common_test.h"
|
||||
#include "mindspore/lite/src/common/file_utils.h"
|
||||
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
|
||||
#include "src/runtime/kernel/arm/nnacl/pack.h"
|
||||
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
|
||||
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
|
||||
|
||||
using mindspore::kernel::ConvolutionOpenCLKernel;
|
||||
using mindspore::kernel::LiteKernel;
|
||||
using mindspore::kernel::SubGraphOpenCLKernel;
|
||||
|
||||
namespace mindspore {
|
||||
|
||||
class TestConvolutionOpenCL : public mindspore::Common {};
|
||||
|
||||
void LoadData(void *dst, size_t dst_size, const std::string &file_path) {
|
||||
if (file_path.empty()) {
|
||||
memset(dst, 0x00, dst_size);
|
||||
} else {
|
||||
auto src_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &dst_size));
|
||||
memcpy(dst, src_data, dst_size);
|
||||
}
|
||||
}
|
||||
|
||||
void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path) {
|
||||
auto *output_data = reinterpret_cast<float *>(output_tensor->Data());
|
||||
printf("output[0:10]:");
|
||||
for (int i = 0; i < 10; i++) {
|
||||
printf("%d:%.3f ", i, output_data[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
size_t output_size = output_tensor->Size();
|
||||
auto expect_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &output_size));
|
||||
constexpr float atol = 0.5;
|
||||
for (int i = 0; i < output_tensor->ElementsNum(); ++i) {
|
||||
if (std::fabs(output_data[i] - expect_data[i]) > atol) {
|
||||
printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]);
|
||||
printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]);
|
||||
printf("error at idx[%d] expect=%.3f output=%.3f\n\n\n", i, expect_data[i], output_data[i]);
|
||||
return;
|
||||
}
|
||||
}
|
||||
printf("compare success!\n");
|
||||
printf("compare success!\n");
|
||||
printf("compare success!\n\n\n");
|
||||
}
|
||||
|
||||
void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::string &input_file,
|
||||
const std::string &weight_file, const std::string &bias_file, const std::string &expect_file) {
|
||||
assert(data_format == schema::Format_NHWC || data_format == schema::Format_NHWC4);
|
||||
|
||||
std::cout << "initialize OpenCLRuntime";
|
||||
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
||||
ocl_runtime->Init();
|
||||
auto allocator = ocl_runtime->GetAllocator();
|
||||
|
||||
std::cout << "create inputs/weights/outputs Tensors(framework do)";
|
||||
std::vector<int> input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_};
|
||||
std::vector<int> weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_};
|
||||
std::vector<int> bias_shape = {param->output_channel_};
|
||||
std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
|
||||
auto data_type = kNumberTypeFloat32;
|
||||
auto tensorType = schema::NodeType_ValueNode;
|
||||
auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, data_format, tensorType);
|
||||
auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensorType);
|
||||
auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensorType);
|
||||
auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, data_format, tensorType);
|
||||
std::vector<lite::tensor::Tensor *> inputs{input_tensor, weight_tensor, bias_tensor};
|
||||
std::vector<lite::tensor::Tensor *> outputs{output_tensor};
|
||||
|
||||
std::cout << "initialize weight Tensors data(framework do)";
|
||||
std::vector<float> weight_vec(weight_tensor->ElementsNum());
|
||||
std::vector<float> bias_vec(weight_tensor->ElementsNum());
|
||||
weight_tensor->SetData(weight_vec.data());
|
||||
bias_tensor->SetData(bias_vec.data());
|
||||
LoadData(weight_tensor->Data(), weight_tensor->Size(), weight_file);
|
||||
LoadData(bias_tensor->Data(), bias_tensor->Size(), bias_file);
|
||||
|
||||
std::cout << "create OpenCL Kernel"; // weight has been allcated by framework
|
||||
auto *conv_kernel = new ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
|
||||
conv_kernel->Init();
|
||||
std::vector<LiteKernel *> kernels{conv_kernel};
|
||||
|
||||
// freamework to do!!! allocate memory by hand
|
||||
inputs[0]->MallocData(allocator);
|
||||
|
||||
std::cout << "create SubGraphOpenCLKernel";
|
||||
auto *sub_graph = new SubGraphOpenCLKernel({input_tensor}, outputs, kernels, kernels, kernels);
|
||||
sub_graph->Init();
|
||||
|
||||
std::cout << "initialize input Tensors data"; // inputs has been allcated by sub_graph->Init()
|
||||
LoadData(input_tensor->Data(), input_tensor->Size(), input_file);
|
||||
printf("input[0] =%.3f\n", reinterpret_cast<float *>(input_tensor->Data())[0]);
|
||||
printf("weight[0]=%.3f\n", reinterpret_cast<float *>(weight_tensor->Data())[0]);
|
||||
printf("bias[0] =%.3f\n", reinterpret_cast<float *>(bias_tensor->Data())[0]);
|
||||
|
||||
std::cout << "sub_graph->Run()";
|
||||
sub_graph->Run();
|
||||
printf("output_tensor->Size() =%zu\n", output_tensor->Size());
|
||||
|
||||
std::cout << "compare result";
|
||||
MyCompareOutput(output_tensor, expect_file);
|
||||
// lite::CompareOutput(reinterpret_cast<float *>(output_tensor->Data()), expect_file);
|
||||
|
||||
mindspore::lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
|
||||
std::array<std::string, 4> GenFilenames(ConvParameter *param, schema::Format data_format, const std::string &path) {
|
||||
auto full_path = path + "inputNHWC_" + std::to_string(param->input_batch_) + "x" + std::to_string(param->input_h_) +
|
||||
"x" + std::to_string(param->input_w_) + "x" + std::to_string(param->input_channel_) +
|
||||
"_outputNHWC_" + std::to_string(param->output_batch_) + "x" + std::to_string(param->output_h_) +
|
||||
"x" + std::to_string(param->output_w_) + "x" + std::to_string(param->output_channel_) +
|
||||
"_kernelHW_" + std::to_string(param->kernel_h_) + "x" + std::to_string(param->kernel_w_) +
|
||||
"_strideHW_" + std::to_string(param->stride_h_) + "x" + std::to_string(param->stride_w_) +
|
||||
"_padTopBottomLeftRight_" + std::to_string(param->pad_u_) + "x" + std::to_string(param->pad_d_) +
|
||||
"x" + std::to_string(param->pad_l_) + "x" + std::to_string(param->pad_r_) + "_dilationHW_1x1/";
|
||||
|
||||
if (data_format == schema::Format_NHWC4) {
|
||||
return std::array<std::string, 4>{full_path + "input_NHWC4.bin", full_path + "weight_OHWI.bin",
|
||||
full_path + "bias_C4.bin", full_path + "expect_NHWC4.bin"};
|
||||
} else {
|
||||
return std::array<std::string, 4>{full_path + "input_NHWC.bin", full_path + "weight_OHWI.bin",
|
||||
full_path + "bias_C.bin", full_path + "expect_NHWC.bin"};
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
|
||||
auto param = new ConvParameter;
|
||||
param->input_batch_ = 1, param->input_h_ = 224, param->input_w_ = 224, param->input_channel_ = 3;
|
||||
param->output_batch_ = 1, param->output_h_ = 112, param->output_w_ = 112, param->output_channel_ = 32;
|
||||
param->kernel_h_ = 3, param->kernel_w_ = 3;
|
||||
param->stride_h_ = 2, param->stride_w_ = 2;
|
||||
param->pad_u_ = 0, param->pad_d_ = 1, param->pad_l_ = 0, param->pad_r_ = 1;
|
||||
|
||||
auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/");
|
||||
// std::cout << filenames[0] << std::endl;
|
||||
// std::cout << filenames[1] << std::endl;
|
||||
// std::cout << filenames[2] << std::endl;
|
||||
// std::cout << filenames[3] << std::endl;
|
||||
TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]);
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
|
||||
TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) {
|
||||
auto param = new ConvParameter;
|
||||
param->input_batch_ = 1, param->input_h_ = 1, param->input_w_ = 64, param->input_channel_ = 512;
|
||||
param->output_batch_ = 1, param->output_h_ = 1, param->output_w_ = 64, param->output_channel_ = 7358;
|
||||
param->kernel_h_ = 1, param->kernel_w_ = 1;
|
||||
param->stride_h_ = 1, param->stride_w_ = 1;
|
||||
param->pad_u_ = 0, param->pad_d_ = 0, param->pad_l_ = 0, param->pad_r_ = 0;
|
||||
|
||||
auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/02_fp32/");
|
||||
// std::cout << filenames[0] << std::endl;
|
||||
// std::cout << filenames[1] << std::endl;
|
||||
// std::cout << filenames[2] << std::endl;
|
||||
// std::cout << filenames[3] << std::endl;
|
||||
TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]);
|
||||
lite::opencl::OpenCLRuntime::DeleteInstance();
|
||||
}
|
||||
|
||||
} // namespace mindspore
|
Loading…
Reference in New Issue