opencl convolution kernel support batchsize>1

This commit is contained in:
wangdongxu 2020-10-10 09:49:02 +08:00
parent 265e7b93d5
commit 1a080605e1
3 changed files with 182 additions and 136 deletions

View File

@ -43,20 +43,24 @@ int ConvolutionOpenCLKernel::Init() {
std::set<std::string> build_options; std::set<std::string> build_options;
use_fp16_ = ocl_runtime_->GetFp16Enable(); use_fp16_ = ocl_runtime_->GetFp16Enable();
auto input_tensor = in_tensors_[0];
auto output_tensor = out_tensors_[0];
in_ori_format_ = input_tensor->GetFormat();
out_ori_format_ = output_tensor->GetFormat();
if (op_format_ != Format_NHWC4 && op_format_ != Format_NC4HW4) { if (op_format_ != Format_NHWC4 && op_format_ != Format_NC4HW4) {
MS_LOG(ERROR) << "op_format_ " << op_format_ << " not support!"; MS_LOG(ERROR) << "op_format_ " << op_format_ << " not support!";
return RET_ERROR;
} }
in_ori_format_ = in_tensors_[0]->GetFormat(); input_tensor->SetFormat(op_format_);
out_ori_format_ = out_tensors_[0]->GetFormat(); output_tensor->SetFormat(op_format_);
in_tensors_[0]->SetFormat(op_format_);
out_tensors_[0]->SetFormat(op_format_);
CI_ = in_tensors_[0]->Channel(); batch_size_ = input_tensor->Batch();
IH_ = in_tensors_[0]->Height(); CI_ = input_tensor->Channel();
IW_ = in_tensors_[0]->Width(); IH_ = input_tensor->Height();
CO_ = out_tensors_[0]->Channel(); IW_ = input_tensor->Width();
OH_ = out_tensors_[0]->Height(); CO_ = output_tensor->Channel();
OW_ = out_tensors_[0]->Width(); OH_ = output_tensor->Height();
OW_ = output_tensor->Width();
CI_SLICES_ = UP_DIV(CI_, C4NUM); CI_SLICES_ = UP_DIV(CI_, C4NUM);
CO_SLICES_ = UP_DIV(CO_, C4NUM); CO_SLICES_ = UP_DIV(CO_, C4NUM);
KH_ = param->kernel_h_; KH_ = param->kernel_h_;
@ -68,46 +72,11 @@ int ConvolutionOpenCLKernel::Init() {
TILES_XY_ = TILES_X_ * TILES_Y_; TILES_XY_ = TILES_X_ * TILES_Y_;
use_winograd_ = UseWinograd4x4To6x6(); use_winograd_ = UseWinograd4x4To6x6();
std::vector<int> vpara{IH_,
IW_,
OH_,
OW_,
KH_,
KW_,
CI_SLICES_,
CO_SLICES_,
param->stride_h_,
param->stride_w_,
param->pad_u_,
param->pad_l_,
param->pad_d_,
param->pad_r_};
std::string code_id;
for (auto &iv : vpara) {
code_id += "_" + std::to_string(iv);
}
std::vector<bool> vflag{
use_fp16_,
op_format_ == schema::Format_NC4HW4,
op_format_ == schema::Format_NHWC4,
param->act_type_ == ActType_Relu6,
param->act_type_ == ActType_Relu,
param->pad_u_ || param->pad_d_,
OW_ % 2 == 1,
OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE,
};
unsigned int init_count = 0;
for (size_t i = 0; i < vflag.size(); ++i) {
init_count |= ((unsigned int)vflag[i]) << i;
}
code_id += "_" + std::to_string(init_count);
// build kernel // build kernel
auto code_id = get_code_id();
std::string program_name;
if (use_winograd_) { if (use_winograd_) {
MS_LOG(DEBUG) << "use winograd"; MS_LOG(DEBUG) << "use winograd";
std::string program_name;
program_name = "Winograd4x4To36" + code_id; program_name = "Winograd4x4To36" + code_id;
ocl_runtime_->LoadSource(program_name, CodeGenWinograd4x4To36()); ocl_runtime_->LoadSource(program_name, CodeGenWinograd4x4To36());
ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options); ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);
@ -120,7 +89,7 @@ int ConvolutionOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, CodeGenWinograd36To4x4()); ocl_runtime_->LoadSource(program_name, CodeGenWinograd36To4x4());
ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options); ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
} else { } else {
std::string program_name = "convolution" + code_id; program_name = "Convolution" + code_id;
std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4(); std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_conv_, program_name, "Convolution", build_options); ocl_runtime_->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
@ -281,18 +250,18 @@ int ConvolutionOpenCLKernel::InitBuffer() {
int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
size_t im_dst_x, im_dst_y; size_t im_dst_x, im_dst_y;
if (in_tensors_[0]->GetFormat() == Format_NHWC4) { if (in_tensors_[0]->GetFormat() == Format_NHWC4) {
if (out_tensors_[0]->Width() * CO_SLICES_ <= MAX_IMAGE2D_SIZE) { if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
{ {
im_dst_x = out_tensors_[0]->Width() * CO_SLICES_; im_dst_y = batch_size_ * OH_;
im_dst_y = out_tensors_[0]->Height(); im_dst_x = OW_ * CO_SLICES_;
} }
} else { } else {
im_dst_x = out_tensors_[0]->Height() * CO_SLICES_; im_dst_y = OW_;
im_dst_y = out_tensors_[0]->Width(); im_dst_x = batch_size_ * OH_ * CO_SLICES_;
} }
} else { } else {
im_dst_y = out_tensors_[0]->Height() * CO_SLICES_; im_dst_y = batch_size_ * CO_SLICES_ * OH_;
im_dst_x = out_tensors_[0]->Width(); im_dst_x = OW_;
} }
size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
img_size->clear(); img_size->clear();
@ -374,6 +343,8 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
std::string code; std::string code;
code += "#define CI_TILE 4\n"; code += "#define CI_TILE 4\n";
code += "#define CO_TILE 4\n\n"; code += "#define CO_TILE 4\n\n";
code += "#define N " + std::to_string(batch_size_) + "\n";
code += "#define N_OH " + std::to_string(batch_size_ * OH_) + "\n";
code += "#define CI " + std::to_string(CI_ALIGN) + "\n"; code += "#define CI " + std::to_string(CI_ALIGN) + "\n";
code += "#define IH " + std::to_string(IH_) + "\n"; code += "#define IH " + std::to_string(IH_) + "\n";
code += "#define IW " + std::to_string(IW_) + "\n"; code += "#define IW " + std::to_string(IW_) + "\n";
@ -404,13 +375,22 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
" __global FLT4 *bias)" " __global FLT4 *bias)"
"{\n"; "{\n";
code += " int n_oh = get_global_id(0); // [0, N*OH)\n";
if (batch_size_ == 1) {
code += " #define n 0\n";
code += " int oh = n_oh;\n";
} else {
code += " int n = n_oh / " + std::to_string(OH_) + ";\n";
code += " int oh = n_oh % " + std::to_string(OH_) + ";\n";
}
code += code +=
" int ow = get_global_id(0); // [0, OW)\n" " int ow = get_global_id(1); // [0, OW)\n"
" int oh = get_global_id(1); // [0, OH)\n"
" int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )\n" " int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )\n"
"\n" "\n"
" if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n" " if (n_oh >= N_OH || ow >= OW || co_slice >= CO_SLICES) {\n"
" return;\n" " return;\n"
" }\n"
"\n" "\n"
" FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" " FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
" __global FLT4 *w0_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE;\n"; " __global FLT4 *w0_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE;\n";
@ -426,7 +406,10 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
" {\n" " {\n"
" for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
" {\n"; " {\n";
code += "FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n";
code +=
"FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, n * IH + ih)); // NHWC4: NH WC\n\n";
code += code +=
" out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n" " out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n"
" out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n" " out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n"
@ -448,11 +431,10 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
} else if (param->act_type_ == ActType_Relu6) { } else if (param->act_type_ == ActType_Relu6) {
code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n"; code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
} }
if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) { if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}"; code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4_bias);// NHWC4: NH WC\n}";
} else { } else {
code += " WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}"; code += " WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4_bias);\n}";
} }
return code; return code;
} }
@ -480,14 +462,26 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
" __global FLT4 *bias,\n" " __global FLT4 *bias,\n"
" const int4 input_shape,\n" " const int4 input_shape,\n"
" const int4 output_shape)\n" " const int4 output_shape)\n"
"{\n" "{\n";
" int ow = get_global_id(0) * 2;\n"
" int oh = get_global_id(1);\n" code += " int n_oh = get_global_id(0); // [0, N*OH)\n";
if (batch_size_ == 1) {
code += " #define n 0\n";
code += " int oh = n_oh;\n";
} else {
code += " int n = n_oh / " + std::to_string(OH_) + ";\n";
code += " int oh = n_oh % " + std::to_string(OH_) + ";\n";
}
code +=
" int ow = get_global_id(1) * 2;\n"
" int co_slice = get_global_id(2);\n" " int co_slice = get_global_id(2);\n"
"\n" "\n"
" int CI_SLICES = input_shape.w;\n" " int CI_SLICES = input_shape.w;\n"
" int CO_SLICES = output_shape.w;\n\n"; " int CO_SLICES = output_shape.w;\n\n";
code += " #define N " + std::to_string(batch_size_) + "\n";
code += " #define N_OH " + std::to_string(batch_size_ * OH_) + "\n";
code += " #define IH " + std::to_string(IH_) + "\n"; code += " #define IH " + std::to_string(IH_) + "\n";
code += " #define IW " + std::to_string(IW_) + "\n"; code += " #define IW " + std::to_string(IW_) + "\n";
code += " #define OH " + std::to_string(OH_) + "\n"; code += " #define OH " + std::to_string(OH_) + "\n";
@ -500,8 +494,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
code += " #define padLeft " + std::to_string(padLeft) + "\n\n"; code += " #define padLeft " + std::to_string(padLeft) + "\n\n";
code += code +=
" if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n" " if (n_oh >= N_OH || ow >= OW || co_slice >= CO_SLICES) {\n"
" return;\n\n"; " return;\n"
" }\n";
bool check_ow = (OW_ % 2) == 1; bool check_ow = (OW_ % 2) == 1;
if (check_ow) { if (check_ow) {
@ -539,12 +534,12 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
" int iw1 = kw + (ow + 1) * strideW - padLeft;\n" " int iw1 = kw + (ow + 1) * strideW - padLeft;\n"
" for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
" {\n" " {\n"
" FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, ci_slice * IH + ih));\n" " FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, (n * CI_SLICES + ci_slice) * IH + ih));\n"
" out0 += w[0] * in0.x;\n" " out0 += w[0] * in0.x;\n"
" out0 += w[1] * in0.y;\n" " out0 += w[1] * in0.y;\n"
" out0 += w[2] * in0.z;\n" " out0 += w[2] * in0.z;\n"
" out0 += w[3] * in0.w;\n" " out0 += w[3] * in0.w;\n"
" FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(iw1, ci_slice * IH + ih));\n" " FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(iw1, (n * CI_SLICES + ci_slice) * IH + ih));\n"
" out1 += w[0] * in1.x;\n" " out1 += w[0] * in1.x;\n"
" out1 += w[1] * in1.y;\n" " out1 += w[1] * in1.y;\n"
" out1 += w[2] * in1.z;\n" " out1 += w[2] * in1.z;\n"
@ -558,7 +553,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
" {\n" " {\n"
" for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
" {\n" " {\n"
" FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, ci_slice * IH + ih));\n" " FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(iw0, (n * CI_SLICES + ci_slice) * IH + ih));\n"
" out0 += w[0] * in0.x;\n" " out0 += w[0] * in0.x;\n"
" out0 += w[1] * in0.y;\n" " out0 += w[1] * in0.y;\n"
" out0 += w[2] * in0.z;\n" " out0 += w[2] * in0.z;\n"
@ -585,7 +580,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
} else if (param->act_type_ == ActType_Relu6) { } else if (param->act_type_ == ActType_Relu6) {
code += " out0 = clamp(out0, (FLT4)(0.0f), (FLT4)(6.0f));\n"; code += " out0 = clamp(out0, (FLT4)(0.0f), (FLT4)(6.0f));\n";
} }
code += " WRITE_IMAGE(output, (int2)(ow + 0, co_slice * OH + oh), out0);\n"; code += " WRITE_IMAGE(output, (int2)(ow + 0, (n * CO_SLICES + co_slice) * OH + oh), out0);\n";
if (check_ow) { if (check_ow) {
code += code +=
@ -598,7 +593,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
} else if (param->act_type_ == ActType_Relu6) { } else if (param->act_type_ == ActType_Relu6) {
code += " out1 = clamp(out1, (FLT4)(0.0f), (FLT4)(6.0f));\n"; code += " out1 = clamp(out1, (FLT4)(0.0f), (FLT4)(6.0f));\n";
} }
code += " WRITE_IMAGE(output, (int2)(ow + 1, co_slice * OH + oh), out1);\n"; code += " WRITE_IMAGE(output, (int2)(ow + 1, (n * CO_SLICES + co_slice) * OH + oh), out1);\n";
if (check_ow) { if (check_ow) {
code += "}\n"; code += "}\n";
} }
@ -860,7 +855,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime_->Device())()); size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime_->Device())());
const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]); const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);
size_t global_h = UP_DIV(OH_, work_group_size[0]) * work_group_size[0]; size_t global_nh = UP_DIV(batch_size_ * OH_, work_group_size[0]) * work_group_size[0];
size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1]; size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2]; size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];
@ -871,9 +866,9 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
} }
size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c; size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c;
size_t local_w = std::min(global_w, local_hw_size); size_t local_w = std::min(global_w, local_hw_size);
size_t local_h = std::min(local_hw_size / local_w, global_h); size_t local_nh = std::min(local_hw_size / local_w, global_nh);
if (local_h == global_h && global_h % 2 == 0) { if (local_nh == global_nh && global_nh % 2 == 0) {
local_h = global_h / 2; local_nh = global_nh / 2;
} }
if (op_format_ == Format_NHWC4) { if (op_format_ == Format_NHWC4) {
@ -883,22 +878,21 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
} }
global->clear(); global->clear();
global->push_back(UP_DIV(batch_size_ * OH_, local_nh) * local_nh);
global->push_back(UP_DIV(OW_, local_w) * local_w); global->push_back(UP_DIV(OW_, local_w) * local_w);
global->push_back(UP_DIV(OH_, local_h) * local_h);
global->push_back(UP_DIV(CO_SLICES_, local_c) * local_c); global->push_back(UP_DIV(CO_SLICES_, local_c) * local_c);
local->clear(); local->clear();
local->push_back(local_nh);
local->push_back(local_w); local->push_back(local_w);
local->push_back(local_h);
local->push_back(local_c); local->push_back(local_c);
if (op_format_ == Format_NC4HW4) { if (op_format_ == Format_NC4HW4) {
// calculate 2 FLT4 along width per work-item // calculate 2 FLT4 along width per work-item
global->at(0) = UP_DIV(global->at(0), 2); global->at(1) = UP_DIV(global->at(1), 2);
if (local->at(0) > global->at(0)) { if (local->at(1) > global->at(1)) {
local->at(0) = global->at(0); local->at(1) = global->at(1);
} }
} }
return RET_OK; return RET_OK;
} }

View File

@ -41,6 +41,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
private: private:
bool use_fp16_ = false; bool use_fp16_ = false;
int batch_size_{};
int CI_{}; int CI_{};
int IH_{}; int IH_{};
int IW_{}; int IW_{};
@ -84,11 +85,40 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 && const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 &&
param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 && param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 &&
param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ && param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ &&
IW_ == OW_; IW_ == OW_ && batch_size_ == 1;
const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12; const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;
const bool hw_good = TILES_X_ * TILES_Y_ >= 16; const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
return attr_valid && channel_good && hw_good; return attr_valid && channel_good && hw_good;
} }
std::string get_code_id() {
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
std::vector<int> vpara{batch_size_,
CI_,
IH_,
IW_,
CO_,
OH_,
OW_,
KH_,
KW_,
param->stride_h_,
param->stride_w_,
param->pad_u_,
param->pad_l_,
param->pad_d_,
param->pad_r_,
param->dilation_h_,
param->dilation_w_,
use_fp16_,
op_format_,
param->act_type_};
std::string code_id;
for (auto &iv : vpara) {
code_id += "_" + std::to_string(iv);
}
return code_id;
}
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel

View File

@ -96,7 +96,7 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
for (int l = 0; l < L; ++l) { for (int l = 0; l < L; ++l) {
for (int m = 0; m < M; ++m) { for (int m = 0; m < M; ++m) {
auto err = std::fabs(output_data[cn] - expect_data[cn]); auto err = std::fabs(output_data[cn] - expect_data[cn]);
if (first_err_idx == -1 && max_err > atol) { if (first_err_idx == -1 && err > atol) {
first_err_idx = cn; first_err_idx = cn;
} }
if (err > max_err) { if (err > max_err) {
@ -128,22 +128,10 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
} }
} }
Format get_op_format(Format input_format) { void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, Format op_format,
switch (input_format) { const TypeId data_type, const float atol, const float *input_data, const float *weight_data,
case Format_NHWC: const float *bias_data, const float *expect_data) {
case Format_NHWC4: auto param = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
return Format_NHWC4;
case Format_NCHW:
return Format_NHWC4;
default:
return Format_NC4HW4;
}
}
void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, const TypeId data_type,
const float atol, const float *input_data, const float *weight_data, const float *bias_data,
const float *expect_data) {
auto param = std::make_unique<ConvParameter>();
if (param == nullptr) { if (param == nullptr) {
MS_LOG(ERROR) << "ConvParameter create error."; MS_LOG(ERROR) << "ConvParameter create error.";
return; return;
@ -179,13 +167,15 @@ void TEST_MAIN(const std::string &attr, Format input_format, Format output_forma
LoadData(&bias, bias_data); LoadData(&bias, bias_data);
MS_LOG(DEBUG) << "create OpenCL Kernel"; MS_LOG(DEBUG) << "create OpenCL Kernel";
auto kernel = std::vector<lite::Tensor *> inputs{&input, &weight, &bias};
ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(param.release()), {&input, &weight, &bias}, {&output}); std::vector<lite::Tensor *> outputs{&output};
kernel.SetFormatType(get_op_format(input_format)); auto kernel = std::make_unique<ConvolutionOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
kernel.Init(); kernel->SetFormatType(op_format);
kernel->Init();
MS_LOG(DEBUG) << "create SubGraph"; MS_LOG(DEBUG) << "create SubGraph";
auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel({&input}, {&output}, {&kernel}, {&kernel}, {&kernel}); std::vector<kernel::LiteKernel *> kernels{kernel.release()};
auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel({&input}, {&output}, kernels, kernels, kernels);
if (sub_graph == nullptr) { if (sub_graph == nullptr) {
return; return;
} }
@ -203,8 +193,8 @@ void TEST_MAIN(const std::string &attr, Format input_format, Format output_forma
delete sub_graph; delete sub_graph;
} }
void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, const TypeId data_type, void TEST_MAIN(const std::string &attr, Format input_format, Format output_format, Format op_format,
const float atol, const std::string &data_path) { const TypeId data_type, const float atol, const std::string &data_path) {
auto testcase_path = data_path + "/" + attr + "/"; auto testcase_path = data_path + "/" + attr + "/";
std::map<Format, std::string> format_str{ std::map<Format, std::string> format_str{
{Format_NCHW, "NCHW"}, {Format_NHWC, "NHWC"}, {Format_NHWC4, "NHWC4"}, {Format_NC4HW4, "NC4HW4"}}; {Format_NCHW, "NCHW"}, {Format_NHWC, "NHWC"}, {Format_NHWC4, "NHWC4"}, {Format_NC4HW4, "NC4HW4"}};
@ -227,27 +217,29 @@ void TEST_MAIN(const std::string &attr, Format input_format, Format output_forma
printf("bias [0-3]: %7.3f %7.3f %7.3f\n", bias_data[0], bias_data[1], bias_data[2]); printf("bias [0-3]: %7.3f %7.3f %7.3f\n", bias_data[0], bias_data[1], bias_data[2]);
printf("expect[0-3]: %7.3f %7.3f %7.3f\n", expect_data[0], expect_data[1], expect_data[2]); printf("expect[0-3]: %7.3f %7.3f %7.3f\n", expect_data[0], expect_data[1], expect_data[2]);
TEST_MAIN(attr, input_format, output_format, data_type, atol, input_data, weight_data, bias_data, expect_data); TEST_MAIN(attr, input_format, output_format, op_format, data_type, atol, input_data, weight_data, bias_data,
expect_data);
} }
TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
std::string attr = std::string attr =
"inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_" "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_"
"1x1"; "1x1";
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 2e-6f, "testcases/mobilenetv2_fp32/"); // TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat32, 2e-6f,
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 2e-2f, "testcases/mobilenetv2_fp32/"); // "testcases/mobilenetv2_fp32/"); TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat16,
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 2e-6f, "testcases/mobilenetv2_fp32/"); // 2e-2f, "testcases/mobilenetv2_fp32/");
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 2e-2f, "testcases/mobilenetv2_fp32/"); TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 2e-6f, "testcases/mobilenetv2_fp32/");
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 2e-2f, "testcases/mobilenetv2_fp32/");
} }
TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) { TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) {
std::string attr = std::string attr =
"inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_dilationHW_" "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_dilationHW_"
"1x1"; "1x1";
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/"); // TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/");
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); // TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/");
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/"); TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-4f, "testcases/test_fp32/");
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/");
} }
TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) { TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) {
@ -257,9 +249,12 @@ TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) {
float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
float bias_data[] = {0.0f, 0.0f}; float bias_data[] = {0.0f, 0.0f};
float expect_data[] = {1.0f, 1.0f, 5.0f, 5.0f, 9.0f, 9.0f, 13.0f, 13.0f}; float expect_data[] = {1.0f, 1.0f, 5.0f, 5.0f, 9.0f, 9.0f, 13.0f, 13.0f};
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); expect_data);
TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
} }
TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) { TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) {
std::string attr = std::string attr =
"inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
@ -267,8 +262,10 @@ TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) {
float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
float bias_data[] = {0.0f, 0.0f}; float bias_data[] = {0.0f, 0.0f};
float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f}; float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f};
TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); TEST_MAIN(attr, Format_NCHW, Format_NCHW, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); expect_data);
TEST_MAIN(attr, Format_NCHW, Format_NCHW, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
} }
TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) { TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) {
@ -279,14 +276,14 @@ TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) {
float bias_data[] = {0.0f, 0.0f}; float bias_data[] = {0.0f, 0.0f};
float expect_data[] = {1.0f, 1.0f, 0.0f, 0.0f, 5.0f, 5.0f, 0.0f, 0.0f, float expect_data[] = {1.0f, 1.0f, 0.0f, 0.0f, 5.0f, 5.0f, 0.0f, 0.0f,
9.0f, 9.0f, 0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f}; 9.0f, 9.0f, 0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f};
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data,
expect_data); bias_data, expect_data);
TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data,
expect_data); bias_data, expect_data);
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data,
expect_data); bias_data, expect_data);
TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data,
expect_data); bias_data, expect_data);
} }
TEST_F(TestConvolutionOpenCL, simple_test1) { TEST_F(TestConvolutionOpenCL, simple_test1) {
@ -296,31 +293,56 @@ TEST_F(TestConvolutionOpenCL, simple_test1) {
float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
float bias_data[] = {0.5f, -0.5f}; float bias_data[] = {0.5f, -0.5f};
float expect_data[] = {2.5f, 3.5f, 8.5f, 17.5f, 14.5f, 31.5f, 20.5f, 45.5f}; float expect_data[] = {2.5f, 3.5f, 8.5f, 17.5f, 14.5f, 31.5f, 20.5f, 45.5f};
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); expect_data);
TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
} }
TEST_F(TestConvolutionOpenCL, simple_test2) { TEST_F(TestConvolutionOpenCL, simple_test2) {
std::string attr = std::string attr =
"inputNHWC_1x2x2x2_outputNHWC_1x2x2x1_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; "inputNHWC_1x2x2x2_outputNHWC_1x2x2x1_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
float bias_data[] = {0.0f}; float bias_data[] = {0.0f};
float expect_data[] = {28.0f, 18.0f, 22.0f, 13.0f}; float expect_data[] = {28.0f, 18.0f, 22.0f, 13.0f};
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); expect_data);
TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
} }
TEST_F(TestConvolutionOpenCL, simple_test3) { TEST_F(TestConvolutionOpenCL, simple_test3) {
std::string attr = std::string attr =
"inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}; 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
float bias_data[] = {0.5f, -0.5f}; float bias_data[] = {0.5f, -0.5f};
float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f}; float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f};
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); expect_data);
TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
}
TEST_F(TestConvolutionOpenCL, simple_test3_batch2) {
std::string attr =
"inputNHWC_2x2x2x2_outputNHWC_2x2x2x2_kernelHW_2x2_strideHW_1x1_padTopBottomLeftRight_0x1x0x1_dilationHW_1x1";
float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
float weight_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
float bias_data[] = {0.5f, -0.5f};
float expect_data[] = {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f,
168.5f, 391.5f, 80.5f, 223.5f, 60.5f, 235.5f, 20.5f, 123.5f};
TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
expect_data);
TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
expect_data);
TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data,
bias_data, expect_data);
TEST_MAIN(attr, Format_NHWC, Format_NHWC, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data,
bias_data, expect_data);
} }
} // namespace mindspore } // namespace mindspore