diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
index e5e1c877fc6..1b0085fe898 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
@@ -76,7 +76,9 @@ int ConcatOpenCLKernel::Init() {
   auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
   MS_LOG(DEBUG) << " concat at axis=: " << param->axis_;
-  param->axis_ = (param->axis_ == -1) ? (in_tensors_[0]->shape().size() - 1) : param->axis_;
+  if (param->axis_ < 0) {
+    param->axis_ += in_tensors_.front()->shape().size();
+  }
   if (param->axis_ < 0 || param->axis_ > 3) {
     MS_LOG(ERROR) << " only support axis >= 0 and axis <= 3 ";
     return RET_ERROR;
   }
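The concat change above replaces the old `axis == -1` special case with general negative-axis wrapping. A minimal standalone sketch of that normalization (hypothetical `NormalizeAxis` helper, not part of the patch):

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Hypothetical helper mirroring the normalization in ConcatOpenCLKernel::Init():
// a negative axis counts back from the last dimension, so for a rank-4 tensor
// axis=-1 -> 3 and axis=-3 -> 1. The old code only handled the -1 case.
int NormalizeAxis(int axis, const std::vector<int> &shape) {
  if (axis < 0) {
    axis += static_cast<int>(shape.size());
  }
  return axis;
}

int main() {
  std::vector<int> nhwc{1, 16, 16, 32};
  assert(NormalizeAxis(-1, nhwc) == 3);  // old special case still works
  assert(NormalizeAxis(-3, nhwc) == 1);  // now handled too
  assert(NormalizeAxis(2, nhwc) == 2);   // non-negative axes pass through
  printf("axis normalization ok\n");
  return 0;
}
```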
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index 5cef4ff0024..bddb03a6f82 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -65,6 +65,7 @@ int ConvolutionOpenCLKernel::Init() {
   CO_SLICES_ = UP_DIV(CO_, C4NUM);
   KH_ = param->kernel_h_;
   KW_ = param->kernel_w_;
+  has_bias_ = in_tensors_.size() == 3;
 
   // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
   TILES_X_ = UP_DIV(OW_, 4);
@@ -243,7 +244,9 @@ int ConvolutionOpenCLKernel::InitBias() {
 }
 
 int ConvolutionOpenCLKernel::InitBuffer() {
   InitWeight();
-  InitBias();
+  if (has_bias_) {
+    InitBias();
+  }
   return RET_OK;
 }
@@ -298,7 +301,9 @@ int ConvolutionOpenCLKernel::Run() {
     cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (has_bias_) {
+      ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    }
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
   } else {
@@ -306,7 +311,9 @@ int ConvolutionOpenCLKernel::Run() {
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (has_bias_) {
+      ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    }
     if (op_format_ == Format_NC4HW4) {
       cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
       cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
@@ -372,10 +379,14 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
 
   code +=
     "__kernel void Convolution(__read_only image2d_t input,\n"
-    "                          __write_only image2d_t output,\n"
-    "                          __global FLT4 *weight,\n"
-    "                          __global FLT4 *bias)"
-    "{\n";
+    "                          __write_only image2d_t output,\n";
+  if (has_bias_) {
+    code +=
+      "                          __global FLT4 *weight,\n"
+      "                          __global FLT4 *bias) {\n";
+  } else {
+    code += "                          __global FLT4 *weight) {\n";
+  }
 
   code += "  int n_oh = get_global_id(0);  // [0, N*OH)\n";
   if (batch_size_ == 1) {
@@ -426,17 +437,20 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
     "      }\n"
     "    }\n"
     "  }\n\n";
-  code += "  FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n";
+
+  if (has_bias_) {
+    code += "  out0_c4 = out0_c4 + bias[co_slice];\n";
+  }
   if (param->act_type_ == ActType_Relu) {
-    code += "  out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n";
+    code += "  out0_c4 = max(out0_c4, (FLT4)(0.0f));\n";
   } else if (param->act_type_ == ActType_Relu6) {
-    code += "  out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
+    code += "  out0_c4 = clamp(out0_c4, (FLT4)(0.0f), (FLT4)(6.0f));\n";
   }
   if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
-    code += "  WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4_bias);// NHWC4: NH WC\n}";
+    code += "  WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4);// NHWC4: NH WC\n}";
   } else {
-    code += "  WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4_bias);\n}";
+    code += "  WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4);\n}";
   }
 
   return code;
 }
@@ -460,8 +474,11 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     "\n"
    "__kernel void Convolution(__read_only image2d_t input,\n"
     "                          __write_only image2d_t output,\n"
-    "                          __global FLT4 *weight,\n"
-    "                          __global FLT4 *bias,\n"
+    "                          __global FLT4 *weight,\n";
+  if (has_bias_) {
+    code += "                          __global FLT4 *bias,\n";
+  }
+  code +=
     "                          const int4 input_shape,\n"
     "                          const int4 output_shape)\n"
     "{\n";
@@ -578,7 +595,10 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     "    }\n"
     "  }\n\n";
 
-  code += "  out0 = out0 + bias[co_slice];\n";
+  if (has_bias_) {
+    code += "  out0 = out0 + bias[co_slice];\n";
+  }
+
   if (param->act_type_ == ActType_Relu) {
     code += "  out0 = max(out0, (FLT4)(0.0f));\n";
   } else if (param->act_type_ == ActType_Relu6) {
@@ -591,7 +611,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     "  if (last_is_double)"
     "  {\n";
   }
-  code += "  out1 = out1 + bias[co_slice];\n";
+  if (has_bias_) {
+    code += "  out1 = out1 + bias[co_slice];\n";
+  }
   if (param->act_type_ == ActType_Relu) {
     code += "  out1 = max(out1, (FLT4)(0.0f));\n";
   } else if (param->act_type_ == ActType_Relu6) {
@@ -788,8 +810,11 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
     "};\n"
     "\n"
     "__kernel void Winograd36To4x4(__read_only image2d_t input,\n"
-    "                              __write_only image2d_t output,\n"
-    "                              __global FLT4 *bias,\n"
+    "                              __write_only image2d_t output,\n";
+  if (has_bias_) {
+    code += "                              __global FLT4 *bias,\n";
+  }
+  code +=
     "                              int4 input_shape,      // N 36 H/4*W/4 CO_SLICES\n"
     "                              int4 output_shape)     // N H W CO_SLICES\n"
     "{\n"
@@ -824,9 +849,10 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
     "        for (int y = 0; y < 6; y++)\n"
     "        {\n"
     "          acc += AtM_row[y] * At[x * 6 + y];\n"
-    "        }\n"
-    "        acc += bias[slice];\n"
-    "\n";
+    "        }\n";
+  if (has_bias_) {
+    code += "        acc += bias[slice];\n";
+  }
 
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   if (param->act_type_ == ActType_Relu) {
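Throughout the convolution changes, the same `has_bias_` flag gates two coupled sites: the generated OpenCL kernel signature and the host-side `SetKernelArg` sequence. If they ever disagree, the `arg_cn++` indices drift out of sync with the kernel's parameter list. A toy sketch of that invariant (hypothetical `FakeKernel`/`SetArg` stubs; the real code calls `ocl_runtime_->SetKernelArg`):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Stand-in for the OpenCL kernel: records arguments and checks they are set
// in strictly increasing index order, like clSetKernelArg expects.
struct FakeKernel {
  std::vector<std::string> args;
  void SetArg(int idx, const std::string &name) {
    assert(idx == static_cast<int>(args.size()));  // indices must not skip or repeat
    args.push_back(name);
  }
};

// Number of parameters the generated signature declares: input, output,
// weight, and bias only when has_bias is set.
int CountSignatureParams(bool has_bias) { return has_bias ? 4 : 3; }

int main() {
  for (bool has_bias : {false, true}) {
    FakeKernel kernel;
    int arg_cn = 0;
    kernel.SetArg(arg_cn++, "input");
    kernel.SetArg(arg_cn++, "output");
    kernel.SetArg(arg_cn++, "weight");
    if (has_bias) {
      kernel.SetArg(arg_cn++, "bias");  // only when the signature declares it
    }
    assert(arg_cn == CountSignatureParams(has_bias));  // host and codegen agree
  }
  return 0;
}
```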
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index f7306a548db..f5f3e149ab2 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -35,44 +35,15 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   int Init() override;
   int Run() override;
-  int InitBuffer();
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
 
  private:
-  bool use_fp16_ = false;
-
-  int batch_size_{};
-  int CI_{};
-  int IH_{};
-  int IW_{};
-  int CO_{};
-  int OH_{};
-  int OW_{};
-  int CI_SLICES_{};
-  int CO_SLICES_{};
-  int KH_{};
-  int KW_{};
-  void *packed_weight_ = nullptr;
-  void *packed_bias_ = nullptr;
-
-  bool use_winograd_ = false;
-  int TILES_X_{};
-  int TILES_Y_{};
-  int TILES_XY_{};
-  void *winograd_mem0_ = nullptr;
-  void *winograd_mem1_ = nullptr;
-
-  cl::Kernel kernel_4x4to36_;
-  cl::Kernel kernel_conv_;
-  cl::Kernel kernel_36to4x4_;
-
+  int InitBuffer();
   int InitWeight();
   int InitBias();
   int GenerateWinogradWeight();
-
   std::string CodeGenConvolutionNHWC4();
   std::string CodeGenConvolutionNC4HW4();
-
   std::string CodeGenWinograd4x4To36();
   std::string CodeGenWinogradConvolution();
   std::string CodeGenWinograd36To4x4();
 
@@ -110,6 +81,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
       param->pad_r_,
       param->dilation_h_,
      param->dilation_w_,
+      has_bias_,
       use_fp16_,
       op_format_,
       param->act_type_};
@@ -119,6 +91,34 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
     }
     return code_id;
   }
+
+  bool use_fp16_ = false;
+
+  int batch_size_{};
+  int CI_{};
+  int IH_{};
+  int IW_{};
+  int CO_{};
+  int OH_{};
+  int OW_{};
+  int CI_SLICES_{};
+  int CO_SLICES_{};
+  int KH_{};
+  int KW_{};
+  void *packed_weight_ = nullptr;
+  void *packed_bias_ = nullptr;
+  bool has_bias_ = false;
+
+  bool use_winograd_ = false;
+  int TILES_X_{};
+  int TILES_Y_{};
+  int TILES_XY_{};
+  void *winograd_mem0_ = nullptr;
+  void *winograd_mem1_ = nullptr;
+
+  cl::Kernel kernel_4x4to36_;
+  cl::Kernel kernel_conv_;
+  cl::Kernel kernel_36to4x4_;
 };
 
 }  // namespace mindspore::kernel
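In the header, `has_bias_` is also appended to the `code_id` initializer: since the generated source now differs per bias variant, the flag has to take part in whatever key distinguishes cached programs. A toy illustration under that assumption (hypothetical `KernelVariantKey`, far smaller than the real key, which also carries shapes, pads, dilations, format, and activation type):

```cpp
#include <set>
#include <tuple>

// Toy illustration: every flag that changes the generated source must be part
// of the program cache key, or two kernels with different signatures would
// collide on one cached binary. (Hypothetical struct, not the real code_id.)
struct KernelVariantKey {
  int kh, kw;
  bool has_bias;
  bool use_fp16;
  bool operator<(const KernelVariantKey &o) const {
    return std::tie(kh, kw, has_bias, use_fp16) < std::tie(o.kh, o.kw, o.has_bias, o.use_fp16);
  }
};

int main() {
  std::set<KernelVariantKey> cache;
  cache.insert({3, 3, true, false});
  cache.insert({3, 3, false, false});  // distinct entry thanks to has_bias
  return cache.size() == 2 ? 0 : 1;
}
```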
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
index 27a59c92fd0..a3e2c4c656e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@@ -34,134 +34,71 @@ namespace mindspore::kernel {
 
 int ToFormatOpenCLKernel::Init() {
   auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
   out_mem_type_ = parameter->out_mem_type;
-  std::string program_name = "to_format";
-  std::map<schema::Format, std::string> format_str{
-    {schema::Format::Format_NCHW, "NCHW"},     {schema::Format::Format_NHWC, "NHWC"},
-    {schema::Format::Format_NC4HW4, "NC4HW4"}, {schema::Format::Format_NC4, "NHWC4"},
-    {schema::Format::Format_NC, "NHWC"},       {schema::Format::Format_NHWC4, "NHWC4"}};
-  std::string kernel_name =
-    "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()];
-  std::map<TypeId, std::string> dtype_str{
-    {kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt8, "Int8"}};
-  if (out_mem_type_ == OpenCLMemType::IMG) {
-    kernel_name += "_IMG_" + dtype_str[in_tensors_[0]->data_type()];
+  std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}};
+  std::string kernel_name;
+  if (parameter->out_mem_type == OpenCLMemType::IMG) {
+    kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_[0]->data_type()];
   } else {
-    kernel_name += "_BUF_" + dtype_str[out_tensors_[0]->data_type()];
+    kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_[0]->data_type()];
   }
-  this->set_name(kernel_name);
+
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
+  std::string program_name = "to_format";
   std::set<std::string> build_options;
   std::string source = to_format_source;
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
-  InitNHWCShape();
+
+  InitNHWC();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
 
-int ToFormatOpenCLKernel::InitNHWCShape() {
-  std::vector<int> shapex = out_tensors_[0]->shape();
-  size_t n, h, w, c;
-  if (shapex.size() == 2) {
-    n = shapex[0];
-    h = 1;
-    w = 1;
-    c = shapex[1];
-    nhwc_shape_ = {n, h, w, c};
-    return RET_OK;
+int ToFormatOpenCLKernel::InitNHWC() {
+  std::vector<int> out_shape = out_tensors_[0]->shape();
+  if (out_shape.size() == 1) {
+    N_ = out_shape[0];
+    H_ = 1;
+    W_ = 1;
+    C_ = 1;
+  } else if (out_shape.size() == 2) {
+    N_ = out_shape[0];
+    H_ = 1;
+    W_ = 1;
+    C_ = out_shape[1];
+  } else if (out_shape.size() == 3) {
+    N_ = out_shape[0];
+    H_ = 1;
+    W_ = out_shape[1];
+    C_ = out_shape[2];
+  } else if (out_shape.size() == 4) {
+    N_ = out_shape[0];
+    H_ = out_shape[1];
+    W_ = out_shape[2];
+    C_ = out_shape[3];
   }
-  if (shapex.size() == 3) {
-    n = 1;
-    h = 1;
-    w = 1;
-    c = 1;
-    nhwc_shape_ = {n, h, w, c};
-    return RET_OK;
-  }
-  if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4HW4 ||
-      out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4 ||
-      out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC) {
-    n = shapex[0];
-    h = shapex[1];
-    w = shapex[2];
-    c = shapex[3];
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NCHW) {
-    n = shapex[0];
-    h = shapex[2];
-    w = shapex[3];
-    c = shapex[1];
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4 ||
-             out_tensors_[0]->GetFormat() == schema::Format::Format_NC) {
-    n = shapex[0];
-    h = 1;
-    w = 1;
-    c = shapex[1];
-  } else {
-    n = shapex[0];
-    h = shapex[1];
-    w = shapex[2];
-    c = shapex[3];
-  }
-  nhwc_shape_ = {n, h, w, c};
-  return RET_OK;
-}
-
-int ToFormatOpenCLKernel::ReSize() { return RET_OK; }
-
-int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
-  std::vector<size_t> vec = {nhwc_shape_[0] * nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)};
-  *global_size = std::move(vec);
-  return RET_OK;
-}
-int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
-                                       std::vector<size_t> *local_size) {
   return RET_OK;
 }
 
 int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
-  size_t im_dst_x, im_dst_y;
-  if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4HW4) {
-    int c = nhwc_shape_[3];
-    int h = nhwc_shape_[1];
-    int w = nhwc_shape_[2];
-    im_dst_y = nhwc_shape_[0] * h * UP_DIV(c, C4NUM);
-    im_dst_x = w;
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
-    int h = nhwc_shape_[0] * nhwc_shape_[1];
-    int w = nhwc_shape_[2];
-    int c = nhwc_shape_[3];
-    im_dst_x = w * UP_DIV(c, C4NUM);
-    im_dst_y = h;
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4) {
-    int c = nhwc_shape_[3];
-    im_dst_x = UP_DIV(c, C4NUM);
-    im_dst_y = 1;
-  } else {
-    MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat();
-    return RET_ERROR;
-  }
-  img_size->clear();
-  auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
-  size_t img_dtype = CL_FLOAT;
-  if (enable_fp16_) {
-    img_dtype = CL_HALF_FLOAT;
-  }
-  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
-  *img_size = vec;
+  size_t img_height = N_ * H_;
+  size_t img_width = W_ * UP_DIV(C_, C4NUM);
+  size_t img_dtype = ocl_runtime_->GetFp16Enable() ? CL_HALF_FLOAT : CL_FLOAT;
+  *img_size = {img_width, img_height, img_dtype};
   return RET_OK;
 }
+
 int ToFormatOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  std::vector<size_t> local = {};
-  std::vector<size_t> global;
-  GetGlobalSize(0, &global);
-
-  cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]};
+  std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)};
+  std::vector<size_t> local = {16, 8, 1};
+  cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_};
   cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
+
   auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
   auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF;
   ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type);
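`InitNHWC` replaces the old per-format branching with a purely rank-based mapping onto N/H/W/C. A standalone sketch of that mapping (hypothetical `ToNHWC` helper, same conventions: rank 2 is treated as NC, rank 3 as NWC, rank 4 as NHWC; rank 1 through 4 assumed):

```cpp
#include <array>
#include <cassert>
#include <vector>

// Rank-based NHWC mapping as in InitNHWC(): missing dimensions default to 1.
std::array<size_t, 4> ToNHWC(const std::vector<int> &shape) {
  switch (shape.size()) {
    case 1:  return {static_cast<size_t>(shape[0]), 1, 1, 1};
    case 2:  return {static_cast<size_t>(shape[0]), 1, 1, static_cast<size_t>(shape[1])};
    case 3:  return {static_cast<size_t>(shape[0]), 1, static_cast<size_t>(shape[1]),
                     static_cast<size_t>(shape[2])};
    default: return {static_cast<size_t>(shape[0]), static_cast<size_t>(shape[1]),
                     static_cast<size_t>(shape[2]), static_cast<size_t>(shape[3])};
  }
}

int main() {
  assert((ToNHWC({8, 256}) == std::array<size_t, 4>{8, 1, 1, 256}));        // NC
  assert((ToNHWC({1, 64, 32}) == std::array<size_t, 4>{1, 1, 64, 32}));     // NWC
  assert((ToNHWC({1, 16, 16, 4}) == std::array<size_t, 4>{1, 16, 16, 4}));  // NHWC
  return 0;
}
```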
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
index f272ff1b355..5db1b83cbf2 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
@@ -31,16 +31,18 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
   ~ToFormatOpenCLKernel() override{};
 
   int Init() override;
-  int ReSize() override;
+  int ReSize() override { return RET_OK; };
   int Run() override;
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
-  int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override;
-  int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;
-  int InitNHWCShape();
 
  private:
+  int InitNHWC();
+
   cl::Kernel kernel_;
-  std::vector<size_t> nhwc_shape_;
+  size_t N_{1};
+  size_t H_{1};
+  size_t W_{1};
+  size_t C_{1};
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
index be5c598eb4e..fc38b78d098 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
@@ -237,33 +237,33 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) {
   }
   auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
   auto runtime = runtime_wrapper.GetInstance();
-  runtime->SyncCommandQueue();
-
   auto allocator = runtime->GetAllocator();
   auto origin_data = tensor->data_c();
-  allocator->MapBuffer(origin_data, CL_MAP_READ | CL_MAP_WRITE, nullptr, true);
-  tensor->SetData(origin_data);
+  runtime->SyncCommandQueue();
+  allocator->MapBuffer(origin_data, CL_MAP_READ, nullptr, true);
 
-  auto Batch = tensor->Batch();
-  auto Height = tensor->shape().size() == 4 ? tensor->Height() : 1;
-  auto Width = tensor->shape().size() == 4 ? tensor->Width() : 1;
-  auto SLICES = UP_DIV(tensor->Channel(), C4NUM);
+  auto shape = tensor->shape();
+  auto N = shape.size() > 0 ? shape[0] : 1;
+  auto H = shape.size() > 1 ? shape[1] : 1;
+  auto W = shape.size() > 2 ? shape[2] : 1;
+  auto C = shape.size() > 3 ? shape[3] : 1;
+  auto SLICES = UP_DIV(C, C4NUM);
+  auto ElementsC4Num = N * H * W * UP_ROUND(C, C4NUM);
   auto alignment = runtime->GetImagePitchAlignment();
-  auto dtype_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half4) : sizeof(cl_float4);
-  auto row_pitch = (Width * SLICES + alignment - 1) / alignment * alignment * dtype_size;
-  auto row_size = Width * SLICES * dtype_size;
-  std::vector<char> data(tensor->Size());
-  for (int i = 0; i < Batch * Height; ++i) {
+  auto FLT4_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half4) : sizeof(cl_float4);
+  auto row_pitch = (W * SLICES + alignment - 1) / alignment * alignment * FLT4_size;
+  auto row_size = W * SLICES * FLT4_size;
+  std::vector<char> data(N * H * row_size);
+  for (int i = 0; i < N * H; ++i) {
     memcpy(static_cast<char *>(data.data()) + i * row_size, static_cast<char *>(origin_data) + i * row_pitch,
            row_size);
   }
   std::cout << "shape=(";
-  for (auto x : tensor->shape()) {
+  for (auto x : shape) {
     printf("%3d,", x);
   }
   printf("): ");
-
-  for (size_t i = 0; i < num && i < tensor->ElementsNum(); ++i) {
+  for (size_t i = 0; i < num && i < ElementsC4Num; ++i) {
     if (tensor->data_type() == kNumberTypeFloat16)
       printf("%zu %6.3f | ", i, (reinterpret_cast<float16_t *>(data.data()))[i]);
     else
@@ -272,7 +272,7 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) {
   printf("\n");
 
   if (!out_file.empty()) {
-    Write2File(data.data(), out_file, tensor->Size());
+    Write2File(data.data(), out_file, data.size());
   }
   allocator->UnmapBuffer(origin_data);
 }
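The `PrintTensor` fix copies row by row because a mapped image buffer is padded out to the device's pitch alignment, so only `row_size` of every `row_pitch` bytes is payload. A minimal sketch of that de-pitching copy (hypothetical sizes, standalone stand-in for the mapped buffer):

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// De-pitching copy as in PrintTensor(): the device lays out each image row in
// row_pitch bytes (payload plus alignment padding), so a flat memcpy of
// rows * row_size would pull padding into the data. Copy row by row instead.
void CopyWithoutPitch(char *dst, const char *src, size_t rows, size_t row_size, size_t row_pitch) {
  for (size_t i = 0; i < rows; ++i) {
    memcpy(dst + i * row_size, src + i * row_pitch, row_size);
  }
}

int main() {
  // Hypothetical numbers: 4 rows of 24 payload bytes, padded to a 32-byte pitch.
  const size_t rows = 4, row_size = 24, row_pitch = 32;
  std::vector<char> mapped(rows * row_pitch, 'x');  // 'x' marks padding bytes
  for (size_t i = 0; i < rows; ++i) {
    memset(mapped.data() + i * row_pitch, '0' + static_cast<char>(i), row_size);
  }
  std::vector<char> data(rows * row_size);
  CopyWithoutPitch(data.data(), mapped.data(), rows, row_size, row_pitch);
  for (size_t i = 0; i < rows; ++i) {
    assert(data[i * row_size] == '0' + static_cast<char>(i));  // payload kept, padding dropped
  }
  return 0;
}
```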