From c01ab1150970424e459c07f1edecf40d52fa3a62 Mon Sep 17 00:00:00 2001
From: zhujingxuan
Date: Wed, 10 Feb 2021 15:52:53 +0800
Subject: [PATCH] add conv and conv3x3 coders

---
 mindspore/lite/micro/cmake/file_list.cmake    |   4 +
 .../nnacl/int8/conv2d_3x3_int8_coder.cc       | 161 +++++++++++
 .../nnacl/int8/conv2d_3x3_int8_coder.h        |  61 ++++
 .../opcoders/nnacl/int8/conv2d_int8_coder.cc  | 265 ++++++++++++++++++
 .../opcoders/nnacl/int8/conv2d_int8_coder.h   |  71 +++++
 .../nnacl_serializer/nnacl_int8_serializer.cc |  41 +++
 .../nnacl_serializer/nnacl_int8_serializer.h  |   2 +-
 .../nnacl_serializer/nnacl_stream_utils.h     |  12 +
 .../lite/micro/wrapper/int8/conv_init_int8.c  |  88 ++++++
 .../lite/micro/wrapper/int8/conv_init_int8.h  |  26 ++
 10 files changed, 730 insertions(+), 1 deletion(-)
 create mode 100644 mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
 create mode 100644 mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h
 create mode 100644 mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
 create mode 100644 mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h
 create mode 100644 mindspore/lite/micro/wrapper/int8/conv_init_int8.c
 create mode 100644 mindspore/lite/micro/wrapper/int8/conv_init_int8.h

diff --git a/mindspore/lite/micro/cmake/file_list.cmake b/mindspore/lite/micro/cmake/file_list.cmake
index 5ec38ce1383..d170115f26f 100644
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@@ -78,6 +78,8 @@ set(CODER_OPCODERS_SRC
     ${MICRO_DIR}/coder/opcoders/nnacl/int8/concat_int8_coder.cc
     ${MICRO_DIR}/coder/opcoders/nnacl/int8/fullconnection_int8_coder.cc
     ${MICRO_DIR}/coder/opcoders/nnacl/int8/matmul_int8_coder.cc
+    ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
+    ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
     ${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc
     ${MICRO_DIR}/coder/opcoders/nnacl/int8/reduce_int8_coder.cc
     ${MICRO_DIR}/coder/opcoders/nnacl/int8/reshape_int8_coder.cc
@@ -120,10 +122,12 @@ set(LITE_KERNEL_SRC
     ${LITE_DIR}/nnacl/int8/matmul_int8.c
     ${LITE_DIR}/nnacl/int8/fixed_point.c
     ${LITE_DIR}/nnacl/fp32/matmul_fp32.c
+    ${LITE_DIR}/nnacl/int8/conv3x3_int8.c
 )
 set(MICRO_ADAPTER_SRC
     ${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
     ${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
+    ${MICRO_DIR}/wrapper/int8/conv_init_int8.c
 )
 list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
new file mode 100644
index 00000000000..176e5ddbddb
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
@@ -0,0 +1,161 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h"
+#include <memory>
+#include <string>
+#include "securec/include/securec.h"
+#include "nnacl/int8/conv3x3_int8.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h"
+#include "micro/coder/opcoders/file_collector.h"
+#include "micro/coder/log.h"
+#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
+
+namespace mindspore::lite::micro::nnacl {
+void ProcessFilterUint8(int8_t *origin_weight, int16_t *dst_weight, ConvParameter *conv_param) {
+  int input_channel = conv_param->input_channel_;
+  int output_channel = conv_param->output_channel_;
+  int kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
+  int iC8 = UP_DIV(input_channel, C8NUM);
+
+  size_t tmp_size = output_channel * iC8 * C8NUM * kernel_plane * sizeof(int16_t);
+  auto tmp_addr = reinterpret_cast<int16_t *>(malloc(tmp_size));
+  MS_CHECK_PTR_IF_NULL(tmp_addr);
+  int ret = memset_s(tmp_addr, tmp_size, 0, tmp_size);
+  if (ret != EOK) {
+    free(tmp_addr);
+    MS_LOG(ERROR) << "memset_s tmp_addr failed.";
+    return;
+  }
+  PackWeightToC8Int8(origin_weight, tmp_addr, conv_param);
+  Conv3x3Int8FilterTransform(tmp_addr, dst_weight, iC8, output_channel, kernel_plane);
+  free(tmp_addr);
+}
+
+int Conv2D3x3Int8Coder::InitWeightBias() {
+  int input_channel = conv_param_->input_channel_;
+  int output_channel = conv_param_->output_channel_;
+  MS_CHECK_TRUE(input_channel > 0, "invalid input_channel");
+  MS_CHECK_TRUE(output_channel > 0, "invalid output_channel");
+  int iC8 = UP_DIV(input_channel, C8NUM);
+  int oC4 = UP_DIV(output_channel, C4NUM);
+  // init weight
+  int transformed_size = iC8 * C8NUM * oC4 * C4NUM * 16 * sizeof(int16_t);
+  transformed_filter_addr_ =
+    static_cast<int16_t *>(allocator_->Malloc(kNumberTypeInt16, transformed_size, kOfflinePackWeight));
+  MS_CHECK_PTR(transformed_filter_addr_);
+  MS_CHECK_RET_CODE(memset_s(transformed_filter_addr_, transformed_size, 0, transformed_size),
+                    "memset_s transformed_filter_addr_ failed.");
+  auto *original_weight_addr = reinterpret_cast<int8_t *>(filter_tensor_->data_c());
+  ProcessFilterUint8(original_weight_addr, transformed_filter_addr_, conv_param_);
+
+  // init bias
+  int new_bias_size = oC4 * C4NUM * sizeof(int32_t);
+  new_bias_addr_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, new_bias_size, kOfflinePackWeight));
+  MS_CHECK_PTR(new_bias_addr_);
+  MS_CHECK_RET_CODE(memset_s(new_bias_addr_, new_bias_size, 0, new_bias_size), "memset_s new_bias_addr_ failed.");
+  if (input_tensors_.size() == kInputSize2) {
+    auto *ori_bias_addr = reinterpret_cast<int32_t *>(bias_tensor_->data_c());
+    MS_CHECK_RET_CODE(
+      memcpy_s(new_bias_addr_, output_channel * sizeof(int32_t), ori_bias_addr, output_channel * sizeof(int32_t)),
+      "memcpy_s new_bias_addr_ failed.");
+  } else {
+    MS_ASSERT(input_tensors_.size() == kInputSize1);
+  }
+  return RET_OK;
+}
+
+int Conv2D3x3Int8Coder::InitTmpBuffer(CoderContext *const context) {
+  int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);
+  int oc4 = UP_DIV(conv_param_->output_channel_, C4NUM);
+  int in_batch = conv_param_->input_batch_;
+  int input_w = conv_param_->input_w_;
+  int input_h = conv_param_->input_h_;
+  int output_batch = conv_param_->output_batch_;
+  int output_w = conv_param_->output_w_;
+  int output_h = conv_param_->output_h_;
+
+  /*=============================tile_buffer_============================*/
+  tile_buffer_size_ = thread_num_ * TILE_NUM * 16 * ic8 * C8NUM * sizeof(int16_t);
+  tile_buffer_ =
+    static_cast<int16_t *>(allocator_->Malloc(kNumberTypeInt16, tile_buffer_size_, kWorkspace));
+
+  /*=============================block_unit_buffer_============================*/
+  block_unit_buffer_size_ = thread_num_ * 4 * 4 * C8NUM * sizeof(int16_t);
+  block_unit_buffer_ =
+    static_cast<int16_t *>(allocator_->Malloc(kNumberTypeInt16, block_unit_buffer_size_, kWorkspace));
+
+  /*=============================tmp_dst_buffer_============================*/
+  tmp_dst_buffer_size_ = thread_num_ * TILE_NUM * 16 * oc4 * C4NUM * sizeof(int32_t);
+  tmp_dst_buffer_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, tmp_dst_buffer_size_, kWorkspace));
+
+  /*=============================tmp_out_============================*/
+  tmp_out_size_ = oc4 * C4NUM * output_batch * output_w * output_h * sizeof(uint8_t);
+  tmp_out_ = static_cast<uint8_t *>(allocator_->Malloc(kNumberTypeUInt8, tmp_out_size_, kWorkspace));
+
+  /*=============================input_data_============================*/
+  c8_input_size_ = in_batch * input_h * input_w * ic8 * C8NUM * sizeof(int16_t);
+  c8_input_ = static_cast<int16_t *>(allocator_->Malloc(kNumberTypeInt16, c8_input_size_, kWorkspace));
+  return RET_OK;
+}
+
+void Conv2D3x3Int8Coder::ConfigInputOutput() { output_tensor_->set_format(schema::Format_NHWC); }
+
+int Conv2D3x3Int8Coder::Prepare(CoderContext *const context) {
+  conv_param_->thread_num_ = thread_num_;
+  // when thread_num_ is 1, task id is set to 0
+  conv_param_->op_parameter_.thread_num_ = thread_num_;
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "ConvolutionBase init failed.");
+  MS_CHECK_RET_CODE(SetQuantParam(), "Set quant param failed.");
+  MS_CHECK_RET_CODE(InitWeightBias(), "Init weight bias failed.");
+  // init tmp input, output
+  MS_CHECK_RET_CODE(InitTmpBuffer(context), "Init tmp buffer failed.");
+  // config input output
+  ConfigInputOutput();
+  return RET_OK;
+}
+
+int Conv2D3x3Int8Coder::DoCode(CoderContext *const context) {
+  Collect(context, {"nnacl/int8/conv_int8.h"}, {"pack.c", "conv_int8.c", "fixed_point.c"});
+  nnacl::NNaclInt8Serializer code;
+  code.precision(kPrecision);
+  // call the op function
+  code.CodeFunction("memset", tile_buffer_, 0, tile_buffer_size_);
+  code.CodeFunction("memset", block_unit_buffer_, 0, block_unit_buffer_size_);
+  code.CodeFunction("memset", tmp_dst_buffer_, 0, tmp_dst_buffer_size_);
+  code.CodeFunction("memset", tmp_out_, 0, tmp_out_size_);
+  code.CodeFunction("memset", c8_input_, 0, c8_input_size_);
+
+  // define conv params
+  code.CodeStruct("conv_param_", *conv_param_);
+  // pack to c8
+  code.CodeFunction("PackInputToC8Int8", input_tensor_, c8_input_, "&conv_param_");
+  // code operator func
+  if (thread_num_ > 1) {
+    code.CodeBaseStruct("Conv3x3Int8Args", "args", c8_input_, transformed_filter_addr_, new_bias_addr_, output_tensor_,
+                        tile_buffer_, block_unit_buffer_, tmp_dst_buffer_, tmp_out_, "&conv_param_");
+    code.CodeFunction("ParallelLaunch", "THREAD_POOL_DEFAULT", "Conv3x3Int8Run", "&args", "thread_num");
+  } else {
+    int task_id = 0;
+    code.CodeFunction("Conv3x3Int8", c8_input_, transformed_filter_addr_, new_bias_addr_, output_tensor_, tile_buffer_,
+                      block_unit_buffer_, tmp_dst_buffer_, tmp_out_, task_id, "&conv_param_");
+  }
+  code.CodeFunction("PackNC4HW4ToNHWCInt8", tmp_out_, output_tensor_, conv_param_->output_batch_,
+                    conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  context->AppendCode(code.str());
+  return RET_OK;
+}
+}  // namespace mindspore::lite::micro::nnacl
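For orientation, the single-thread code emitted by Conv2D3x3Int8Coder::DoCode would look roughly like the sketch below. It simply mirrors the CodeFunction calls above; every g_-prefixed symbol, the *_SIZE constants, and the ConvParameter initializer are illustrative placeholders, not actual generator output.

    memset(g_tile_buffer, 0, TILE_BUFFER_SIZE);
    memset(g_block_unit_buffer, 0, BLOCK_UNIT_BUFFER_SIZE);
    memset(g_tmp_dst_buffer, 0, TMP_DST_BUFFER_SIZE);
    memset(g_tmp_out, 0, TMP_OUT_SIZE);
    memset(g_c8_input, 0, C8_INPUT_SIZE);
    ConvParameter conv_param_ = {/* emitted by CodeStruct("conv_param_", ...) */};
    PackInputToC8Int8(g_input, g_c8_input, &conv_param_);
    Conv3x3Int8(g_c8_input, g_transformed_filter, g_new_bias, g_output, g_tile_buffer,
                g_block_unit_buffer, g_tmp_dst_buffer, g_tmp_out, 0, &conv_param_);
    PackNC4HW4ToNHWCInt8(g_tmp_out, g_output, conv_param_.output_batch_,
                         conv_param_.output_h_ * conv_param_.output_w_, conv_param_.output_channel_);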
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h
new file mode 100644
index 00000000000..eb05af43bff
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_CONV2D_3X3_INT8_CODER_H_
+#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_CONV2D_3X3_INT8_CODER_H_
+
+#include "micro/coder/opcoders/base/conv2d_base_coder.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include "nnacl/conv_parameter.h"
+
+namespace mindspore::lite::micro::nnacl {
+class Conv2D3x3Int8Coder final : public Conv2DBaseCoder {
+ public:
+  Conv2D3x3Int8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
+                     const Model::Node *node, size_t node_index, Target target)
+      : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
+
+  int Prepare(CoderContext *const context) override;
+
+  int DoCode(CoderContext *const context) override;
+
+  ~Conv2D3x3Int8Coder() override = default;
+
+ private:
+  int InitWeightBias();
+
+  void ConfigInputOutput();
+
+  int InitTmpBuffer(CoderContext *ctx);
+
+  int16_t *transformed_filter_addr_{nullptr};
+  int32_t *new_bias_addr_{nullptr};
+
+  int16_t *block_unit_buffer_{nullptr};
+  int16_t *tile_buffer_{nullptr};
+  int32_t *tmp_dst_buffer_{nullptr};
+  uint8_t *tmp_out_{nullptr};
+  int16_t *c8_input_{nullptr};
+
+  size_t tile_buffer_size_{0};
+  size_t block_unit_buffer_size_{0};
+  size_t tmp_dst_buffer_size_{0};
+  size_t tmp_out_size_{0};
+  size_t c8_input_size_{0};
+};
+}  // namespace mindspore::lite::micro::nnacl
+#endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_CONV2D_3X3_INT8_CODER_H_
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
new file mode 100644
index 00000000000..8d9a8f22200
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
@@ -0,0 +1,265 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h"
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "securec/include/securec.h"
+#include "micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h"
+#include "micro/coder/log.h"
+#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/int8/convolution_int8.h"
+#include "src/ops/populate/populate_register.h"
+#include "micro/coder/opcoders/file_collector.h"
+
+using mindspore::schema::PrimitiveType_Conv2D;
+
+namespace mindspore::lite::micro::nnacl {
+
+int Conv2DINT8Coder::InitTmpBuffer(CoderContext *const context) {
+  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int tmp_size;
+  if (target_ == kARM64) {
+    tmp_size = MSMAX(UP_ROUND(kernel_plane * conv_param_->input_channel_, C4NUM),
+                     UP_ROUND(kernel_plane * conv_param_->input_channel_, C16NUM));
+  } else {
+    if (support_optimize_) {
+      tmp_size = UP_ROUND(kernel_plane * conv_param_->input_channel_, C4NUM);
+    } else {
+      tmp_size = UP_ROUND(kernel_plane * conv_param_->input_channel_, C16NUM);
+    }
+  }
+  // malloc packed input
+  packed_input_size_ = tmp_size * thread_num_ * tile_num_ * sizeof(int8_t);
+  packed_input_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, packed_input_size_, kWorkspace));
+  MS_CHECK_PTR(packed_input_);
+  matmul_packed_input_size_ = thread_num_ * tile_num_ * kernel_plane * conv_param_->input_channel_ * sizeof(int8_t);
+  matmul_packed_input_ =
+    static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, matmul_packed_input_size_, kWorkspace));
+  MS_CHECK_PTR(matmul_packed_input_);
+  return RET_OK;
+}
+
+void Conv2DINT8Coder::CheckSupportOptimize() {
+  tile_num_ = 8;
+  matmul_func_ = "NULL";
+
+  switch (target_) {
+    case kARM32A:
+      support_optimize_ = false;
+      tile_num_ = 4;
+      matmul_func_ = "NULL";
+      break;
+    case kARM64:
+      // check support_optimize at runtime
+      matmul_func_ = "MatMulRInt8_optimize_handler";
+      tile_num_ = 8;
+      break;
+    case kX86:
+      support_optimize_ = true;
+      tile_num_ = 8;
+      break;
+    default:
+      MS_LOG(ERROR) << "target not supported";
+      return;
+  }
+  conv_param_->tile_num_ = tile_num_;
+}
+
+int Conv2DINT8Coder::InitWeightBias(CoderContext *const context) {
+  int32_t input_channel = filter_tensor_->Channel();
+  int32_t output_channel = filter_tensor_->Batch();
+  int32_t kernel_h = filter_tensor_->Height();
+  int32_t kernel_w = filter_tensor_->Width();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
+  auto output_channel_size = static_cast<size_t>(output_channel);
+  auto output_channel_data_size = static_cast<size_t>(output_channel_size * sizeof(int32_t));
+
+  int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
+  filter_peroc_ = conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL;
+
+  if (filter_peroc_) {
+    filter_zp_ptr_ =
+      static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, output_channel_data_size, kOfflinePackWeight));
+    MS_CHECK_PTR(filter_zp_ptr_);
+    MS_CHECK_RET_CODE(memset_s(filter_zp_ptr_, output_channel_data_size, 0, output_channel_data_size),
+                      "memset_s filter_zp_ptr_addr failed.");
+    for (int oc = 0; oc < output_channel; oc++) {
+      filter_zp_ptr_[oc] = conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_;
+    }
+  }
+
+  int up_round_oc;
+  switch (target_) {
+    case kARM32A:
+      up_round_oc = UP_ROUND(output_channel, C2NUM);
+      break;
+    case kARM64:
+      up_round_oc = MSMAX(UP_ROUND(output_channel, C8NUM), UP_ROUND(output_channel, C4NUM));
+      break;
+    case kX86:
+      up_round_oc = UP_ROUND(output_channel, C8NUM);
+      break;
+    default:
+      MS_LOG(ERROR) << "target not supported";
+      return RET_ERROR;
+  }
+
+  if (filter_peroc_) {
+    input_sum_size_ = up_round_oc * tile_num_ * thread_num_ * sizeof(int32_t);
+  } else {
+    input_sum_size_ = tile_num_ * thread_num_ * sizeof(int32_t);
+  }
+  input_sum_ =
+    static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, static_cast<size_t>(input_sum_size_), kWorkspace));
+  MS_CHECK_PTR(input_sum_);
+
+  packed_weight_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, kOnlineSize, kOnlinePackWeight));
+  MS_CHECK_PTR(packed_weight_);
+  bias_data_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, kOnlineSize, kOnlinePackWeight));
+  MS_CHECK_PTR(bias_data_);
+  std::string filter_zp_str = "";
+  std::string packed_weight_str = "(int8_t **)&" + allocator_->GetRuntimeAddr(packed_weight_);
+  std::string bias_data_str = "(int32_t **)&" + allocator_->GetRuntimeAddr(bias_data_);
+
+  nnacl::NNaclInt8Serializer code;
+
+  if (filter_peroc_) {
+    filter_zp_str = allocator_->GetRuntimeAddr(filter_zp_ptr_);
+  } else {
+    filter_zp_str = "filter_zp";
+    code << "int32_t filter_zp[1] = {" << conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ << "};\n";
+  }
+
+  if (target_ == kARM64) {
+    code.CodeFunctionWithCheck("ConvInit", filter_tensor_, bias_tensor_, filter_zp_str, kernel_h, kernel_w,
+                               input_channel, output_channel, input_zp, filter_peroc_, "GetSupportOptFlag()",
+                               packed_weight_str, bias_data_str);
+  } else {
+    code.CodeFunctionWithCheck("ConvInit", filter_tensor_, bias_tensor_, filter_zp_str, kernel_h, kernel_w,
+                               input_channel, output_channel, input_zp, filter_peroc_, support_optimize_,
+                               packed_weight_str, bias_data_str);
+  }
+
+  context->AppendInitCode(code.str());
+
+  return RET_OK;
+}
"THREAD_POOL_DEFAULT", "ConvInt8Run", "&args", "thread_num"); + } else { + if (target_ == kARM64) { + code << "if (GetSupportOptFlag()) {\n"; + code << "conv_param_.tile_num_ = " << 8 << ";\n"; + code << "} else {\n"; + code << "conv_param_.tile_num_ = " << 4 << ";\n"; + code << "}\n"; + code.CodeFunction("ConvInt8", input_tensor_, packed_input_, matmul_packed_input_, packed_weight_, bias_data_, + output_tensor_, filter_zp_ptr_, input_sum_, 0, "(ConvParameter *)&conv_param_", matmul_func_, + "GetSupportOptFlag()"); + } else { + code.CodeFunction("ConvInt8", input_tensor_, packed_input_, matmul_packed_input_, packed_weight_, bias_data_, + output_tensor_, filter_zp_ptr_, input_sum_, 0, "(ConvParameter *)&conv_param_", matmul_func_, + support_optimize_); + } + } + context->AppendCode(code.str()); + return RET_OK; +} + +std::unique_ptr CPUConv2DINT8CoderCreator(const std::vector &in_tensors, + const std::vector &out_tensors, + const Model::Node *node, size_t node_index, Target target) { + PrimitiveC *primitive_c = node->primitive_; + if (!primitive_c) { + return nullptr; + } + OpParameter *parameter = + PopulateRegistry::GetInstance()->GetParameterCreator((schema::PrimitiveType(primitive_c->Type())))(primitive_c); + if (parameter == nullptr) { + MS_LOG(ERROR) << "PopulateParameter return nullptr, type: " + << schema::EnumNamePrimitiveType((schema::PrimitiveType)(primitive_c->Type())); + return nullptr; + } + + auto *conv_param = reinterpret_cast(parameter); + int kernel_h = conv_param->kernel_h_; + int kernel_w = conv_param->kernel_w_; + int stride_h = conv_param->stride_h_; + int stride_w = conv_param->stride_w_; + int dilation_h = conv_param->dilation_h_; + int dilation_w = conv_param->dilation_w_; + free(parameter); + std::unique_ptr coder; + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { + coder = CPUOpCoderCreator(in_tensors, out_tensors, node, node_index, target); + } else if (kernel_h == 1 && kernel_w == 1) { + coder = CPUOpCoderCreator(in_tensors, out_tensors, node, node_index, target); + } else { + coder = CPUOpCoderCreator(in_tensors, out_tensors, node, node_index, target); + } + if (coder == nullptr) { + MS_LOG(ERROR) << "create conv2d int8 coder failed"; + return nullptr; + } + return coder; +} + +REG_OPERATOR_CODER(kX86, kNumberTypeInt8, PrimitiveType_Conv2D, CPUConv2DINT8CoderCreator) +REG_OPERATOR_CODER(kARM32A, kNumberTypeInt8, PrimitiveType_Conv2D, CPUConv2DINT8CoderCreator) +REG_OPERATOR_CODER(kARM64, kNumberTypeInt8, PrimitiveType_Conv2D, CPUConv2DINT8CoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h new file mode 100644 index 00000000000..52859be068c --- /dev/null +++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h @@ -0,0 +1,71 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h
new file mode 100644
index 00000000000..52859be068c
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h
@@ -0,0 +1,71 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_CONV2D_INT8_CODER_H_
+#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_CONV2D_INT8_CODER_H_
+
+#include <functional>
+#include <string>
+#include <vector>
+#include "micro/coder/opcoders/base/conv2d_base_coder.h"
+#include "nnacl/conv_parameter.h"
+#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
+
+namespace mindspore::lite::micro::nnacl {
+class Conv2DINT8Coder final : public Conv2DBaseCoder {
+ public:
+  explicit Conv2DINT8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
+                           const Model::Node *node, size_t node_index, Target target)
+      : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
+
+  int Prepare(CoderContext *const context) override;
+
+  int DoCode(CoderContext *const context) override;
+
+  ~Conv2DINT8Coder() override = default;
+
+ private:
+  int InitWeightBias(CoderContext *ctx);
+
+  void CheckSupportOptimize();
+
+  int InitTmpBuffer(CoderContext *ctx);
+
+  int Resize();
+
+  int8_t *packed_weight_{nullptr};
+  int32_t *bias_data_{nullptr};
+  int32_t *filter_zp_ptr_{nullptr};
+
+  int thread_count_{1};
+  int tile_num_{0};
+
+  bool support_optimize_{true};
+  bool filter_peroc_{false};
+
+  size_t packed_input_size_{0};
+  size_t input_sum_size_{0};
+  size_t matmul_packed_input_size_{0};
+
+  int8_t *packed_input_{nullptr};
+  int32_t *input_sum_{nullptr};
+  int8_t *matmul_packed_input_{nullptr};
+
+  std::string matmul_func_;
+
+  std::function pack_weight_init_{nullptr};
+};
+}  // namespace mindspore::lite::micro::nnacl
+#endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_CONV2D_INT8_CODER_H_
diff --git a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc
index 14d925fa089..51d60ed3f7d 100644
--- a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc
+++ b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc
@@ -19,6 +19,47 @@
 #include "micro/coder/log.h"
 
 namespace mindspore::lite::micro::nnacl {
+
+void NNaclInt8Serializer::CodeStruct(const std::string &name, const ConvParameter &conv_parameter) {
+  const ConvQuantArg &quant_arg = conv_parameter.conv_quant_arg_;
+  std::string quant_arg_in = name + "_quant_arg_in";
+  std::string quant_arg_w = name + "_quant_arg_w";
+  std::string quant_arg_out = name + "_quant_arg_out";
+  CodeArray(quant_arg_in, quant_arg.input_quant_args_, quant_arg.input_arg_num_, false);
+  CodeArray(quant_arg_w, quant_arg.filter_quant_args_, quant_arg.filter_arg_num_, false);
+  CodeArray(quant_arg_out, quant_arg.output_quant_args_, quant_arg.output_arg_num_, false);
+
+  std::string real_multiplier = name + "_real_multiplier";
+  std::string left_shift = name + "_left_shift";
+  std::string right_shift = name + "_right_shift";
+  std::string quant_multiplier = name + "_quant_multiplier";
+  CodeArray(real_multiplier, quant_arg.real_multiplier_, quant_arg.filter_arg_num_, false);
+  CodeArray(left_shift, quant_arg.left_shift_, quant_arg.filter_arg_num_, false);
+  CodeArray(right_shift, quant_arg.right_shift_, quant_arg.filter_arg_num_, false);
+  CodeArray(quant_multiplier, quant_arg.quant_multiplier_, quant_arg.filter_arg_num_, false);
+
+  std::string out_act_min = name + "_out_act_min";
+  std::string out_act_max = name + "_out_act_max";
+  CodeArray(out_act_min, quant_arg.out_act_min_, 1, false);
+  CodeArray(out_act_max, quant_arg.out_act_max_, 1, false);
+
+  std::string conv_quant_arg = name + "_conv_quant_arg";
+
+  CodeBaseStruct("ConvQuantArg", conv_quant_arg, quant_arg.round_mode_, quant_arg.quant_multiplier_mode_, quant_arg_in,
+                 quant_arg_w, quant_arg_out, real_multiplier, left_shift, right_shift, quant_multiplier, out_act_min,
+                 out_act_max, quant_arg.input_arg_num_, quant_arg.filter_arg_num_, quant_arg.output_arg_num_,
+                 quant_arg.per_channel_);
+
+  CodeBaseStruct(
+    "ConvParameter", name, conv_parameter.op_parameter_, conv_quant_arg, conv_parameter.kernel_h_,
+    conv_parameter.kernel_w_, conv_parameter.stride_h_, conv_parameter.stride_w_, conv_parameter.dilation_h_,
+    conv_parameter.dilation_w_, conv_parameter.pad_u_, conv_parameter.pad_d_, conv_parameter.pad_l_,
+    conv_parameter.pad_r_, conv_parameter.group_, conv_parameter.tile_num_, conv_parameter.input_batch_,
+    conv_parameter.input_h_, conv_parameter.input_w_, conv_parameter.input_channel_, conv_parameter.output_batch_,
+    conv_parameter.output_h_, conv_parameter.output_w_, conv_parameter.output_channel_, conv_parameter.thread_num_,
+    conv_parameter.input_unit_, conv_parameter.output_unit_, conv_parameter.pad_mode_, conv_parameter.act_type_);
+}
+
 void NNaclInt8Serializer::CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter) {
   CodeBaseStruct("ArithmeticParameter", name, arithmetic_parameter.op_parameter_, arithmetic_parameter.broadcasting_,
                  arithmetic_parameter.ndim_, arithmetic_parameter.activation_type_,
diff --git a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h
index 204e1f89461..a3c0550edbf 100644
--- a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h
+++ b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h
@@ -33,7 +33,7 @@ namespace mindspore::lite::micro::nnacl {
 class NNaclInt8Serializer : public Serializer {
  public:
   NNaclInt8Serializer() = default;
-  ~NNaclInt8Serializer() = default;
+  ~NNaclInt8Serializer() override = default;
   void CodeStruct(const std::string &name, const ConvParameter &conv_parameter);
   void CodeStruct(const std::string &name, const MatMulParameter &matmul_parameter);
   void CodeStruct(const std::string &name, const AddQuantParameter &add_quant_parameter);
diff --git a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_stream_utils.h b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_stream_utils.h
index d89549821b9..9fe57a7eb77 100644
--- a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_stream_utils.h
+++ b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_stream_utils.h
@@ -47,6 +47,18 @@ inline std::ostream &operator<<(std::ostream &code, RoundMode round_mode) {
   return code;
 }
 
+inline std::ostream &operator<<(std::ostream &code, RoundingMode rounding_mode) {
+  code << "(RoundingMode)"
+       << "(" << static_cast<int>(rounding_mode) << ")";
+  return code;
+}
+
+inline std::ostream &operator<<(std::ostream &code, PadMode pad_mode) {
+  code << "(PadMode)"
+       << "(" << static_cast<int>(pad_mode) << ")";
+  return code;
+}
+
 inline std::ostream &operator<<(std::ostream &code, ActType act_type) {
   code << "(ActType)"
        << "(" << static_cast<int>(act_type) << ")";
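These operator<< overloads are what let CodeStruct above serialize enum-typed ConvParameter fields as compilable casts rather than bare integers. A minimal illustration (the enum value 2 is arbitrary):

    std::ostringstream code;
    code << static_cast<PadMode>(2);  // appends "(PadMode)(2)" to the generated source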
diff --git a/mindspore/lite/micro/wrapper/int8/conv_init_int8.c b/mindspore/lite/micro/wrapper/int8/conv_init_int8.c
new file mode 100644
index 00000000000..276e9d99020
--- /dev/null
+++ b/mindspore/lite/micro/wrapper/int8/conv_init_int8.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "wrapper/int8/conv_init_int8.h"
+#include <string.h>
+#include "nnacl/op_base.h"
+#include "nnacl/int8/matmul_int8.h"
+#include "nnacl/errorcode.h"
+
+int ConvInit(int8_t *origin_weight, const int32_t *ori_bias, const int32_t *filter_quant_zps, int kernel_h,
+             int kernel_w, int input_channel, int output_channel, int32_t input_zp, bool filter_peroc,
+             bool support_optimize, int8_t **packed_weight, int32_t **bias_data) {
+  int8_t *packed_weight_ = NULL;
+  int32_t *bias_data_ = NULL;
+  int kernel_plane = kernel_h * kernel_w;
+  int up_round_deep;
+  int up_round_oc;
+#ifdef ENABLE_ARM32
+  up_round_oc = UP_ROUND(output_channel, C2NUM);
+  up_round_deep = UP_ROUND(kernel_plane * input_channel, C16NUM);
+#else
+  if (support_optimize) {
+    up_round_oc = UP_ROUND(output_channel, C8NUM);
+    up_round_deep = UP_ROUND(kernel_plane * input_channel, C4NUM);
+  } else {
+    up_round_oc = UP_ROUND(output_channel, C4NUM);
+    up_round_deep = UP_ROUND(kernel_plane * input_channel, C16NUM);
+  }
+#endif
+  int pack_weight_size = up_round_oc * up_round_deep;
+  size_t bias_size = up_round_oc * sizeof(int32_t);
+
+  // init weight
+  packed_weight_ = (int8_t *)(malloc(pack_weight_size));
+  if (packed_weight_ == NULL) {
+    return NNACL_ERR;
+  }
+  memset(packed_weight_, 0, pack_weight_size);
+#ifdef ENABLE_ARM32
+  RowMajor2Row2x16MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane);
+#else
+  if (support_optimize) {
+    RowMajor2Row8x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane);
+  } else {
+    RowMajor2Row16x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane);
+  }
+#endif
+
+  // init bias
+  bias_data_ = (int32_t *)(malloc(bias_size));
+  if (bias_data_ == NULL) {
+    free(packed_weight_);
+    return NNACL_ERR;
+  }
+  memset(bias_data_, 0, bias_size);
+  if (ori_bias != NULL) {
+    memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
+  }
+
+  for (int oc = 0; oc < output_channel; oc++) {
+    int32_t filter_zp = filter_quant_zps[0];
+    if (filter_peroc) {
+      filter_zp = filter_quant_zps[oc];
+    }
+    int32_t weight_sum_value = up_round_deep * filter_zp;
+    for (int i = 0; i < kernel_plane * input_channel; i++) {
+      weight_sum_value += origin_weight[oc * kernel_plane * input_channel + i] - filter_zp;
+    }
+    bias_data_[oc] += filter_zp * input_zp * up_round_deep - weight_sum_value * input_zp;
+  }
+
+  *packed_weight = packed_weight_;
+  *bias_data = bias_data_;
+  return NNACL_OK;
+}
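The per-channel loop above folds the quantization zero points into the bias, which is the standard int8-GEMM correction term: since weight_sum_value starts at up_round_deep * filter_zp, the two up_round_deep terms cancel and the update reduces to bias[oc] -= input_zp * (sum_of_weights - K * filter_zp) with K = kernel_plane * input_channel. A sketch of that closed form (helper name is hypothetical):

    /* Hypothetical helper: closed form of the bias correction computed by the
     * loop in ConvInit for one output channel; K = kernel_plane * input_channel. */
    static int32_t BiasCorrection(const int8_t *weights_oc, int K, int32_t filter_zp, int32_t input_zp) {
      int32_t sum = 0;
      for (int i = 0; i < K; i++) {
        sum += weights_oc[i];
      }
      return -input_zp * (sum - K * filter_zp);
    }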
diff --git a/mindspore/lite/micro/wrapper/int8/conv_init_int8.h b/mindspore/lite/micro/wrapper/int8/conv_init_int8.h
new file mode 100644
index 00000000000..0eff8d7a3f9
--- /dev/null
+++ b/mindspore/lite/micro/wrapper/int8/conv_init_int8.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_MICRO_INT8_CONV_INIT_H_
+#define MINDSPORE_LITE_MICRO_INT8_CONV_INIT_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+int ConvInit(int8_t *origin_weight, const int32_t *ori_bias, const int32_t *filter_quant_zps, int kernel_h,
+             int kernel_w, int input_channel, int output_channel, int32_t input_zp, bool filter_peroc,
+             bool support_optimize, int8_t **packed_weight, int32_t **bias_data);
+
+#endif  // MINDSPORE_LITE_MICRO_INT8_CONV_INIT_H_
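A minimal host-side check of the new ConvInit wrapper might look like the sketch below; it assumes NNACL_OK is 0 and uses tiny illustrative shapes. With input_zp and filter_zp both 0, the bias correction vanishes, so bias_data[0] should come back as 100.

    #include <stdio.h>
    #include <stdlib.h>
    #include "wrapper/int8/conv_init_int8.h"

    int main(void) {
      int8_t weight[1 * 3 * 3 * 2] = {0};  /* oc = 1, kh = kw = 3, ic = 2 */
      int32_t bias[1] = {100};
      int32_t filter_zp[1] = {0};
      int8_t *packed_weight = NULL;
      int32_t *bias_data = NULL;
      int ret = ConvInit(weight, bias, filter_zp, /*kernel_h=*/3, /*kernel_w=*/3,
                         /*input_channel=*/2, /*output_channel=*/1, /*input_zp=*/0,
                         /*filter_peroc=*/false, /*support_optimize=*/true,
                         &packed_weight, &bias_data);
      printf("ConvInit ret=%d bias[0]=%d\n", ret, ret == 0 ? (int)bias_data[0] : 0);
      free(packed_weight);
      free(bias_data);
      return ret;
    }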