!12277 add conv and conv3x3 coder

From: @zhujingxuan Reviewed-by: @wangchengyuan,@HilbertDavid Signed-off-by: @wangchengyuan
2021-02-18 09:04:29 +08:00 · 2021-02-18 09:04:29 +08:00 · f2650ecfc5
parent f9f24ca94d c01ab11509
commit f2650ecfc5
10 changed files with 730 additions and 1 deletions
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@ -78,6 +78,8 @@ set(CODER_OPCODERS_SRC
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/concat_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/fullconnection_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/matmul_int8_coder.cc
+        ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
+        ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/reduce_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/reshape_int8_coder.cc
@ -120,10 +122,12 @@ set(LITE_KERNEL_SRC
        ${LITE_DIR}/nnacl/int8/matmul_int8.c
        ${LITE_DIR}/nnacl/int8/fixed_point.c
        ${LITE_DIR}/nnacl/fp32/matmul_fp32.c
+        ${LITE_DIR}/nnacl/int8/conv3x3_int8.c
        )
 set(MICRO_ADAPTER_SRC
        ${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
        ${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
+        ${MICRO_DIR}/wrapper/int8/conv_init_int8.c
        )

 list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
@ -0,0 +1,161 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h"
+#include <string>
+#include <vector>
+#include "securec/include/securec.h"
+#include "nnacl/int8/conv3x3_int8.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h"
+#include "micro/coder/opcoders/file_collector.h"
+#include "micro/coder/log.h"
+#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
+
+namespace mindspore::lite::micro::nnacl {
+// Transforms the original int8 filter into the int16 layout written to dst_weight.
+// The filter is first packed by PackWeightToC8Int8 into a zero-filled scratch buffer and then handed to
+// Conv3x3Int8FilterTransform. The scratch buffer is freed on every path that got past the allocation check.
+// The function returns void, so a failed memset_s is only reported through the log.
+void ProcessFilterUint8(int8_t *origin_weight, int16_t *dst_weight, ConvParameter *conv_param) {
+  const int out_ch = conv_param->output_channel_;
+  const int plane = conv_param->kernel_w_ * conv_param->kernel_h_;
+  const int in_ch_blocks = UP_DIV(conv_param->input_channel_, C8NUM);
+
+  // scratch buffer: input channels are counted in whole C8NUM blocks
+  const size_t scratch_bytes = out_ch * in_ch_blocks * C8NUM * plane * sizeof(int16_t);
+  auto *tmp_addr = reinterpret_cast<int16_t *>(malloc(scratch_bytes));
+  MS_CHECK_PTR_IF_NULL(tmp_addr);
+  if (memset_s(tmp_addr, scratch_bytes, 0, scratch_bytes) != EOK) {
+    free(tmp_addr);
+    MS_LOG(ERROR) << "memset_s tmp_addr failed.";
+    return;
+  }
+
+  PackWeightToC8Int8(origin_weight, tmp_addr, conv_param);
+  Conv3x3Int8FilterTransform(tmp_addr, dst_weight, in_ch_blocks, out_ch, plane);
+  free(tmp_addr);
+}
+
+// Allocates the offline-packed weight and bias buffers of the 3x3 int8 convolution and fills them.
+//  - weight: zero-filled, then written by ProcessFilterUint8 from the filter tensor data.
+//  - bias:   zero-filled buffer of oC4 * C4NUM int32 values; when the node has kInputSize2 inputs the first
+//            output_channel values are copied from the bias tensor, otherwise the buffer stays zero.
+// Returns RET_OK on success; the MS_CHECK_* macros handle the failure paths.
+int Conv2D3x3Int8Coder::InitWeightBias() {
+  int input_channel = conv_param_->input_channel_;
+  int output_channel = conv_param_->output_channel_;
+  MS_CHECK_TRUE(input_channel > 0, "invalid input_channel");
+  MS_CHECK_TRUE(output_channel > 0, "invalid output_channel");
+  int iC8 = UP_DIV(input_channel, C8NUM);
+  int oC4 = UP_DIV(output_channel, C4NUM);
+  // init weight
+  // sizes are computed in size_t so the byte count is not narrowed to int
+  size_t transformed_size = static_cast<size_t>(iC8) * C8NUM * oC4 * C4NUM * 16 * sizeof(int16_t);
+  transformed_filter_addr_ =
+    static_cast<int16_t *>(allocator_->Malloc(kNumberTypeInt16, transformed_size, kOfflinePackWeight));
+  MS_CHECK_PTR(transformed_filter_addr_);
+  MS_CHECK_RET_CODE(memset_s(transformed_filter_addr_, transformed_size, 0, transformed_size),
+                    "memset_s transformed_filter_addr_ failed.");
+  auto *original_weight_addr = reinterpret_cast<int8_t *>(filter_tensor_->data_c());
+  MS_CHECK_PTR(original_weight_addr);
+  ProcessFilterUint8(original_weight_addr, transformed_filter_addr_, conv_param_);
+
+  // init bias
+  size_t new_bias_size = static_cast<size_t>(oC4) * C4NUM * sizeof(int32_t);
+  new_bias_addr_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, new_bias_size, kOfflinePackWeight));
+  MS_CHECK_PTR(new_bias_addr_);
+  MS_CHECK_RET_CODE(memset_s(new_bias_addr_, new_bias_size, 0, new_bias_size), "memset_s new_bias_addr_ failed.");
+  if (input_tensors_.size() == kInputSize2) {
+    MS_CHECK_PTR(bias_tensor_);
+    auto *ori_bias_addr = reinterpret_cast<int32_t *>(bias_tensor_->data_c());
+    MS_CHECK_PTR(ori_bias_addr);
+    // destMax is the real capacity of the destination, not the number of bytes being copied
+    MS_CHECK_RET_CODE(
+      memcpy_s(new_bias_addr_, new_bias_size, ori_bias_addr, static_cast<size_t>(output_channel) * sizeof(int32_t)),
+      "memcpy_s new_bias_addr_ failed.");
+  } else {
+    MS_ASSERT(input_tensors_.size() == kInputSize1);
+  }
+  return RET_OK;
+}
+
+// Reserves the workspace buffers used by the generated 3x3 int8 convolution code and records their byte sizes
+// so DoCode can emit the matching memset calls. Every allocation is checked before the next one is requested.
+// The context argument is not used here.
+int Conv2D3x3Int8Coder::InitTmpBuffer(CoderContext *const context) {
+  int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);
+  int oc4 = UP_DIV(conv_param_->output_channel_, C4NUM);
+  int in_batch = conv_param_->input_batch_;
+  int input_w = conv_param_->input_w_;
+  int input_h = conv_param_->input_h_;
+  int output_batch = conv_param_->output_batch_;
+  int output_w = conv_param_->output_w_;
+  int output_h = conv_param_->output_h_;
+
+  /*=============================tile_buffer_============================*/
+  tile_buffer_size_ = thread_num_ * TILE_NUM * 16 * ic8 * C8NUM * sizeof(int16_t);
+  tile_buffer_ = static_cast<int16_t *>(allocator_->Malloc(kNumberTypeInt16, tile_buffer_size_, kWorkspace));
+  MS_CHECK_PTR(tile_buffer_);
+
+  /*=============================block_unit_buffer_============================*/
+  block_unit_buffer_size_ = thread_num_ * 4 * 4 * C8NUM * sizeof(int16_t);
+  block_unit_buffer_ =
+    static_cast<int16_t *>(allocator_->Malloc(kNumberTypeInt16, block_unit_buffer_size_, kWorkspace));
+  MS_CHECK_PTR(block_unit_buffer_);
+
+  /*=============================tmp_dst_buffer_============================*/
+  tmp_dst_buffer_size_ = thread_num_ * TILE_NUM * 16 * oc4 * C4NUM * sizeof(int32_t);
+  tmp_dst_buffer_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, tmp_dst_buffer_size_, kWorkspace));
+  MS_CHECK_PTR(tmp_dst_buffer_);
+
+  /*=============================tmp_out_============================*/
+  tmp_out_size_ = oc4 * C4NUM * output_batch * output_w * output_h * sizeof(uint8_t);
+  tmp_out_ = static_cast<uint8_t *>(allocator_->Malloc(kNumberTypeUInt8, tmp_out_size_, kWorkspace));
+  MS_CHECK_PTR(tmp_out_);
+
+  /*=============================input_data_============================*/
+  c8_input_size_ = in_batch * input_h * input_w * ic8 * C8NUM * sizeof(int16_t);
+  c8_input_ = static_cast<int16_t *>(allocator_->Malloc(kNumberTypeInt16, c8_input_size_, kWorkspace));
+  MS_CHECK_PTR(c8_input_);
+  return RET_OK;
+}
+
+// Marks the output tensor as NHWC.
+void Conv2D3x3Int8Coder::ConfigInputOutput() {
+  output_tensor_->set_format(schema::Format_NHWC);
+}
+
+// Prepares the coder before code generation: propagates the thread count, runs the base-class Init,
+// sets the quantization parameters, packs weight/bias, reserves the workspace buffers and finally
+// configures the output tensor format. Each step is wrapped in MS_CHECK_RET_CODE; RET_OK is returned
+// once all steps have run.
+int Conv2D3x3Int8Coder::Prepare(CoderContext *const context) {
+  conv_param_->thread_num_ = thread_num_;
+  // keep the op parameter's thread count in sync with the conv parameter's
+  conv_param_->op_parameter_.thread_num_ = thread_num_;
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "ConvolutionBase init failed.");
+  MS_CHECK_RET_CODE(SetQuantParam(), "Set quant param failed.");
+  MS_CHECK_RET_CODE(InitWeightBias(), "Init weight bias failed.");
+  // init tmp input, output
+  MS_CHECK_RET_CODE(InitTmpBuffer(context), "Init tmp buffer failed.");
+  // config input output
+  ConfigInputOutput();
+  return RET_OK;
+}
+
+// Emits the inference code of the 3x3 int8 convolution into the coder context:
+//   1. memset of every workspace buffer reserved by InitTmpBuffer,
+//   2. the ConvParameter struct definition,
+//   3. PackInputToC8Int8 on the input tensor,
+//   4. either a ParallelLaunch of Conv3x3Int8Run (thread_num_ > 1) or a direct Conv3x3Int8 call with task id 0,
+//   5. PackNC4HW4ToNHWCInt8 from tmp_out_ into the output tensor.
+// The conv3x3 header and source are collected as well, because the emitted code calls Conv3x3Int8 and
+// PackInputToC8Int8.
+int Conv2D3x3Int8Coder::DoCode(CoderContext *const context) {
+  Collect(context, {"nnacl/int8/conv_int8.h", "nnacl/int8/conv3x3_int8.h"},
+          {"pack.c", "conv_int8.c", "conv3x3_int8.c", "fixed_point.c"});
+  nnacl::NNaclInt8Serializer code;
+  code.precision(kPrecision);
+  // clear the workspace buffers before every run
+  code.CodeFunction("memset", tile_buffer_, 0, tile_buffer_size_);
+  code.CodeFunction("memset", block_unit_buffer_, 0, block_unit_buffer_size_);
+  code.CodeFunction("memset", tmp_dst_buffer_, 0, tmp_dst_buffer_size_);
+  code.CodeFunction("memset", tmp_out_, 0, tmp_out_size_);
+  code.CodeFunction("memset", c8_input_, 0, c8_input_size_);
+
+  // define conv params
+  code.CodeStruct("conv_param_", *conv_param_);
+  // pack to c8
+  code.CodeFunction("PackInputToC8Int8", input_tensor_, c8_input_, "&conv_param_");
+  // code operator func
+  if (thread_num_ > 1) {
+    code.CodeBaseStruct("Conv3x3Int8Args", "args", c8_input_, transformed_filter_addr_, new_bias_addr_, output_tensor_,
+                        tile_buffer_, block_unit_buffer_, tmp_dst_buffer_, tmp_out_, "&conv_param_");
+    code.CodeFunction("ParallelLaunch", "THREAD_POOL_DEFAULT", "Conv3x3Int8Run", "&args", "thread_num");
+  } else {
+    int task_id = 0;
+    code.CodeFunction("Conv3x3Int8", c8_input_, transformed_filter_addr_, new_bias_addr_, output_tensor_, tile_buffer_,
+                      block_unit_buffer_, tmp_dst_buffer_, tmp_out_, task_id, "&conv_param_");
+  }
+  code.CodeFunction("PackNC4HW4ToNHWCInt8", tmp_out_, output_tensor_, conv_param_->output_batch_,
+                    conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  context->AppendCode(code.str());
+  return RET_OK;
+}
+}  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h
@ -0,0 +1,61 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_3X3_INT8_CODER_H_
+#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_3X3_INT8_CODER_H_
+#include "micro/coder/opcoders/base/conv2d_base_coder.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include "nnacl/conv_parameter.h"
+
+namespace mindspore::lite::micro::nnacl {
+// Operator coder that generates code for the 3x3 int8 convolution.
+// Prepare() packs weight/bias and reserves workspace buffers; DoCode() emits the calls that use them.
+class Conv2D3x3Int8Coder final : public Conv2DBaseCoder {
+ public:
+  // Forwards all arguments unchanged to Conv2DBaseCoder.
+  Conv2D3x3Int8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
+                     const Model::Node *node, size_t node_index, Target target)
+      : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
+
+  int Prepare(CoderContext *const context) override;
+
+  int DoCode(CoderContext *const context) override;
+
+  ~Conv2D3x3Int8Coder() override = default;
+
+ private:
+  // Allocates and fills transformed_filter_addr_ and new_bias_addr_.
+  int InitWeightBias();
+
+  // Sets the output tensor format.
+  void ConfigInputOutput();
+
+  // Allocates the workspace buffers below and records their sizes.
+  int InitTmpBuffer(CoderContext *ctx);
+
+  // offline-packed data (allocated with kOfflinePackWeight)
+  int16_t *transformed_filter_addr_{nullptr};
+  int32_t *new_bias_addr_{nullptr};
+
+  // workspace buffers (allocated with kWorkspace)
+  int16_t *block_unit_buffer_{nullptr};
+  int16_t *tile_buffer_{nullptr};
+  int32_t *tmp_dst_buffer_{nullptr};
+  uint8_t *tmp_out_{nullptr};
+  int16_t *c8_input_{nullptr};
+
+  // byte sizes of the workspace buffers, used for the emitted memset calls
+  size_t tile_buffer_size_{0};
+  size_t block_unit_buffer_size_{0};
+  size_t tmp_dst_buffer_size_{0};
+  size_t tmp_out_size_{0};
+  size_t c8_input_size_{0};
+};
+}  // namespace mindspore::lite::micro::nnacl
+#endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_3X3_INT8_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
@ -0,0 +1,265 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include <utility>
+#include "securec/include/securec.h"
+#include "micro/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.h"
+#include "micro/coder/log.h"
+#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/int8/convolution_int8.h"
+#include "src/ops/populate/populate_register.h"
+#include "micro/coder/opcoders/file_collector.h"
+
+using mindspore::schema::PrimitiveType_Conv2D;
+
+namespace mindspore::lite::micro::nnacl {
+
+// Reserves the two workspace buffers for the packed input and records their byte sizes.
+// The per-tile depth is rounded up to C4NUM or C16NUM; on ARM64 the larger of the two roundings is used.
+int Conv2DINT8Coder::InitTmpBuffer(CoderContext *const context) {
+  const int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  const int deep_c4 = UP_ROUND(kernel_plane * conv_param_->input_channel_, C4NUM);
+  const int deep_c16 = UP_ROUND(kernel_plane * conv_param_->input_channel_, C16NUM);
+
+  int tmp_size;
+  if (target_ == kARM64) {
+    tmp_size = MSMAX(deep_c4, deep_c16);
+  } else {
+    tmp_size = support_optimize_ ? deep_c4 : deep_c16;
+  }
+
+  // packed input
+  packed_input_size_ = tmp_size * thread_num_ * tile_num_ * sizeof(int8_t);
+  packed_input_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, packed_input_size_, kWorkspace));
+  MS_CHECK_PTR(packed_input_);
+
+  // matmul packed input
+  matmul_packed_input_size_ = thread_num_ * tile_num_ * kernel_plane * conv_param_->input_channel_ * sizeof(int8_t);
+  matmul_packed_input_ =
+    static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, matmul_packed_input_size_, kWorkspace));
+  MS_CHECK_PTR(matmul_packed_input_);
+  return RET_OK;
+}
+
+// Chooses tile_num_, matmul_func_ and support_optimize_ from the target and stores the tile number in the
+// conv parameter. For an unknown target an error is logged and the conv parameter is left untouched.
+void Conv2DINT8Coder::CheckSupportOptimize() {
+  // defaults, kept unless the target overrides them below
+  tile_num_ = 8;
+  matmul_func_ = "NULL";
+
+  if (target_ == kARM32A) {
+    support_optimize_ = false;
+    tile_num_ = 4;
+  } else if (target_ == kARM64) {
+    // support_optimize is checked at runtime for this target
+    matmul_func_ = "MatMulRInt8_optimize_handler";
+  } else if (target_ == kX86) {
+    support_optimize_ = true;
+  } else {
+    MS_LOG(ERROR) << "target not supported";
+    return;
+  }
+  conv_param_->tile_num_ = tile_num_;
+}
+
+// Prepares weight/bias handling for the generic int8 convolution.
+// Reads the convolution shape from the filter tensor, allocates the per-channel filter zero points (when the
+// filter is quantized per channel), reserves the input-sum workspace and the online-packed weight/bias slots,
+// and appends a ConvInit call to the init code of the context.
+// Returns RET_ERROR for an unsupported target, RET_OK otherwise (the MS_CHECK_* macros handle other failures).
+int Conv2DINT8Coder::InitWeightBias(CoderContext *const context) {
+  int32_t input_channel = filter_tensor_->Channel();
+  int32_t output_channel = filter_tensor_->Batch();
+  int32_t kernel_h = filter_tensor_->Height();
+  int32_t kernel_w = filter_tensor_->Width();
+  conv_param_->input_channel_ = input_channel;
+  conv_param_->output_channel_ = output_channel;
+  auto output_channel_size = static_cast<size_t>(output_channel);
+  auto output_channel_data_size = static_cast<size_t>(output_channel_size * sizeof(int32_t));
+
+  int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
+  filter_peroc_ = conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL;
+
+  // per-channel quantization: copy one zero point per output channel into an offline buffer
+  if (filter_peroc_) {
+    filter_zp_ptr_ =
+      static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, output_channel_data_size, kOfflinePackWeight));
+    MS_CHECK_PTR(filter_zp_ptr_);
+    MS_CHECK_RET_CODE(memset_s(filter_zp_ptr_, output_channel_data_size, 0, output_channel_data_size),
+                      "memset_s filter_zp_ptr_addr failed.");
+    for (int oc = 0; oc < output_channel; oc++) {
+      filter_zp_ptr_[oc] = conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_;
+    }
+  }
+
+  // output channel count rounded up by a target-specific factor
+  int up_round_oc;
+  switch (target_) {
+    case kARM32A:
+      up_round_oc = UP_ROUND(output_channel, C2NUM);
+      break;
+    case kARM64:
+      up_round_oc = MSMAX(UP_ROUND(output_channel, C8NUM), UP_ROUND(output_channel, C4NUM));
+      break;
+    case kX86:
+      up_round_oc = UP_ROUND(output_channel, C8NUM);
+      break;
+    default:
+      MS_LOG(ERROR) << "target not supported";
+      return RET_ERROR;
+  }
+
+  // the input-sum workspace holds one value per tile, or per tile and rounded output channel when per channel
+  if (filter_peroc_) {
+    input_sum_size_ = up_round_oc * tile_num_ * thread_num_ * sizeof(int32_t);
+  } else {
+    input_sum_size_ = tile_num_ * thread_num_ * sizeof(int32_t);
+  }
+  input_sum_ =
+    static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, static_cast<size_t>(input_sum_size_), kWorkspace));
+  MS_CHECK_PTR(input_sum_);
+
+  // weight and bias are requested as kOnlinePackWeight; their runtime addresses are passed to ConvInit below
+  packed_weight_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, kOnlineSize, kOnlinePackWeight));
+  MS_CHECK_PTR(packed_weight_);
+  bias_data_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, kOnlineSize, kOnlinePackWeight));
+  MS_CHECK_PTR(bias_data_);
+  std::string filter_zp_str = "";
+  std::string packed_weight_str = "(int8_t **)&" + allocator_->GetRuntimeAddr(packed_weight_);
+  std::string bias_data_str = "(int32_t **)&" + allocator_->GetRuntimeAddr(bias_data_);
+
+  nnacl::NNaclInt8Serializer code;
+
+  // per tensor: emit a one-element filter_zp array; per channel: reference the offline zero-point buffer
+  if (filter_peroc_) {
+    filter_zp_str = allocator_->GetRuntimeAddr(filter_zp_ptr_);
+  } else {
+    filter_zp_str = "filter_zp";
+    code << "int32_t filter_zp[1] = {" << conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ << "};\n";
+  }
+
+  // on ARM64 the optimize flag is queried by the generated code at runtime instead of being baked in
+  if (target_ == kARM64) {
+    code.CodeFunctionWithCheck("ConvInit", filter_tensor_, bias_tensor_, filter_zp_str, kernel_h, kernel_w,
+                               input_channel, output_channel, input_zp, filter_peroc_, "GetSupportOptFlag()",
+                               packed_weight_str, bias_data_str);
+  } else {
+    code.CodeFunctionWithCheck("ConvInit", filter_tensor_, bias_tensor_, filter_zp_str, kernel_h, kernel_w,
+                               input_channel, output_channel, input_zp, filter_peroc_, support_optimize_,
+                               packed_weight_str, bias_data_str);
+  }
+
+  context->AppendInitCode(code.str());
+
+  return RET_OK;
+}
+
+// Prepares the coder before code generation: base-class Init, target-dependent tiling, quantization
+// parameters, weight/bias init code, resize validation and workspace buffers.
+// The result of Conv2DBaseCoder::Init() is checked like every other step, so a failed init is not
+// silently carried into the following steps.
+int Conv2DINT8Coder::Prepare(CoderContext *const context) {
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Conv2DBaseCoder init failed.");
+  CheckSupportOptimize();
+  MS_CHECK_RET_CODE(SetQuantParam(), "Set quant param failed!");
+  MS_CHECK_RET_CODE(InitWeightBias(context), "Init weight bias failed.");
+  MS_CHECK_RET_CODE(Resize(), "Resize failed.");
+  MS_CHECK_RET_CODE(InitTmpBuffer(context), "InitTmpBuffer failed.");
+  return RET_OK;
+}
+
+// Runs the base-class resize validation and then the base-class Init again.
+// Returns RET_OK when both MS_CHECK_RET_CODE checks let execution continue.
+int Conv2DINT8Coder::Resize() {
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::CheckResizeValid(), "Resize is invalid.");
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Conv2DBaseCoder init failed.");
+  return RET_OK;
+}
+
+// Emits the inference code of the generic int8 convolution into the coder context:
+//   1. memset of the three workspace buffers (each emitted exactly once),
+//   2. the ConvParameter struct definition,
+//   3. either a ParallelLaunch of ConvInt8Run (thread_num_ > 1) or a direct ConvInt8 call with task id 0.
+// On ARM64 the single-thread path also emits a runtime selection of tile_num_ (8 or 4) based on
+// GetSupportOptFlag().
+int Conv2DINT8Coder::DoCode(CoderContext *const context) {
+  Collect(context, {"nnacl/int8/conv_int8.h", "nnacl/common_func.h", "nnacl/kernel/int8/conv_init_int8.h"},
+          {"common_func.c", "pack.c", "conv_int8.c", "winograd_transform.c", "matmul_int8.c", "fixed_point.c",
+           "conv_init_int8.c"});
+  // call the op function
+  nnacl::NNaclInt8Serializer code;
+  code.precision(kPrecision);
+  code.CodeFunction("memset", packed_input_, 0, packed_input_size_);
+  code.CodeFunction("memset", input_sum_, 0, input_sum_size_);
+  code.CodeFunction("memset", matmul_packed_input_, 0, matmul_packed_input_size_);
+
+  conv_param_->op_parameter_.thread_num_ = thread_num_;
+  conv_param_->thread_num_ = thread_num_;
+  code.CodeStruct("conv_param_", *conv_param_);
+
+  // code operator func
+  if (thread_num_ > 1) {
+    // matmul_packed_input_ has already been cleared above; no second memset is emitted here
+    code.CodeBaseStruct("ConvOptInt8Args", "args", input_tensor_, packed_input_, matmul_packed_input_, packed_weight_,
+                        bias_data_, output_tensor_, input_sum_, thread_num_s_, "(ConvParameter *)&conv_param_",
+                        matmul_func_);
+    code.CodeFunction("ParallelLaunch", "THREAD_POOL_DEFAULT", "ConvInt8Run", "&args", "thread_num");
+  } else {
+    if (target_ == kARM64) {
+      // the optimized kernel is selected at runtime, so the tile number is patched at runtime as well
+      code << "if (GetSupportOptFlag()) {\n";
+      code << "conv_param_.tile_num_ = " << 8 << ";\n";
+      code << "} else {\n";
+      code << "conv_param_.tile_num_ = " << 4 << ";\n";
+      code << "}\n";
+      code.CodeFunction("ConvInt8", input_tensor_, packed_input_, matmul_packed_input_, packed_weight_, bias_data_,
+                        output_tensor_, filter_zp_ptr_, input_sum_, 0, "(ConvParameter *)&conv_param_", matmul_func_,
+                        "GetSupportOptFlag()");
+    } else {
+      code.CodeFunction("ConvInt8", input_tensor_, packed_input_, matmul_packed_input_, packed_weight_, bias_data_,
+                        output_tensor_, filter_zp_ptr_, input_sum_, 0, "(ConvParameter *)&conv_param_", matmul_func_,
+                        support_optimize_);
+    }
+  }
+  context->AppendCode(code.str());
+  return RET_OK;
+}
+
+// Creates the int8 Conv2D coder for a node.
+// The convolution parameter is populated only to inspect kernel size, stride and dilation and is freed
+// right afterwards. A 3x3 kernel with stride 1 and dilation 1 gets the Conv2D3x3Int8Coder; every other
+// configuration gets the generic Conv2DINT8Coder.
+// Returns nullptr when the node or its primitive is missing, when no parameter creator is registered for
+// the primitive type, when parameter population fails, or when the coder cannot be created.
+std::unique_ptr<OperatorCoder> CPUConv2DINT8CoderCreator(const std::vector<Tensor *> &in_tensors,
+                                                         const std::vector<Tensor *> &out_tensors,
+                                                         const Model::Node *node, size_t node_index, Target target) {
+  if (node == nullptr) {
+    MS_LOG(ERROR) << "node is nullptr";
+    return nullptr;
+  }
+  PrimitiveC *primitive_c = node->primitive_;
+  if (!primitive_c) {
+    return nullptr;
+  }
+  // look the creator up first: calling a missing creator would dereference a null function
+  auto parameter_creator =
+    PopulateRegistry::GetInstance()->GetParameterCreator((schema::PrimitiveType(primitive_c->Type())));
+  if (parameter_creator == nullptr) {
+    MS_LOG(ERROR) << "parameter creator not found, type: "
+                  << schema::EnumNamePrimitiveType((schema::PrimitiveType)(primitive_c->Type()));
+    return nullptr;
+  }
+  OpParameter *parameter = parameter_creator(primitive_c);
+  if (parameter == nullptr) {
+    MS_LOG(ERROR) << "PopulateParameter return nullptr, type: "
+                  << schema::EnumNamePrimitiveType((schema::PrimitiveType)(primitive_c->Type()));
+    return nullptr;
+  }
+
+  auto *conv_param = reinterpret_cast<ConvParameter *>(parameter);
+  int kernel_h = conv_param->kernel_h_;
+  int kernel_w = conv_param->kernel_w_;
+  int stride_h = conv_param->stride_h_;
+  int stride_w = conv_param->stride_w_;
+  int dilation_h = conv_param->dilation_h_;
+  int dilation_w = conv_param->dilation_w_;
+  free(parameter);
+  std::unique_ptr<OperatorCoder> coder;
+  if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
+    coder = CPUOpCoderCreator<Conv2D3x3Int8Coder>(in_tensors, out_tensors, node, node_index, target);
+  } else {
+    // 1x1 kernels and all remaining configurations share the generic coder
+    coder = CPUOpCoderCreator<Conv2DINT8Coder>(in_tensors, out_tensors, node, node_index, target);
+  }
+  if (coder == nullptr) {
+    MS_LOG(ERROR) << "create conv2d int8 coder failed";
+    return nullptr;
+  }
+  return coder;
+}
+
+// Register the same creator for int8 Conv2D on every target listed here (x86, ARM32A, ARM64).
+REG_OPERATOR_CODER(kX86, kNumberTypeInt8, PrimitiveType_Conv2D, CPUConv2DINT8CoderCreator)
+REG_OPERATOR_CODER(kARM32A, kNumberTypeInt8, PrimitiveType_Conv2D, CPUConv2DINT8CoderCreator)
+REG_OPERATOR_CODER(kARM64, kNumberTypeInt8, PrimitiveType_Conv2D, CPUConv2DINT8CoderCreator)
+}  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h
@ -0,0 +1,71 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_INT8_CODER_H_
+#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_INT8_CODER_H_
+#include <string>
+#include <memory>
+#include <vector>
+#include "micro/coder/opcoders/base/conv2d_base_coder.h"
+#include "nnacl/conv_parameter.h"
+#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
+
+namespace mindspore::lite::micro::nnacl {
+// Operator coder that generates code for the generic int8 convolution.
+// Prepare() selects the target-dependent tiling, emits the ConvInit call and reserves workspace buffers;
+// DoCode() emits the calls that run the convolution.
+class Conv2DINT8Coder final : public Conv2DBaseCoder {
+ public:
+  // Forwards all arguments unchanged to Conv2DBaseCoder.
+  explicit Conv2DINT8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
+                           const Model::Node *node, size_t node_index, Target target)
+      : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
+
+  int Prepare(CoderContext *const context) override;
+
+  int DoCode(CoderContext *const context) override;
+
+  ~Conv2DINT8Coder() override = default;
+
+ private:
+  // Allocates weight/bias/zero-point storage and appends the ConvInit call to the init code.
+  int InitWeightBias(CoderContext *ctx);
+
+  // Chooses tile_num_, matmul_func_ and support_optimize_ from the target.
+  void CheckSupportOptimize();
+
+  // Allocates packed_input_ and matmul_packed_input_.
+  int InitTmpBuffer(CoderContext *ctx);
+
+  int Resize();
+
+  // online-packed weight and bias slots, and the per-channel filter zero points (null when per tensor)
+  int8_t *packed_weight_{nullptr};
+  int32_t *bias_data_{nullptr};
+  int32_t *filter_zp_ptr_{nullptr};
+
+  int thread_count_{1};
+  int tile_num_{0};
+
+  bool support_optimize_{true};
+  bool filter_peroc_{false};
+
+  // byte sizes of the workspace buffers, used for the emitted memset calls
+  size_t packed_input_size_{0};
+  size_t input_sum_size_{0};
+  size_t matmul_packed_input_size_{0};
+
+  // workspace buffers
+  int8_t *packed_input_{nullptr};
+  int32_t *input_sum_{nullptr};
+  int8_t *matmul_packed_input_{nullptr};
+
+  // name of the matmul handler emitted into the generated code, or "NULL"
+  std::string matmul_func_;
+
+  std::function<int(nnacl::NNaclInt8Serializer &, const std::string &, const std::string &)> pack_weight_init_{nullptr};
+};
+}  // namespace mindspore::lite::micro::nnacl
+#endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_INT8_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc
+++ b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc
@ -19,6 +19,47 @@
 #include "micro/coder/log.h"

 namespace mindspore::lite::micro::nnacl {
+
+// Emits the definition of a ConvParameter named `name`.
+// The quantization arrays are emitted first as separate arrays named `<name>_...`, then a ConvQuantArg named
+// `<name>_conv_quant_arg` that refers to them, and finally the ConvParameter itself.
+void NNaclInt8Serializer::CodeStruct(const std::string &name, const ConvParameter &conv_parameter) {
+  const ConvQuantArg &quant_arg = conv_parameter.conv_quant_arg_;
+  std::string quant_arg_in = name + "_quant_arg_in";
+  std::string quant_arg_w = name + "_quant_arg_w";
+  std::string quant_arg_out = name + "_quant_arg_out";
+  CodeArray(quant_arg_in, quant_arg.input_quant_args_, quant_arg.input_arg_num_, false);
+  CodeArray(quant_arg_w, quant_arg.filter_quant_args_, quant_arg.filter_arg_num_, false);
+  CodeArray(quant_arg_out, quant_arg.output_quant_args_, quant_arg.output_arg_num_, false);
+
+  // multiplier and shift arrays are emitted with filter_arg_num_ elements each
+  std::string real_multiplier = name + "_real_multiplier";
+  std::string left_shift = name + "_left_shift";
+  std::string right_shift = name + "_right_shift";
+  std::string quant_multiplier = name + "_quant_multiplier";
+  CodeArray(real_multiplier, quant_arg.real_multiplier_, quant_arg.filter_arg_num_, false);
+  CodeArray(left_shift, quant_arg.left_shift_, quant_arg.filter_arg_num_, false);
+  CodeArray(right_shift, quant_arg.right_shift_, quant_arg.filter_arg_num_, false);
+  CodeArray(quant_multiplier, quant_arg.quant_multiplier_, quant_arg.filter_arg_num_, false);
+
+  // activation bounds are emitted as one-element arrays
+  std::string out_act_min = name + "_out_act_min";
+  std::string out_act_max = name + "_out_act_max";
+  CodeArray(out_act_min, quant_arg.out_act_min_, 1, false);
+  CodeArray(out_act_max, quant_arg.out_act_max_, 1, false);
+
+  std::string conv_quant_arg = name + "_conv_quant_arg";
+
+  // NOTE(review): the argument order presumably has to match the field order of ConvQuantArg - confirm
+  CodeBaseStruct("ConvQuantArg", conv_quant_arg, quant_arg.round_mode_, quant_arg.quant_multiplier_mode_, quant_arg_in,
+                 quant_arg_w, quant_arg_out, real_multiplier, left_shift, right_shift, quant_multiplier, out_act_min,
+                 out_act_max, quant_arg.input_arg_num_, quant_arg.filter_arg_num_, quant_arg.output_arg_num_,
+                 quant_arg.per_channel_);
+
+  // NOTE(review): the argument order presumably has to match the field order of ConvParameter - confirm
+  CodeBaseStruct(
+    "ConvParameter", name, conv_parameter.op_parameter_, conv_quant_arg, conv_parameter.kernel_h_,
+    conv_parameter.kernel_w_, conv_parameter.stride_h_, conv_parameter.stride_w_, conv_parameter.dilation_h_,
+    conv_parameter.dilation_w_, conv_parameter.pad_u_, conv_parameter.pad_d_, conv_parameter.pad_l_,
+    conv_parameter.pad_r_, conv_parameter.group_, conv_parameter.tile_num_, conv_parameter.input_batch_,
+    conv_parameter.input_h_, conv_parameter.input_w_, conv_parameter.input_channel_, conv_parameter.output_batch_,
+    conv_parameter.output_h_, conv_parameter.output_w_, conv_parameter.output_channel_, conv_parameter.thread_num_,
+    conv_parameter.input_unit_, conv_parameter.output_unit_, conv_parameter.pad_mode_, conv_parameter.act_type_);
+}
+
 void NNaclInt8Serializer::CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter) {
  CodeBaseStruct("ArithmeticParameter", name, arithmetic_parameter.op_parameter_, arithmetic_parameter.broadcasting_,
                 arithmetic_parameter.ndim_, arithmetic_parameter.activation_type_,
--- a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h
+++ b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h
@ -33,7 +33,7 @@ namespace mindspore::lite::micro::nnacl {
 class NNaclInt8Serializer : public Serializer {
 public:
  NNaclInt8Serializer() = default;
-  ~NNaclInt8Serializer() = default;
+  ~NNaclInt8Serializer() override = default;
  void CodeStruct(const std::string &name, const ConvParameter &conv_parameter);
  void CodeStruct(const std::string &name, const MatMulParameter &matmul_parameter);
  void CodeStruct(const std::string &name, const AddQuantParameter &add_quant_parameter);
--- a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_stream_utils.h
+++ b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_stream_utils.h
@ -47,6 +47,18 @@ inline std::ostream &operator<<(std::ostream &code, RoundMode round_mode) {
  return code;
 }

+// Writes a RoundingMode as a C cast expression, "(RoundingMode)(<int value>)".
+inline std::ostream &operator<<(std::ostream &code, RoundingMode rounding_mode) {
+  const int raw_value = static_cast<int>(rounding_mode);
+  return code << "(RoundingMode)(" << raw_value << ")";
+}
+
+// Writes a PadMode as a C cast expression, "(PadMode)(<int value>)".
+inline std::ostream &operator<<(std::ostream &code, PadMode pad_mode) {
+  const int raw_value = static_cast<int>(pad_mode);
+  return code << "(PadMode)(" << raw_value << ")";
+}
+
 inline std::ostream &operator<<(std::ostream &code, ActType act_type) {
  code << "(ActType)"
       << "(" << static_cast<int>(act_type) << ")";
--- a/mindspore/lite/micro/wrapper/int8/conv_init_int8.c
+++ b/mindspore/lite/micro/wrapper/int8/conv_init_int8.c
@ -0,0 +1,88 @@
+/*
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "wrapper/int8/conv_init_int8.h"
+#include <memory.h>
+#include "nnacl/op_base.h"
+#include "nnacl/int8/matmul_int8.h"
+#include "nnacl/errorcode.h"
+
+/*
+ * Packs the convolution weight and builds the bias used by the int8 convolution.
+ *
+ * origin_weight     original int8 weight, read as output_channel rows of kernel_h * kernel_w * input_channel values
+ * ori_bias          original bias with output_channel values, may be NULL (bias then starts from zero)
+ * filter_quant_zps  filter zero points: one per output channel when filter_peroc is true, otherwise only [0] is read
+ * input_zp          input zero point
+ * support_optimize  selects the packing layout on non-ARM32 builds
+ * packed_weight     out: newly malloc'ed packed weight, owned by the caller
+ * bias_data         out: newly malloc'ed bias, owned by the caller
+ *
+ * Returns NNACL_OK on success. Returns NNACL_ERR when a required pointer is NULL, a dimension is not
+ * positive, or an allocation fails; in that case the out parameters are not written.
+ */
+int ConvInit(int8_t *origin_weight, const int32_t *ori_bias, const int32_t *filter_quant_zps, int kernel_h,
+             int kernel_w, int input_channel, int output_channel, int32_t input_zp, bool filter_peroc,
+             bool support_optimize, int8_t **packed_weight, int32_t **bias_data) {
+  if (origin_weight == NULL || filter_quant_zps == NULL || packed_weight == NULL || bias_data == NULL) {
+    return NNACL_ERR;
+  }
+  if (kernel_h <= 0 || kernel_w <= 0 || input_channel <= 0 || output_channel <= 0) {
+    return NNACL_ERR;
+  }
+  int8_t *packed_weight_ = NULL;
+  int32_t *bias_data_ = NULL;
+  int kernel_plane = kernel_h * kernel_w;
+  /* number of weight values per output channel */
+  int deep = kernel_plane * input_channel;
+  int up_round_deep;
+  int up_round_oc;
+#ifdef ENABLE_ARM32
+  up_round_oc = UP_ROUND(output_channel, C2NUM);
+  up_round_deep = UP_ROUND(deep, C16NUM);
+#else
+  if (support_optimize) {
+    up_round_oc = UP_ROUND(output_channel, C8NUM);
+    up_round_deep = UP_ROUND(deep, C4NUM);
+  } else {
+    up_round_oc = UP_ROUND(output_channel, C4NUM);
+    up_round_deep = UP_ROUND(deep, C16NUM);
+  }
+#endif
+  /* sizes are computed in size_t so large shapes do not overflow int */
+  size_t pack_weight_size = (size_t)up_round_oc * (size_t)up_round_deep;
+  size_t bias_size = (size_t)up_round_oc * sizeof(int32_t);
+
+  // init weight
+  packed_weight_ = (int8_t *)(malloc(pack_weight_size));
+  if (packed_weight_ == NULL) {
+    return NNACL_ERR;
+  }
+  memset(packed_weight_, 0, pack_weight_size);
+#ifdef ENABLE_ARM32
+  RowMajor2Row2x16MajorInt8(origin_weight, packed_weight_, output_channel, deep);
+#else
+  if (support_optimize) {
+    RowMajor2Row8x4MajorInt8(origin_weight, packed_weight_, output_channel, deep);
+  } else {
+    RowMajor2Row16x4MajorInt8(origin_weight, packed_weight_, output_channel, deep);
+  }
+#endif
+
+  // init bias
+  bias_data_ = (int32_t *)(malloc(bias_size));
+  if (bias_data_ == NULL) {
+    free(packed_weight_);
+    return NNACL_ERR;
+  }
+  memset(bias_data_, 0, bias_size);
+  if (ori_bias != NULL) {
+    memcpy(bias_data_, ori_bias, (size_t)output_channel * sizeof(int32_t));
+  }
+
+  /* fold the zero-point terms of each output channel into its bias */
+  for (int oc = 0; oc < output_channel; oc++) {
+    int32_t filter_zp = filter_peroc ? filter_quant_zps[oc] : filter_quant_zps[0];
+    const int8_t *weight_row = origin_weight + (size_t)oc * (size_t)deep;
+    int32_t weight_sum_value = up_round_deep * filter_zp;
+    for (int i = 0; i < deep; i++) {
+      weight_sum_value += weight_row[i] - filter_zp;
+    }
+    bias_data_[oc] += filter_zp * input_zp * up_round_deep - weight_sum_value * input_zp;
+  }
+
+  *packed_weight = packed_weight_;
+  *bias_data = bias_data_;
+  return NNACL_OK;
+}
--- a/mindspore/lite/micro/wrapper/int8/conv_init_int8.h
+++ b/mindspore/lite/micro/wrapper/int8/conv_init_int8.h
@ -0,0 +1,26 @@
+/*
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_MICRO_INT8_CONV_INIT_H_
+#define MINDSPORE_LITE_MICRO_INT8_CONV_INIT_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+/*
+ * Packs the int8 convolution weight and builds the bias at init time.
+ * On success *packed_weight and *bias_data receive newly malloc'ed buffers.
+ * NOTE(review): presumably the caller is responsible for freeing them - confirm.
+ * ori_bias may be NULL. Returns an nnacl error code.
+ */
+int ConvInit(int8_t *origin_weight, const int32_t *ori_bias, const int32_t *filter_quant_zps, int kernel_h,
+             int kernel_w, int input_channel, int output_channel, int32_t input_zp, bool filter_peroc,
+             bool support_optimize, int8_t **packed_weight, int32_t **bias_data);
+
+#endif  // MINDSPORE_LITE_MICRO_INT8_CONV_INIT_H_