!12391 add conv1x1 coder

From: @zhujingxuan
Reviewed-by: 
Signed-off-by:
mindspore-ci-bot 2021-02-19 15:09:11 +08:00 committed by Gitee
commit 3cc3d5c9cf
10 changed files with 678 additions and 6 deletions

${MICRO_DIR}/cmake/file_list.cmake

@@ -81,6 +81,7 @@ set(CODER_OPCODERS_SRC
${MICRO_DIR}/coder/opcoders/nnacl/int8/concat_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/fullconnection_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/matmul_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc
@@ -126,13 +127,10 @@ set(LITE_KERNEL_SRC
${LITE_DIR}/nnacl/int8/fixed_point.c
${LITE_DIR}/nnacl/fp32/matmul_fp32.c
${LITE_DIR}/nnacl/int8/conv3x3_int8.c
)
set(MICRO_ADAPTER_SRC
${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
${MICRO_DIR}/wrapper/int8/conv_init_int8.c
${LITE_DIR}/nnacl/int8/conv1x1_int8.c
${LITE_DIR}/nnacl/base/conv1x1_base.c
)
list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MICRO_ADAPTER_SRC})
${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC})

${MICRO_DIR}/cmake/wrapper.cmake

@@ -0,0 +1,12 @@
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
set(MICRO_WRAPPER_SRC
${LITE_DIR}/src/runtime/thread_pool.c
${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
${MICRO_DIR}/wrapper/int8/conv_init_int8.c
${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.c
${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.c
)
list(APPEND FILE_SET ${MICRO_WRAPPER_SRC})

CMakeLists.txt

@@ -19,6 +19,7 @@ include_directories(${TOP_DIR}/mindspore/core/)
#include coder
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../)
include(${MICRO_DIR}/cmake/file_list.cmake)
include(${MICRO_DIR}/cmake/wrapper.cmake)
add_executable(codegen main.cc ${FILE_SET})
add_dependencies(codegen fbs_src)
add_dependencies(codegen fbs_inner_src)

${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc

@@ -0,0 +1,193 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h"
#include <string>
#include <vector>
#include "securec/include/securec.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "micro/coder/opcoders/file_collector.h"
#include "micro/coder/log.h"
#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
namespace mindspore::lite::micro::nnacl {
int Conv2D1x1Int8Coder::Prepare(CoderContext *const context) {
matmul_param_ = new (std::nothrow) MatMulParameter();
MS_CHECK_PTR(matmul_param_);
MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Init failed");
MS_CHECK_RET_CODE(Conv2DBaseCoder::SetQuantParam(), "SetQuantParam failed");
filter_peroc_ = (conv_param_->conv_quant_arg_.filter_arg_num_ != kPerTensor);
if (filter_peroc_) {
MS_CHECK_RET_CODE(InitFilterPeroc(), "InitFilterPeroc failed.");
}
CheckSupportOptimize();
MS_CHECK_RET_CODE(InitWeightBias(context), "InitWeightBias failed");
MS_CHECK_RET_CODE(InitParam(), "InitParam failed");
MS_CHECK_RET_CODE(InitRunBuf(), "InitRunBuf failed");
return RET_OK;
}
int Conv2D1x1Int8Coder::DoCode(CoderContext *const context) {
Collect(context,
{"nnacl/int8/conv1x1_int8.h", "nnacl/common_func.h", "wrapper/int8/conv1x1_init_int8.h",
"wrapper/int8/conv1x1_run_int8.h"},
{"common_func.c", "pack.c", "conv1x1_int8.c", "matmul_int8.c", "fixed_point.c", "conv1x1_init_int8.c",
"conv1x1_run_int8.c"});
nnacl::NNaclInt8Serializer code;
code.CodeStruct("conv_param", *conv_param_);
code.CodeStruct("matmul_param", *matmul_param_);
code.CodeBaseStruct("Conv1x1Args", "args", input_sum_, filter_zp_ptr_, left_shift_, right_shift_, multiplier_,
packed_weight_, bias_data_, packed_input_, nullptr, nullptr, 0, 0, "conv_param", "matmul_param",
matmul_func_, pre_trans_input_, support_optimize_, filter_peroc_);
code.CodeFunction("Conv1x1Run", input_tensor_, "args", "THREAD_POOL_DEFAULT", thread_num_s_, output_tensor_);
context->AppendCode(code.str());
return RET_OK;
}
void Conv2D1x1Int8Coder::CheckSupportOptimize() {
support_optimize_ = false;
matmul_func_ = "MatMulInt8_4x16_r";
if (target_ == kARM64) {
matmul_func_ = "MatMulDpInt8_optimize_handler";
}
}
int Conv2D1x1Int8Coder::InitWeightBias(CoderContext *const context) {
int32_t input_channel = filter_tensor_->Channel();
int32_t output_channel = filter_tensor_->Batch();
int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
nnacl::NNaclInt8Serializer code;
packed_weight_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, kOnlineSize, kOnlinePackWeight));
MS_CHECK_PTR(packed_weight_);
bias_data_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, kOnlineSize, kOnlinePackWeight));
MS_CHECK_PTR(bias_data_);
std::string packed_weight_str = "(int8_t **)&" + allocator_->GetRuntimeAddr(packed_weight_);
std::string bias_data_str = "(int32_t **)&" + allocator_->GetRuntimeAddr(bias_data_);
std::string filter_zp_str = "";
if (filter_peroc_) {
filter_zp_str = allocator_->GetRuntimeAddr(filter_zp_ptr_);
} else {
MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
filter_zp_str = "filter_zp";
code << "int32_t filter_zp[1] = {" << conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ << "};\n";
}
if (target_ == kARM64) {
code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
output_channel, input_zp, "GetSupportOptFlag()", filter_peroc_, packed_weight_str,
bias_data_str);
} else {
code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
output_channel, input_zp, support_optimize_, filter_peroc_, packed_weight_str,
bias_data_str);
}
context->AppendInitCode(code.str());
return RET_OK;
}
int Conv2D1x1Int8Coder::InitFilterPeroc() {
int32_t output_channel = filter_tensor_->Batch();
int round_oc;
if (target_ == kARM32A) {
round_oc = UP_ROUND(output_channel, C2NUM);
} else {
round_oc = MSMAX(UP_ROUND(output_channel, C16NUM), UP_ROUND(output_channel, C4NUM));
}
MS_CHECK_TRUE(conv_quant_arg_->filter_arg_num_ == static_cast<size_t>(output_channel),
"weight per channel quant param length is not equal to filter num, filter is not PerChannel");
size_t output_size = output_channel * sizeof(int32_t);
size_t oc_size = round_oc * sizeof(int32_t);
/* filter zp */
filter_zp_ptr_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, output_size, kOfflinePackWeight));
MS_CHECK_PTR(filter_zp_ptr_);
MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
for (int fi = 0; fi < output_channel; fi++) {
filter_zp_ptr_[fi] = conv_param_->conv_quant_arg_.filter_quant_args_[fi].zp_;
}
/* left shift */
left_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
MS_CHECK_PTR(left_shift_);
MS_CHECK_RET_CODE(memset_s(left_shift_, oc_size, 0, oc_size), "memset left_shift_ failed");
MS_CHECK_RET_CODE(memcpy_s(left_shift_, oc_size, conv_param_->conv_quant_arg_.left_shift_, output_size),
"memcpy_s left_shift_ failed");
/* right shift */
right_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
MS_CHECK_PTR(right_shift_);
MS_CHECK_RET_CODE(memset_s(right_shift_, oc_size, 0, oc_size), "memset right_shift_ failed");
MS_CHECK_RET_CODE(memcpy_s(right_shift_, oc_size, conv_param_->conv_quant_arg_.right_shift_, output_size),
"memcpy_s right_shift_ failed");
/* multiplier */
multiplier_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
MS_CHECK_PTR(multiplier_);
MS_CHECK_RET_CODE(memset_s(multiplier_, oc_size, 0, oc_size), "memset multiplier_ failed");
MS_CHECK_RET_CODE(memcpy_s(multiplier_, oc_size, conv_param_->conv_quant_arg_.quant_multiplier_, output_size),
"memcpy_s multiplier_ failed");
return RET_OK;
}
int Conv2D1x1Int8Coder::InitParam() {
pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
conv_param_->stride_w_ != 1);
matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
matmul_param_->deep_ = conv_param_->input_channel_;
matmul_param_->col_ = conv_param_->output_channel_;
matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM);
matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM);
int row_pack_count = C4NUM;
/* init input sum size */
input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
if (pre_trans_input_) {
input_ptr_ = reinterpret_cast<int8_t *>(
allocator_->Malloc(kNumberTypeInt8, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t), kWorkspace));
MS_CHECK_PTR(input_ptr_);
}
return RET_OK;
}
int Conv2D1x1Int8Coder::InitRunBuf() {
input_sum_ =
reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, input_sum_size_ * sizeof(int32_t), kWorkspace));
MS_CHECK_PTR(input_sum_);
size_t size = MSMAX(UP_ROUND(matmul_param_->row_, C8NUM) * UP_ROUND(matmul_param_->deep_, C4NUM),
UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM));
packed_input_ = reinterpret_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, size * sizeof(int8_t), kWorkspace));
MS_CHECK_PTR(packed_input_);
return RET_OK;
}
} // namespace mindspore::lite::micro::nnacl

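For orientation: a minimal sketch of the inference code DoCode above emits, assuming CodeStruct/CodeBaseStruct expand into C struct literals and CodeFunction into a plain call (that is how the output reads, but the exact rendering belongs to the serializer). All g_* symbols, offsets, and field values are hypothetical placeholders, not actual codegen output:

/* hedged sketch of the generated code; names and values are made up */
ConvParameter conv_param = {0};     /* fields filled in by CodeStruct    */
MatMulParameter matmul_param = {0}; /* fields filled in by CodeStruct    */
Conv1x1Args args = {
    (int32_t *)(g_workspace + 0),   /* input_sum_   (kWorkspace)         */
    g_filter_zp,                    /* filter_zp_ptr_                    */
    g_left_shift, g_right_shift, g_multiplier,
    g_packed_weight, g_bias_data,   /* packed online by Conv1x1Init      */
    (int8_t *)(g_workspace + 512),  /* packed_input_ (kWorkspace)        */
    NULL, NULL, 0, 0,               /* input/output ptrs and thread
                                       strides are set inside Conv1x1Run */
    &conv_param, &matmul_param,
    MatMulInt8_4x16_r,              /* matmul_func_ (non-ARM64 path)     */
    false, false, true};            /* pre_trans_input_, support_optimize_,
                                       filter_peroc_                     */
Conv1x1Run(g_input0, &args, THREAD_POOL_DEFAULT, 1, g_output0);
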
${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h

@@ -0,0 +1,67 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_
#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_
#include "micro/coder/opcoders/base/conv2d_base_coder.h"
#include <memory>
#include <string>
#include <vector>
#include "nnacl/conv_parameter.h"
namespace mindspore::lite::micro::nnacl {
class Conv2D1x1Int8Coder final : public Conv2DBaseCoder {
public:
Conv2D1x1Int8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
const Model::Node *node, size_t node_index, Target target)
: Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
int Prepare(CoderContext *const context) override;
int DoCode(CoderContext *const context) override;
~Conv2D1x1Int8Coder() override = default;
private:
void CheckSupportOptimize();
int InitWeightBias(CoderContext *const context);
int InitFilterPeroc();
int InitParam();
int InitRunBuf();
int32_t *input_sum_{nullptr}; /* per-oc */
int32_t *filter_zp_ptr_{nullptr}; /* per-oc up round */
int32_t *left_shift_{nullptr}; /* per-oc up round */
int32_t *right_shift_{nullptr}; /* per-oc up round */
int32_t *multiplier_{nullptr}; /* per-oc up round */
int8_t *packed_weight_{nullptr};
int32_t *bias_data_{nullptr};
int8_t *packed_input_{nullptr};
int8_t *input_ptr_{nullptr};
int8_t *output_ptr_{nullptr};
size_t input_sum_size_{0};
MatMulParameter *matmul_param_{nullptr};
std::string matmul_func_;
bool pre_trans_input_{false};
bool support_optimize_{false};
bool filter_peroc_{false};
};
} // namespace mindspore::lite::micro::nnacl
#endif // MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_

${MICRO_DIR}/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc

@@ -60,6 +60,16 @@ void NNaclInt8Serializer::CodeStruct(const std::string &name, const ConvParamete
conv_parameter.input_unit_, conv_parameter.output_unit_, conv_parameter.pad_mode_, conv_parameter.act_type_);
}
void NNaclInt8Serializer::CodeStruct(const std::string &name, const MatMulParameter &matmul_parameter) {
CodeBaseStruct("MatMulParameter", name, matmul_parameter.op_parameter_, matmul_parameter.has_bias_,
matmul_parameter.row_, matmul_parameter.col_, matmul_parameter.row_4_, matmul_parameter.row_6_,
matmul_parameter.row_12_, matmul_parameter.row_16_, matmul_parameter.row_align_,
matmul_parameter.col_4_, matmul_parameter.col_8_, matmul_parameter.col_align_, matmul_parameter.deep_,
matmul_parameter.deep_4_, matmul_parameter.deep_16_, matmul_parameter.batch,
matmul_parameter.a_transpose_, matmul_parameter.b_transpose_, matmul_parameter.a_const_,
matmul_parameter.b_const_, matmul_parameter.act_type_);
}
void NNaclInt8Serializer::CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter) {
CodeBaseStruct("ArithmeticParameter", name, arithmetic_parameter.op_parameter_, arithmetic_parameter.broadcasting_,
arithmetic_parameter.ndim_, arithmetic_parameter.activation_type_,

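Assuming CodeBaseStruct renders its arguments as a brace-initialized struct in the generated source (the pattern the conv1x1 coder depends on), the MatMulParameter overload above would emit roughly the following, with the field order shown above and every value made up:

/* hedged sketch of the emitted text, not actual serializer output */
MatMulParameter matmul_param = {
    {0},             /* op_parameter_                    */
    false,           /* has_bias_                        */
    64, 32,          /* row_, col_                       */
    64, 66, 72, 64,  /* row_4_, row_6_, row_12_, row_16_ */
    64,              /* row_align_                       */
    32, 32, 32,      /* col_4_, col_8_, col_align_       */
    16, 16, 16,      /* deep_, deep_4_, deep_16_         */
    1,               /* batch                            */
    false, false,    /* a_transpose_, b_transpose_       */
    false, true,     /* a_const_, b_const_               */
    ActType_No};     /* act_type_                        */
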
${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.c

@@ -0,0 +1,90 @@
/*
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "wrapper/int8/conv1x1_init_int8.h"
#include <stdlib.h>
#include <string.h>
#include "nnacl/int8/matmul_int8.h"
#include "nnacl/errorcode.h"
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
int8_t **packed_weight, int32_t **bias_data) {
if (packed_weight == NULL || bias_data == NULL) {
return NNACL_ERR;
}
#ifdef ENABLE_ARM32
/* InitWeightBiasArm32 */
/* weight */
size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C2NUM) * sizeof(int8_t);
int8_t *packed_weight_ = (int8_t *)(malloc(size));
if (packed_weight_ == NULL) {
return NNACL_ERR;
}
memset(packed_weight_, 0, size);
RowMajor2Row2x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
/* bias */
size = UP_ROUND(output_channel, C2NUM);
int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
if (bias_data_ == NULL) {
free(packed_weight_);
return NNACL_ERR;
}
memset(bias_data_, 0, size * sizeof(int32_t));
if (src_bias != NULL) {
memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
}
#else
/* InitWeightBias */
/* weight */
size_t size = support_optimize ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C16NUM) * sizeof(int8_t)
: UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
int8_t *packed_weight_ = (int8_t *)(malloc(size));
if (packed_weight_ == NULL) {
return NNACL_ERR;
}
memset(packed_weight_, 0, size);
if (support_optimize) {
RowMajor2Row4x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
} else {
RowMajor2Row16x4MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
}
/* bias */
size = support_optimize ? UP_ROUND(output_channel, C16NUM) : UP_ROUND(output_channel, C4NUM);
int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
if (bias_data_ == NULL) {
free(packed_weight_);
return NNACL_ERR;
}
memset(bias_data_, 0, size * sizeof(int32_t));
if (src_bias != NULL) {
memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
}
#endif
/* InitBiasByzp */
/* bias = bias - v2 x zp1 + zp1 x zp2 */
for (int oc = 0; oc < output_channel; oc++) {
int32_t weight_sum_value = 0;
int32_t filter_zp = (filter_peroc) ? filter_zps[oc] : filter_zps[0];
for (int ic = 0; ic < input_channel; ic++) {
weight_sum_value += src_weight[oc * input_channel + ic];
}
bias_data_[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
}
*packed_weight = packed_weight_;
*bias_data = bias_data_;
return NNACL_OK;
}

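The InitBiasByzp loop implements the comment bias = bias - v2 x zp1 + zp1 x zp2: expanding the quantized dot product sum((x - zp_in) * (w - zp_w)) gives sum(x*w) - zp_w*sum(x) - zp_in*sum(w) + ic*zp_in*zp_w. The two weight-only terms are folded into the bias here, while zp_w*sum(x) is supplied at run time through input_sum_. A small self-contained check of that identity (all values arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Check the zero-point folding used by Conv1x1Init for one output channel:
 *   sum((x - zp_in) * (w - zp_w))
 *     == sum(x * w) - zp_w * sum(x)            (computed at run time)
 *      + zp_w * zp_in * ic - sum(w) * zp_in    (folded into the bias)   */
int main(void) {
  enum { IC = 5 };
  const int8_t x[IC] = {3, -7, 20, 0, -1};  /* arbitrary input row   */
  const int8_t w[IC] = {10, -4, 6, 2, -9};  /* arbitrary 1x1 filter  */
  const int32_t zp_in = 5, zp_w = -2;       /* arbitrary zero points */
  int32_t ref = 0, raw = 0, sum_x = 0, sum_w = 0;
  for (int i = 0; i < IC; i++) {
    ref += (x[i] - zp_in) * (w[i] - zp_w);
    raw += x[i] * w[i];
    sum_x += x[i];
    sum_w += w[i];
  }
  const int32_t bias_fold = zp_w * zp_in * IC - sum_w * zp_in;
  const int32_t runtime = raw - zp_w * sum_x + bias_fold;
  printf("direct=%d folded=%d\n", ref, runtime); /* prints equal values */
  return 0;
}
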
${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.h

@@ -0,0 +1,28 @@
/*
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
#define MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
#include <stdint.h>
#include <stdbool.h>
#include "nnacl/conv_parameter.h"
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
int8_t **packed_weight, int32_t **bias_data);
#endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_

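InitWeightBias in the coder emits a call to this entry point into the generated init segment. Assuming CodeFunctionWithCheck wraps the call in an error check, the output would read roughly like this; all g_* symbols, shapes, and zero points are hypothetical:

/* hedged sketch of the emitted init code, per-tensor quant case */
int32_t filter_zp[1] = {-2};
if (Conv1x1Init(g_weight, g_bias, filter_zp,
                64 /* input_channel */, 128 /* output_channel */,
                5 /* input_zp */, GetSupportOptFlag() /* ARM64 */,
                false /* filter_peroc */,
                (int8_t **)&g_packed_weight,
                (int32_t **)&g_bias_data) != NNACL_OK) {
  return RET_ERROR; /* assumed error path of the generated code */
}
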
${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.c

@@ -0,0 +1,224 @@
/*
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "wrapper/int8/conv1x1_run_int8.h"
#include "nnacl/base/conv1x1_base.h"
#include "nnacl/int8/matmul_int8.h"
#include "nnacl/int8/pack_int8.h"
#include "nnacl/int8/conv1x1_int8.h"
#include "nnacl/errorcode.h"
void Pre1x1Trans(Conv1x1Args *args, int8_t *src_input, int8_t *src_output) {
args->output_ptr_ = src_output;
if (args->pre_trans_input_) {
Conv1x1InputPack(src_input, args->input_ptr_, args->conv_param_, sizeof(int8_t));
} else {
args->input_ptr_ = src_input;
}
}
int OcOptPre(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
int cur_stride = args->thread_stride_hw_ * C4NUM;
int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
int cur_hw = MSMIN(cur_stride, res_stride);
if (cur_hw <= 0) {
return NNACL_OK;
}
int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
if (args->filter_peroc_) {
PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
} else {
PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
}
return NNACL_OK;
}
int RunArm64OptOc(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
int stride = args->thread_stride_oc_ * C16NUM;
int cur_stride = task_id * stride;
int res_stride = args->matmul_param_->col_ - cur_stride;
int cur_oc = MSMIN(stride, res_stride);
if (cur_oc <= 0) {
return NNACL_OK;
}
bool filter_peroc = args->filter_peroc_;
int32_t *cur_left_shift =
filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
int32_t *cur_right_shift =
filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
int32_t *cur_multiplier =
filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
Conv1x1Int8Opt(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_4_,
args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_4_, cur_left_shift, cur_right_shift,
cur_multiplier, args->conv_param_, args->matmul_func_, cur_zp);
return NNACL_OK;
}
int RunArmOc(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
#ifdef ENABLE_ARM32
int col_tile = C2NUM;
#else
int col_tile = C4NUM;
#endif
int stride = args->thread_stride_oc_ * col_tile;
int cur_stride = task_id * stride;
int res_stride = args->matmul_param_->col_ - cur_stride;
int cur_oc = MSMIN(stride, res_stride);
if (cur_oc <= 0) {
return NNACL_OK;
}
bool filter_peroc = args->filter_peroc_;
int32_t *cur_left_shift =
filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
int32_t *cur_right_shift =
filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
int32_t *cur_multiplier =
filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
Conv1x1Int8(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_16_,
args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_16_, cur_left_shift, cur_right_shift,
cur_multiplier, args->conv_param_, cur_zp);
return NNACL_OK;
}
int RunArm64OptHw(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
int cur_stride = args->thread_stride_hw_ * C4NUM;
int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
int cur_hw = MSMIN(cur_stride, res_stride);
if (cur_hw <= 0) {
return NNACL_OK;
}
int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
if (args->filter_peroc_) {
PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
} else {
PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
}
Conv1x1Int8Opt(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
args->matmul_param_->col_, args->matmul_param_->deep_4_, args->left_shift_, args->right_shift_,
args->multiplier_, args->conv_param_, args->matmul_func_, args->filter_zp_ptr_);
return NNACL_OK;
}
int RunArmHw(void *cdata, int task_id) {
Conv1x1Args *args = (Conv1x1Args *)(cdata);
int cur_stride = args->thread_stride_hw_ * C4NUM;
int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
int cur_hw = MSMIN(cur_stride, res_stride);
if (cur_hw <= 0) {
return NNACL_OK;
}
int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
int8_t *hw_packed_in =
args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_16_;
int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
RowMajor2Row16x4MajorInt8(hw_in, hw_packed_in, cur_hw, args->matmul_param_->deep_);
if (args->filter_peroc_) {
PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, 1, UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
} else {
PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
}
Conv1x1Int8(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
args->matmul_param_->col_, args->matmul_param_->deep_16_, args->left_shift_, args->right_shift_,
args->multiplier_, args->conv_param_, args->filter_zp_ptr_);
return NNACL_OK;
}
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out) {
int row_pack_count = C4NUM;
int col_pack_count;
#ifdef ENABLE_ARM32
col_pack_count = C2NUM;
#else
if (args->support_optimize_) {
col_pack_count = C16NUM;
} else {
col_pack_count = C4NUM;
}
#endif
int hw_thread_count = UP_DIV(args->matmul_param_->row_, row_pack_count);
int oc_thread_count = UP_DIV(args->matmul_param_->col_, col_pack_count);
size_t thread_count_hw = MSMIN(thread_num, hw_thread_count);
args->thread_stride_hw_ = UP_DIV(hw_thread_count, thread_count_hw);
size_t thread_count_oc = MSMIN(thread_num, oc_thread_count);
args->thread_stride_oc_ = UP_DIV(oc_thread_count, thread_count_oc);
bool parallel_by_oc = oc_thread_count > thread_num;
for (int batch_index = 0; batch_index < args->conv_param_->input_batch_; batch_index++) {
Pre1x1Trans(args,
src_in + batch_index * args->conv_param_->input_h_ * args->conv_param_->input_w_ *
args->conv_param_->input_channel_,
src_out + batch_index * args->matmul_param_->row_ * args->matmul_param_->col_);
if (parallel_by_oc) {
/* input transpose and input sum */
if (args->support_optimize_) {
ParallelLaunch(thread_pool, OcOptPre, args, thread_count_hw);
} else {
RowMajor2Row16x4MajorInt8(args->input_ptr_, args->packed_input_, args->matmul_param_->row_,
args->matmul_param_->deep_);
if (args->filter_peroc_) {
PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_, 1, args->matmul_param_->row_4_,
args->matmul_param_->deep_16_);
} else {
PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_,
args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
args->matmul_param_->row_4_, args->matmul_param_->deep_16_);
}
}
/* matmul parallel by oc */
if (args->support_optimize_) {
ParallelLaunch(thread_pool, RunArm64OptOc, args, thread_count_oc);
} else {
ParallelLaunch(thread_pool, RunArmOc, args, thread_count_oc);
}
} else {
/* matmul parallel by hw */
if (args->support_optimize_) {
ParallelLaunch(thread_pool, RunArm64OptHw, args, thread_count_hw);
} else {
ParallelLaunch(thread_pool, RunArmHw, args, thread_count_hw);
}
}
}
}

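Conv1x1Run picks a parallel axis (output channels when there are more oc tiles than threads, otherwise output pixels), and each task then derives its slice from thread_stride_* exactly as the RunArm* callbacks above do: stride = tiles_per_task * tile_size, clamped by what is left. A standalone sketch of that arithmetic with made-up sizes:

#include <stdio.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  const int col = 37, col_tile = 16, thread_num = 4; /* hypothetical sizes */
  int oc_tiles = UP_DIV(col, col_tile);       /* 3 tiles of 16 channels    */
  int threads = MSMIN(thread_num, oc_tiles);  /* no more tasks than tiles  */
  int stride_oc = UP_DIV(oc_tiles, threads);  /* tiles per task            */
  for (int task_id = 0; task_id < threads; task_id++) {
    int start = task_id * stride_oc * col_tile;
    int cur_oc = MSMIN(stride_oc * col_tile, col - start);
    if (cur_oc <= 0) continue;                /* trailing tasks may be empty */
    printf("task %d: channels [%d, %d)\n", task_id, start, start + cur_oc);
  }
  return 0; /* prints [0,16) [16,32) [32,37), matching RunArm64OptOc */
}
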
${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.h

@@ -0,0 +1,49 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_
#define MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_
#include <stdint.h>
#include <stdbool.h>
#include "nnacl/conv_parameter.h"
#include "nnacl/matmul_parameter.h"
#include "src/runtime/thread_pool.h"
typedef struct {
int32_t *input_sum_; /* per-oc */
int32_t *filter_zp_ptr_; /* per-oc up round */
int32_t *left_shift_; /* per-oc up round */
int32_t *right_shift_; /* per-oc up round */
int32_t *multiplier_; /* per-oc up round */
int8_t *packed_weight_;
int32_t *bias_data_;
int8_t *packed_input_;
int8_t *input_ptr_;
int8_t *output_ptr_;
size_t thread_stride_hw_;
size_t thread_stride_oc_;
ConvParameter *conv_param_;
MatMulParameter *matmul_param_;
MATMUL_OPT_DP_FUNC matmul_func_;
bool pre_trans_input_;
bool support_optimize_;
bool filter_peroc_;
} Conv1x1Args;
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out);
#endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_