From a3cc26ffcc1559e8cbc5a4834e6323510928d097 Mon Sep 17 00:00:00 2001
From: ling
Date: Fri, 11 Sep 2020 14:56:36 +0800
Subject: [PATCH] [MSLITE][Develop]int8 conv1x1 support arm32

---
 mindspore/lite/nnacl/int8/matmul_int8.c       |  15 ++
 mindspore/lite/nnacl/int8/matmul_int8.h       |   3 +-
 mindspore/lite/nnacl/matmul_parameter.h       |   1 +
 mindspore/lite/nnacl/op_base.h                |   1 +
 .../kernel/arm/int8/convolution_1x1_int8.cc   | 132 +++++++++++++-----
 .../kernel/arm/int8/convolution_1x1_int8.h    |   2 +
 6 files changed, 117 insertions(+), 37 deletions(-)

diff --git a/mindspore/lite/nnacl/int8/matmul_int8.c b/mindspore/lite/nnacl/int8/matmul_int8.c
index 1e1241712c8..33da1b4ed76 100644
--- a/mindspore/lite/nnacl/int8/matmul_int8.c
+++ b/mindspore/lite/nnacl/int8/matmul_int8.c
@@ -43,6 +43,21 @@ void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
   }
 }
 
+void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
+  int col16 = UP_ROUND(col, C16NUM);
+  for (int r = 0; r < row; r++) {
+    int rd2 = r / C2NUM;
+    int rm2 = r % C2NUM;
+    for (int c = 0; c < col; c++) {
+      int cd16 = c / C16NUM;
+      int cm16 = c % C16NUM;
+      int dst_index = rd2 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm2 * C16NUM + cm16;
+      int src_index = r * col + c;
+      dst_ptr[dst_index] = src_ptr[src_index];
+    }
+  }
+}
+
 void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
   int col4 = UP_ROUND(col, C4NUM);
   for (int r = 0; r < row; r++) {
diff --git a/mindspore/lite/nnacl/int8/matmul_int8.h b/mindspore/lite/nnacl/int8/matmul_int8.h
index fe20548b8d4..11dd3a66c05 100644
--- a/mindspore/lite/nnacl/int8/matmul_int8.h
+++ b/mindspore/lite/nnacl/int8/matmul_int8.h
@@ -42,7 +42,6 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
                       bool per_channel);
 void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void RowMajor2Row4x8MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
-
 void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16);
 void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16);
 void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order);
@@ -52,6 +51,8 @@ void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums, const int *bias, int act_min,
                 int act_max, int out_zp, int multiplier, int left_shift, int right_shift, int row, int col, int deep16,
                 int stride);
 
+void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
+
 #ifdef ENABLE_ARM64
 void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
                       const int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift,
diff --git a/mindspore/lite/nnacl/matmul_parameter.h b/mindspore/lite/nnacl/matmul_parameter.h
index 7be90402c8f..8f6b562974c 100644
--- a/mindspore/lite/nnacl/matmul_parameter.h
+++ b/mindspore/lite/nnacl/matmul_parameter.h
@@ -39,6 +39,7 @@ typedef struct MatMulParameter {
   int row_8_;
   int row_12_;
   int row_16_;
+  int col_2_;
   int col_4_;
   int col_8_;
   int deep_;
diff --git a/mindspore/lite/nnacl/op_base.h b/mindspore/lite/nnacl/op_base.h
index e5bf293ed31..b080dfd3341 100644
--- a/mindspore/lite/nnacl/op_base.h
+++ b/mindspore/lite/nnacl/op_base.h
@@ -21,6 +21,7 @@
 #include
 #include
 
+#define C2NUM 2
 #define C4NUM 4
 #define C8NUM 8
 #define C12NUM 12
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
index d4ac9fcdbc8..0acd324d2a5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@@ -86,44 +86,10 @@ void Convolution1x1Int8CPUKernel::CheckSupportOptimize() {
   return;
 }
 
-int Convolution1x1Int8CPUKernel::InitWeightBias() {
-  auto filter_tensor = in_tensors_.at(kWeightIndex);
-  auto input_channel = filter_tensor->Channel();
-  auto output_channel = filter_tensor->Batch();
-
-  /* weight */
-  size_t size = support_optimize_ ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C8NUM) * sizeof(int8_t)
-                                  : UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
-  packed_weight_ = reinterpret_cast<int8_t *>(malloc(size));
-  if (packed_weight_ == nullptr) {
-    MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!";
-    return RET_ERROR;
-  }
-  memset(packed_weight_, 0, size);
-  if (support_optimize_) {
-    RowMajor2Row8x4MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
-                             input_channel);
-  } else {
-    RowMajor2Row4x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
-                              input_channel);
-  }
-
+int Convolution1x1Int8CPUKernel::InitBiasByzp(void *src_weight, int input_channel, int output_channel) {
   /* bias = bias - v2 x zp1 + zp1 x zp2 */
-  int col4 = UP_ROUND(output_channel, C4NUM);
-  int col8 = UP_ROUND(output_channel, C8NUM);
-  size = support_optimize_ ? col8 * sizeof(int32_t) : col4 * sizeof(int32_t);
-  bias_data_ = malloc(size);
-  if (bias_data_ == nullptr) {
-    MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!";
-    return RET_ERROR;
-  }
-  memset(bias_data_, 0, size);
-  if (in_tensors_.size() == 3) {
-    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
-  }
-
   int32_t *bias_data = reinterpret_cast<int32_t *>(bias_data_);
-  int8_t *weight = reinterpret_cast<int8_t *>(filter_tensor->MutableData());
+  int8_t *weight = reinterpret_cast<int8_t *>(src_weight);
   int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
   for (int oc = 0; oc < output_channel; oc++) {
     int32_t weight_sum_value = 0;
@@ -147,6 +113,77 @@ int Convolution1x1Int8CPUKernel::InitWeightBias() {
   return RET_OK;
 }
 
+int Convolution1x1Int8CPUKernel::InitWeightBias() {
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+
+  /* weight */
+  size_t size = support_optimize_ ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C8NUM) * sizeof(int8_t)
+                                  : UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
+  packed_weight_ = reinterpret_cast<int8_t *>(malloc(size));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!";
+    return RET_ERROR;
+  }
+  memset(packed_weight_, 0, size);
+  if (support_optimize_) {
+    RowMajor2Row8x4MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
+                             input_channel);
+  } else {
+    RowMajor2Row4x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
+                              input_channel);
+  }
+
+  int col4 = UP_ROUND(output_channel, C4NUM);
+  int col8 = UP_ROUND(output_channel, C8NUM);
+  size = support_optimize_ ? col8 * sizeof(int32_t) : col4 * sizeof(int32_t);
+  bias_data_ = malloc(size);
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!";
+    return RET_ERROR;
+  }
+  memset(bias_data_, 0, size);
+  if (in_tensors_.size() == 3) {
+    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
+  }
+
+  InitBiasByzp(filter_tensor->MutableData(), input_channel, output_channel);
+  return RET_OK;
+}
+
+int Convolution1x1Int8CPUKernel::InitWeightBiasArm32() {
+  auto filter_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = filter_tensor->Channel();
+  auto output_channel = filter_tensor->Batch();
+
+  /* weight */
+  size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C2NUM) * sizeof(int8_t);
+  packed_weight_ = reinterpret_cast<int8_t *>(malloc(size));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc weight error!";
+    return RET_ERROR;
+  }
+  memset(packed_weight_, 0, size);
+  RowMajor2Row2x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
+                            input_channel);
+
+  /* bias */
+  int col2 = UP_ROUND(output_channel, C2NUM);
+  bias_data_ = malloc(col2 * sizeof(int32_t));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc bias_ptr_ error!";
+    return RET_ERROR;
+  }
+  memset(bias_data_, 0, col2 * sizeof(int32_t));
+  if (in_tensors_.size() == 3) {
+    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
+  }
+
+  InitBiasByzp(filter_tensor->MutableData(), input_channel, output_channel);
+  return RET_OK;
+}
+
 int Convolution1x1Int8CPUKernel::Init() {
   matmul_param_ = new (std::nothrow) MatMulParameter();
   if (matmul_param_ == nullptr) {
@@ -164,7 +201,11 @@ int Convolution1x1Int8CPUKernel::Init() {
 
   CheckSupportOptimize();
 
+#ifdef ENABLE_ARM32
+  ret = InitWeightBiasArm32();
+#else
   ret = InitWeightBias();
+#endif
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
     return ret;
@@ -183,6 +224,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
   matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
   matmul_param_->deep_ = conv_param_->input_channel_;
   matmul_param_->col_ = conv_param_->output_channel_;
+  matmul_param_->col_2_ = UP_ROUND(matmul_param_->col_, C2NUM);
   matmul_param_->col_4_ = UP_ROUND(matmul_param_->col_, C4NUM);
   matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
   matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
@@ -192,6 +234,10 @@ int Convolution1x1Int8CPUKernel::InitParam() {
 
   int row_pack_count = 0;
   int col_pack_count = 0;
+#ifdef ENABLE_ARM32
+  row_pack_count = C4NUM;
+  col_pack_count = C2NUM;
+#else
   if (support_optimize_) {
     row_pack_count = C8NUM;
     col_pack_count = C8NUM;
@@ -199,6 +245,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
     row_pack_count = C4NUM;
     col_pack_count = C4NUM;
   }
+#endif
 
   /* init input sum size */
   if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
@@ -260,6 +307,18 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
   int32_t *cur_right_shift = conv_param_->conv_quant_arg_.right_shift_;
   int32_t *cur_multiplier = conv_param_->conv_quant_arg_.quant_multiplier_;
 
+#ifdef ENABLE_ARM32
+  int cur_stride = thread_stride_ * C2NUM;
+  int res_stride = matmul_param_->col_ - task_id * thread_stride_ * C2NUM;
+  int cur_oc = MSMIN(cur_stride, res_stride);
+  if (cur_oc <= 0) {
+    return RET_OK;
+  }
+  Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_,
+              output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum,
+              reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_, cur_oc,
+              matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
+#else
   if (support_optimize_) {
     int cur_stride = thread_stride_ * C8NUM;
     int res_stride = matmul_param_->col_ - task_id * thread_stride_ * C8NUM;
@@ -296,6 +355,7 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
                 reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C4NUM, matmul_param_->row_, cur_oc,
                 matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
   }
+#endif
 
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
index 342aa3eff71..d8a57f2439b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h
@@ -52,8 +52,10 @@ class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
   void FreeResizeBuf();
   int InitParam();
   int InitWeightBias();
+  int InitWeightBiasArm32();
   void Pre1x1Trans(int8_t *src_input, int8_t *src_output);
   void CheckSupportOptimize();
+  int InitBiasByzp(void *src_weight, int input_channel, int output_channel);
 
  private:
   int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */
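
Note on the 2x16 weight layout introduced by this patch: RowMajor2Row2x16MajorInt8 packs the row-major weight matrix (row = output channels, col = input channels) into tiles of 2 rows by 16 columns; rows are grouped in pairs, the depth dimension is padded up to a multiple of 16, and within each tile the 16 values of the first row precede the 16 values of the second. This matches the deep_16_ stride and col_2_ padding used by the arm32 path in RunImpl. The standalone C sketch below reproduces the patch's index arithmetic so it can be checked in isolation; the main() driver and the toy ROW/COL sizes are illustrative assumptions, not part of the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define C2NUM 2
#define C16NUM 16
#define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))

/* Same index arithmetic as the patch: pack a row-major (row x col) int8
 * matrix into row-pair / 16-column tiles. */
void RowMajor2Row2x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  int col16 = UP_ROUND(col, C16NUM);
  for (int r = 0; r < row; r++) {
    int rd2 = r / C2NUM;
    int rm2 = r % C2NUM;
    for (int c = 0; c < col; c++) {
      int cd16 = c / C16NUM;
      int cm16 = c % C16NUM;
      /* tile origin: rd2 * (2 rows * col16) + cd16 * (2 * 16); inside a tile,
       * row 0's 16 values come first, then row 1's */
      dst_ptr[rd2 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm2 * C16NUM + cm16] = src_ptr[r * col + c];
    }
  }
}

int main(void) {
  enum { ROW = 3, COL = 20 }; /* e.g. 3 output channels, 20 input channels */
  int8_t src[ROW * COL];
  int8_t dst[UP_ROUND(ROW, C2NUM) * UP_ROUND(COL, C16NUM)];
  for (int i = 0; i < ROW * COL; i++) src[i] = (int8_t)i;
  memset(dst, 0, sizeof(dst)); /* padding stays zero, like the memset in InitWeightBiasArm32 */
  RowMajor2Row2x16MajorInt8(src, dst, ROW, COL);
  /* element (r=1, c=17): row pair 0, slot rm2=1; column block 1, offset 1
   * -> dst index 0*64 + 1*32 + 1*16 + 1 = 49; prints src(1,17)=37 dst[49]=37 */
  printf("src(1,17)=%d dst[49]=%d\n", src[1 * COL + 17], dst[49]);
  return 0;
}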
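A note on the terse comment /* bias = bias - v2 x zp1 + zp1 x zp2 */ in InitBiasByzp: it rests on the identity sum_k (a_k - zp1)(w_k - zp2) = sum_k a_k*w_k - zp2*sum(a) - zp1*sum(w) + deep*zp1*zp2. The last two terms depend only on the weights, so they can be folded into the bias once at init time (v2 being the per-output-channel weight sum); the -zp2*sum(a) term is applied at run time through the input sums (cf. CalcInputSums in matmul_int8.h). A toy check of that identity, with made-up values that are not from the patch:

#include <stdio.h>

int main(void) {
  const int deep = 4, zp1 = 5, zp2 = 3; /* zp1: input zero point, zp2: weight zero point */
  const int a[] = {12, -7, 100, 3};     /* quantized activations (illustrative) */
  const int w[] = {-5, 9, 1, -20};      /* quantized weights for one output channel */

  int real = 0, raw = 0, a_sum = 0, w_sum = 0;
  for (int k = 0; k < deep; k++) {
    real += (a[k] - zp1) * (w[k] - zp2); /* the product the kernel must effectively compute */
    raw += a[k] * w[k];                  /* what the int8 matmul actually accumulates */
    a_sum += a[k];
    w_sum += w[k];
  }
  int bias_fold = -zp1 * w_sum + deep * zp1 * zp2; /* folded at init: "- v2 x zp1 + zp1 x zp2" */
  int input_fold = -zp2 * a_sum;                   /* applied at run time via the input sums */
  printf("%d == %d\n", real, raw + bias_fold + input_fold); /* both sides agree */
  return 0;
}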