!30296 fix DynamicMatmul4x4x16AIWI C++ kernel, copy-bias bug and float_mode bug
Merge pull request !30296 from yeyunpeng2020/dynamic_quant
commit 56a90003c4
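In short: the portable C fallback DynamicMatmul4x4x16AIWI is rewritten to share the ARM64 SDOT kernel's interface and zero-point math, CopyBias now takes an owned copy of the bias tensor instead of aliasing its buffer, and the float_mode downgrade of fully quantized nodes to weight-quantized moves from FindCpuKernel into InferNodeShape.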
@ -20,7 +20,7 @@
 // void DynamicMatmulSdot4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
 //                                  float *bias, size_t row, size_t col, size_t stride, const int *a_sums,
-//                                  const int *b_sums, size_t a_zp, size_t b_zp, size_t deep);
+//                                  const int *b_sums, int64_t a_zp, int64_t b_zp_sum);
 // x0: a(left matrix ptr)
 // x1: b(right matrix ptr)
 // x2: out ptr
@ -33,7 +33,7 @@
 // x9: a_sums
 // x10: b_sums
 // x19/w19: a_zp
-// x19/w20: b_zp
+// x19/w20: b_zp_sum

 asm_function DynamicMatmulSdot4x4x16AIWI
 sub sp, sp, #144

@ -17,30 +17,31 @@
 #include "nnacl/int8/dynamic_matmul_int8.h"
 #include "nnacl/int8/fixed_point.h"

-void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col,
-                             int deep4, size_t stride, float input_scale, const float *filter_scale,
-                             bool filter_per_channel) {
+void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
+                             float *bias, size_t row, size_t col, size_t stride, const int *a_sums, const int *b_sums,
+                             int64_t a_zp, int64_t b_zp_sum) {
   /* *
    * row4x4-major * row4x16-major => (int8)row-major
    * support activation per-layer symmetric && weight per-layer/per-channel symmetric
    * */
   for (int r = 0; r < row; r++) {
+    int64_t s2 = a_sums[r] * b_zp_sum;
     for (int c = 0; c < col; c++) {
       int r4div = r / C4NUM, r4mod = r % C4NUM;
       int c16div = c / C16NUM, c16mod = c % C16NUM;
-      int32_t value = 0;
+      int32_t s1 = 0;
       for (int d = 0; d < deep4; d++) {
         int d4div = d / C4NUM, d4mod = d % C4NUM;
         size_t ai = r4div * deep4 * C4NUM + d4div * C4NUM * C4NUM + r4mod * C4NUM + d4mod;
         size_t bi = c16div * deep4 * C16NUM + d4div * C4NUM * C16NUM + c16mod * C4NUM + d4mod;
-        value += a[ai] * b[bi];
+        s1 += a[ai] * b[bi];
       }
-      int filter_quant_index = filter_per_channel ? c : 0;
-      double multi_scale = input_scale * filter_scale[filter_quant_index];
-      size_t ci = r * stride + c;
-      dst[ci] = multi_scale * value;
+      int64_t s3 = b_sums[c] * a_zp;
+      int64_t s4 = a_zp * b_zp_sum;
+      size_t ci = r * stride / sizeof(float) + c;
+      out[ci] = multi_scales[c] * (s1 - s2 - s3 + s4);
       if (bias != NULL) {
-        dst[ci] += bias[c];
+        out[ci] += bias[c];
       }
     }
   }
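The rewritten fallback mirrors the SDOT kernel's dequantization: s1 is the raw int8 dot product, s3 and s4 are the a_zp correction terms of the standard zero-point expansion, and s2 pairs the precomputed row sums with b_zp_sum (the call site below passes filter_zp * deep for it). Note stride is now a byte stride, hence the / sizeof(float) when indexing out. A minimal standalone check of the underlying identity — illustrative only, using plain a_zp/b_zp and toy data rather than the kernel's folded b_zp_sum:

#include <assert.h>
#include <stdint.h>

// Verifies: sum((a - a_zp) * (b - b_zp)) == s1 - s2 - s3 + s4, where
//   s1 = sum(a*b), s2 = b_zp*sum(a), s3 = a_zp*sum(b), s4 = deep*a_zp*b_zp.
int main(void) {
  const int deep = 4;
  const int8_t a[4] = {1, -2, 3, 4};
  const int8_t b[4] = {5, 6, -7, 8};
  const int64_t a_zp = 3, b_zp = -1;
  int64_t lhs = 0, s1 = 0, a_sum = 0, b_sum = 0;
  for (int d = 0; d < deep; d++) {
    lhs += (int64_t)(a[d] - a_zp) * (b[d] - b_zp);  // exact dequantized product
    s1 += (int64_t)a[d] * b[d];                     // raw int8 dot product
    a_sum += a[d];
    b_sum += b[d];
  }
  const int64_t s2 = b_zp * a_sum;       // b zero-point correction
  const int64_t s3 = a_zp * b_sum;       // a zero-point correction
  const int64_t s4 = deep * a_zp * b_zp; // cross term
  assert(lhs == s1 - s2 - s3 + s4);
  return 0;
}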
@ -74,7 +75,7 @@ void DynamicMatmul4x16x4AIWI(const int8_t *a, const int8_t *b, const float *bias
       }
       value = s0 - s1 - s2 + s3;
       int filter_quant_index = filter_per_channel ? c : 0;
-      double multi_scale = input_scale * filter_scale[filter_quant_index];
+      float multi_scale = input_scale * filter_scale[filter_quant_index];
       size_t ci = r * stride + c;
       dst[ci] = multi_scale * value;
       if (bias != NULL) {
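The double → float change costs no output precision (dst is float either way) and presumably keeps this scalar path consistent with the float32 math of the assembly kernels.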
@ -34,12 +34,11 @@ void CalcPartWeightSums(const int8_t *weight, int row, int stride, int cur_col,
 #ifdef ENABLE_ARM64
 void DynamicMatmulSdot4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
                                  float *bias, size_t row, size_t col, size_t stride, const int *a_sums,
-                                 const int *b_sums, int64_t a_zp, int64_t b_zp);
-#else
-void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col,
-                             int deep4, size_t stride, float input_scale, const float *filter_scale,
-                             bool filter_per_channel);
+                                 const int *b_sums, int64_t a_zp, int64_t b_zp_sum);
 #endif
+void DynamicMatmul4x4x16AIWI(const int8_t *a, const int8_t *b, float *out, size_t deep4, float *multi_scales,
+                             float *bias, size_t row, size_t col, size_t stride, const int *a_sums, const int *b_sums,
+                             int64_t a_zp, int64_t b_zp_sum);
 #ifdef __cplusplus
 }
 #endif
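With the old per-channel-scale fallback declaration gone, the SDOT assembly kernel and the C fallback now share one signature, so a call site can switch implementations with a preprocessor guard alone (see the MatMulDynamicArm64SdotImpl hunk below).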
@ -99,8 +99,16 @@ int MatmulDynamicBaseInt8CPUKernel::InitFilterQuantParam() {
   return RET_OK;
 }

-void MatmulDynamicBaseInt8CPUKernel::ResizeParameter() {
-  param_->row_align_ = UP_ROUND(param_->row_, row_tile_);
+void MatmulDynamicBaseInt8CPUKernel::ResizeMatrixBParameter() {
+  auto w_shape = in_tensors_.at(kWeightIndex)->shape();
+  int batch = 1;
+  for (size_t i = 0; i < w_shape.size() - kSize2; ++i) {
+    batch *= w_shape[i];
+  }
+  param_->batch = batch;
+  param_->col_ = param_->b_transpose_ ? w_shape[w_shape.size() - kSize2] : w_shape[w_shape.size() - kSize1];
+  param_->deep_ = param_->b_transpose_ ? w_shape[w_shape.size() - kSize1] : w_shape[w_shape.size() - kSize2];

   param_->col_align_ = UP_ROUND(param_->col_, col_tile_);
   param_->deep_align_ = UP_ROUND(param_->deep_, deep_tile_);
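ResizeParameter() is effectively split here: everything derived from the weight tensor (batch, col_, deep_ and their aligned sizes) moves into ResizeMatrixBParameter(), which can therefore run in Prepare() for const weights, before the input shapes are known.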
@ -126,6 +134,10 @@ void MatmulDynamicBaseInt8CPUKernel::FreeTmpBuffer() {
     free(weight_sums_);
     weight_sums_ = nullptr;
   }
+  if (fp32_bias_ptr_ != nullptr) {
+    free(fp32_bias_ptr_);
+    fp32_bias_ptr_ = nullptr;
+  }
   return;
 }

@ -143,8 +155,6 @@ int MatmulDynamicBaseInt8CPUKernel::InitInputQuantParam() {
 int MatmulDynamicBaseInt8CPUKernel::TransferB() {
   auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->data());
   CHECK_NULL_RETURN(weight_data);
-  memset(pack_b_ptr_, quant_param_->filter_zp_[0],
-         param_->batch * param_->col_align_ * param_->deep_align_ * sizeof(int8_t));
   for (int i = 0; i < param_->batch; i++) {
     auto current_weight = weight_data + i * param_->deep_ * param_->col_;
     auto current_b_pack = pack_b_ptr_ + i * param_->col_align_ * param_->deep_align_;
@ -161,31 +171,51 @@ int MatmulDynamicBaseInt8CPUKernel::TransferB() {
   return RET_OK;
 }

-int MatmulDynamicBaseInt8CPUKernel::InitTmpBuffer() {
+int MatmulDynamicBaseInt8CPUKernel::InitMatrixABuffer() {
+  if (pack_a_ptr_ != nullptr) {
+    delete pack_a_ptr_;
+    pack_a_ptr_ = nullptr;
+  }
   pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(param_->row_align_ * param_->deep_align_ * sizeof(int8_t)));
   if (pack_a_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_ERROR;
   }
+  if (input_sums_ != nullptr) {
+    delete input_sums_;
+    input_sums_ = nullptr;
+  }
+  input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int)));
+  if (input_sums_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_ERROR;
+  }
+  memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_align_ * sizeof(int8_t));
+  memset(input_sums_, 0, param_->row_align_ * sizeof(int));
+  return RET_OK;
+}
+
+int MatmulDynamicBaseInt8CPUKernel::InitMatrixBBuffer() {
+  if (pack_b_ptr_ != nullptr) {
+    delete pack_b_ptr_;
+    pack_b_ptr_ = nullptr;
+  }
   pack_b_ptr_ =
     reinterpret_cast<int8_t *>(malloc(param_->batch * param_->col_align_ * param_->deep_align_ * sizeof(int8_t)));
   if (pack_b_ptr_ == nullptr) {
     FreeTmpBuffer();
     return RET_ERROR;
   }
-  input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int)));
-  if (input_sums_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_ERROR;
+  if (weight_sums_ != nullptr) {
+    delete weight_sums_;
+    weight_sums_ = nullptr;
   }
   weight_sums_ = reinterpret_cast<int *>(malloc(param_->batch * param_->col_align_ * sizeof(int)));
   if (weight_sums_ == nullptr) {
     FreeTmpBuffer();
     return RET_ERROR;
   }
-  memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_align_ * sizeof(int8_t));
   memset(pack_b_ptr_, 0, param_->batch * param_->col_align_ * param_->deep_align_ * sizeof(int8_t));
-  memset(input_sums_, 0, param_->row_align_ * sizeof(int));
   memset(weight_sums_, 0, param_->batch * param_->col_align_ * sizeof(int));
   return RET_OK;
 }
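InitTmpBuffer() is split along the same A/B line: InitMatrixABuffer() allocates only what depends on the input shape (pack_a_ptr_, input_sums_), InitMatrixBBuffer() only what depends on the weight shape (pack_b_ptr_, weight_sums_), and each frees any previous allocation first so repeated ReSize() calls don't leak.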
@ -193,7 +223,7 @@ int MatmulDynamicBaseInt8CPUKernel::InitTmpBuffer() {
 int MatmulDynamicBaseInt8CPUKernel::CopyBias() {
   if (in_tensors_.size() == kHasBiasSize) {
     auto bias_tensor = in_tensors_[kBiasIndex];
-    fp32_bias_ptr_ = reinterpret_cast<float *>(bias_tensor->data());
+    fp32_bias_ptr_ = static_cast<float *>(malloc(bias_tensor->Size()));
     if (fp32_bias_ptr_ == nullptr) {
       MS_LOG(ERROR) << "Memory allocation failed";
       FreeTmpBuffer();
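This is the copy-bias bug from the title, as far as the diff shows: fp32_bias_ptr_ used to alias the bias tensor's own buffer, yet FreeTmpBuffer() (above) now calls free() on it — freeing memory the kernel doesn't own. Allocating an owned copy makes that free well defined; presumably the tensor data is copied into it just past this hunk.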
@ -216,12 +246,25 @@ int MatmulDynamicBaseInt8CPUKernel::Prepare() {
     return ret;
   }
   if (param_->b_const_) {
+    ResizeMatrixBParameter();
     ret = InitFilterQuantParam();
     if (ret != RET_OK) {
       FreeQuantParam();
       return ret;
     }
+    ret = InitMatrixBBuffer();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
+
+    ret = TransferB();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
   }

   ret = CopyBias();
   if (ret != RET_OK) {
     FreeQuantParam();
@ -234,30 +277,27 @@ int MatmulDynamicBaseInt8CPUKernel::Prepare() {
 }

 int MatmulDynamicBaseInt8CPUKernel::ReSize() {
   int batch = 1;
   auto x_shape = in_tensors_.at(0)->shape();
   auto o_shape = out_tensors_.at(0)->shape();
   MS_ASSERT(x_shape.size() >= kSize2);
   for (size_t i = 0; i < x_shape.size() - kSize2; ++i) {
     batch *= x_shape[i];
   }
   param_->batch = batch;
   MS_ASSERT(o_shape.size() >= kSize2);
   param_->row_ = o_shape[o_shape.size() - kSize2];
-  param_->col_ = o_shape[o_shape.size() - kSize1];
+  param_->row_align_ = UP_ROUND(param_->row_, row_tile_);
   param_->deep_ = param_->a_transpose_ ? x_shape[x_shape.size() - kSize2] : x_shape[x_shape.size() - kSize1];
+  param_->deep_align_ = UP_ROUND(param_->deep_, deep_tile_);

   FreeTmpBuffer();

-  ResizeParameter();
-
-  auto ret = InitTmpBuffer();
+  auto ret = InitMatrixABuffer();
   if (ret != RET_OK) {
     FreeQuantParam();
     return ret;
   }
-  if (param_->b_const_ == true) {
-    TransferB();
+
+  if (!param_->b_const_) {
+    ResizeMatrixBParameter();
+    ret = InitMatrixBBuffer();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
   }
   return RET_OK;
 }
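Net control flow after the refactor: matrix-A buffers are rebuilt on every ReSize(), while matrix-B parameters and buffers are built once in Prepare() when weights are const (and packed there via TransferB()), or per ReSize() otherwise.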
@ -39,10 +39,10 @@ class MatmulDynamicBaseInt8CPUKernel : public InnerKernel {
   int ReSize() override;

  private:
-  void ResizeParameter();
+  void ResizeMatrixBParameter();
   int CopyBias();
-  int InitTmpBuffer();
-
+  int InitMatrixABuffer();
+  int InitMatrixBBuffer();
   int MallocQuantParam();

  protected:

@ -82,7 +82,6 @@ int MatMulDynamicSdotInt8Kernel::MatMulDynamicArm64SdotPre(int task_id) {
 }

 int MatMulDynamicSdotInt8Kernel::MatMulDynamicArm64SdotImpl(int task_id) {
-#if defined(ENABLE_ARM64) && !defined(SUPPORT_NNIE) && (!defined(MACHINE_LINUX_ARM64))
   // Multi-thread split by col.
   int stride = thread_stride_ * col_tile_;
   int cur_stride = task_id * stride;
@ -128,12 +127,18 @@ int MatMulDynamicSdotInt8Kernel::MatMulDynamicArm64SdotImpl(int task_id) {
       if (bias != nullptr) {
         bias += col_offset;
       }

+#if defined(ENABLE_ARM64) && !defined(SUPPORT_NNIE) && (!defined(MACHINE_LINUX_ARM64))
       DynamicMatmulSdot4x4x16AIWI(a_ptr, b_ptr, out_ptr, param_->deep_align_, multi_scale.data() + c, bias, row, col,
                                   out_stride, input_sums_ptr, weight_sums_ptr, quant_param_->input_zp_,
                                   quant_param_->filter_zp_[0] * param_->deep_);
+#else
+      DynamicMatmul4x4x16AIWI(a_ptr, b_ptr, out_ptr, param_->deep_align_, multi_scale.data() + c, bias, row, col,
+                              out_stride, input_sums_ptr, weight_sums_ptr, quant_param_->input_zp_,
+                              quant_param_->filter_zp_[0] * param_->deep_);
+#endif
     }
   }
 #endif
   return RET_OK;
 }
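The build guard moves from the top of the function body down to the call site, so non-SDOT builds fall through to the C fallback rather than compiling the work out — which is what the matching signature above enables.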
@ -664,7 +664,9 @@ int Scheduler::InferNodeShape(const lite::Model::Node *node) {

   parameter->quant_type_ = node->quant_type_;
   parameter->thread_num_ = context_->thread_num_;

+  if (context_->float_mode && parameter->quant_type_ == schema::QuantType_QUANT_ALL) {
+    parameter->quant_type_ = schema::QuantType_QUANT_WEIGHT;
+  }
   if (node->output_indices_.empty()) {
     MS_LOG(ERROR) << "The output size is invalid";
     if (parameter->destroy_func_ != nullptr) {
@ -1000,9 +1002,6 @@ int Scheduler::FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std:
     cpu_desc.data_type = kNumberTypeFloat16;
   }
   int ret;
-  if (context_->float_mode && op_parameter->quant_type_ == schema::QuantType_QUANT_ALL) {
-    op_parameter->quant_type_ = schema::QuantType_QUANT_WEIGHT;
-  }
 #ifndef WEIGHT_DECODE_CLIP
   ret = WeightDecoder::DequantNode(op_parameter, in_tensors, kernel_data_type, src_model_->version_);
   if (ret != RET_OK) {
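Moving the QUANT_ALL → QUANT_WEIGHT downgrade from FindCpuKernel() up into InferNodeShape() makes float_mode take effect before shape inference and kernel matching, not just on the CPU path — presumably the float_mode bug named in the title.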
@ -88,6 +88,7 @@ function Convert() {
     done
   fi
   # start running converter
   echo "Convert ${model_name} ${quant_type} ......"
+  echo ${model_name} >> "$4"
   echo './converter_lite --fmk='${model_fmk}' --modelFile='${model_file}' --weightFile='${weight_file}' --outputFile='${output_file}\
   ' --inputDataType='${in_dtype}' --outputDataType='${out_dtype}' --inputShape='${spec_shapes}\