From 9ad4064750c5b3ff09bcb744aaa6f50875d2ec9e Mon Sep 17 00:00:00 2001 From: yeyunpeng2020 Date: Mon, 27 Dec 2021 16:07:23 +0800 Subject: [PATCH] Abstract bias correction --- .../lite/tools/converter/quant_param_holder.h | 6 - .../quantizer/bias_correction_strategy.cc | 488 +++++++++++++ .../quantizer/bias_correction_strategy.h | 83 +++ .../tools/converter/quantizer/bitpacking.h | 6 +- .../quantizer/full_quant_quantizer.cc | 643 +----------------- .../quantizer/full_quant_quantizer.h | 40 +- .../converter/quantizer/quantize_util.cc | 160 +++++ .../tools/converter/quantizer/quantize_util.h | 2 + 8 files changed, 743 insertions(+), 685 deletions(-) create mode 100644 mindspore/lite/tools/converter/quantizer/bias_correction_strategy.cc create mode 100644 mindspore/lite/tools/converter/quantizer/bias_correction_strategy.h diff --git a/mindspore/lite/tools/converter/quant_param_holder.h b/mindspore/lite/tools/converter/quant_param_holder.h index 9b7186de0e1..307a76588f9 100644 --- a/mindspore/lite/tools/converter/quant_param_holder.h +++ b/mindspore/lite/tools/converter/quant_param_holder.h @@ -120,12 +120,6 @@ class QuantParamHolder : public Value { std::vector> get_output_quant_params() const { return this->output_quant_params_; } - // deprecated - void ClearInputOutputQuantParam() { - input_quant_params_.clear(); - output_quant_params_.clear(); - } - bool IsInputQuantParamsInited() { if (this->input_quant_params_.empty()) { return false; diff --git a/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.cc b/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.cc new file mode 100644 index 00000000000..9a66c400787 --- /dev/null +++ b/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.cc @@ -0,0 +1,488 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tools/converter/quantizer/bias_correction_strategy.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" +#include "mindapi/base/type_id.h" +#include "tools/common/tensor_util.h" + +namespace mindspore::lite::quant { +namespace { +constexpr int kHasBiasTensorSize = 3; +const char *kTypeConv2D = schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion); +} // namespace +int BiasCorrectionStrategy::CheckFp32TensorVec(const std::string &node_name, + const std::vector &tensor_vec) { + if (tensor_vec.empty()) { + MS_LOG(ERROR) << "node: " << node_name << " input tensors is 0"; + return RET_ERROR; + } + auto *tensor = tensor_vec[0]; + CHECK_NULL_RETURN(tensor); + if (tensor->data_type() != kNumberTypeFloat32) { + MS_LOG(INFO) << "node: " << node_name << " will not quantize" + << " tensor data_type: " << tensor->data_type(); + return RET_ERROR; + } + return RET_OK; +} + +bool BiasCorrectionStrategy::OpInputDataHandle(OperationType type, const string &op_name, std::vector *data) { + MS_ASSERT(data != nullptr); + std::lock_guard lg(mutex_op_input_); + if (type == STORE) { + if (fp32_op_input_map_.find(op_name) != fp32_op_input_map_.end()) { + // the data has not been fetched by int8 model + return false; + } + fp32_op_input_map_[op_name] = *data; + return true; + } else if (type == FETCH) { + if (fp32_op_input_map_.find(op_name) == fp32_op_input_map_.end()) { + // the data not generated by fp32 model yet + return false; + } + *data = fp32_op_input_map_[op_name]; + fp32_op_input_map_.erase(op_name); + return true; + } else { + MS_LOG(ERROR) << "unexpected type: " << type; + } + return false; +} + +bool BiasCorrectionStrategy::OpOutputChMeanDataHandle(OperationType type, const string &op_name, + std::vector *data) { + MS_ASSERT(data != nullptr); + std::lock_guard lg(mutex_op_output_); + if (type == STORE) { + if (fp32_op_output_ch_mean_map_.find(op_name) != fp32_op_output_ch_mean_map_.end()) { + // the data has not been fetched by int8 model + return false; + } + fp32_op_output_ch_mean_map_[op_name] = *data; + return true; + } else if (type == FETCH) { + if (fp32_op_output_ch_mean_map_.find(op_name) == fp32_op_output_ch_mean_map_.end()) { + // the data not generated by fp32 model yet + return false; + } + *data = fp32_op_output_ch_mean_map_[op_name]; + fp32_op_output_ch_mean_map_.erase(op_name); + return true; + } else { + MS_LOG(ERROR) << "unexpected type: " << type; + } + return false; +} + +KernelCallBack BiasCorrectionStrategy::GetBeforeCallBack(bool int8_op) { + KernelCallBack before_call_back; + if (!int8_op) { + before_call_back = [this](const std::vector &before_inputs, + const std::vector &before_outputs, + const CallBackParam &callParam) -> bool { + if (callParam.node_type == kTypeConv2D) { + if (CheckFp32TensorVec(callParam.node_name, before_inputs) != RET_OK) { + return true; + } + auto tensor = before_inputs[0]; + MS_ASSERT(tensor != nullptr); + size_t elem_count = tensor->ElementsNum(); + MS_CHECK_GT(elem_count, 0, false); + std::vector fp32_op_input(elem_count); + auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size()); + if (ret != EOK) { + MS_LOG(ERROR) << "memcpy error: " << ret; + return false; + } + while (!OpInputDataHandle(STORE, callParam.node_name, &fp32_op_input)) { + std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); + } + } + return true; + }; + } else { 
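+    // int8 branch: fetch the fp32 activation captured above, re-quantize it
+    // with this op's activation quant params, and overwrite the int8 input
+    // tensor so both sessions consume identical calibration data.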
+    before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
+                              const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
+                              const CallBackParam &callParam) -> bool {
+      if (callParam.node_type == kTypeConv2D) {
+        std::vector<float> fp32_op_input;
+        while (!OpInputDataHandle(FETCH, callParam.node_name, &fp32_op_input)) {
+          std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
+        }
+        auto tensor = before_inputs[0];
+        MS_ASSERT(tensor != nullptr);
+        // op can be skipped.
+        if (tensor->data_type() != kNumberTypeInt8) {
+          MS_LOG(INFO) << "tensor type is " << tensor->data_type();
+          return true;
+        }
+        // do quantization: activation is always per-layer quantized
+        std::vector<int8_t> quant_datas;
+        auto quant_params = tensor->quant_params();
+        if (quant_params.size() != 1) {
+          MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size();
+          return false;
+        }
+        schema::QuantParamT quant_param_t;
+        quant_param_t.scale = quant_params[0].scale;
+        quant_param_t.zeroPoint = quant_params[0].zeroPoint;
+        for (auto float_data : fp32_op_input) {
+          auto quant_data = QuantizeData<int8_t>(float_data, &quant_param_t, activation_q_max_, activation_q_min_);
+          quant_datas.push_back(quant_data);
+        }
+
+        if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) {
+          MS_LOG(ERROR) << "unexpected tensor size: " << tensor->Size()
+                        << " not the same as: " << quant_datas.size() * sizeof(int8_t);
+          return false;
+        }
+
+        auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t));
+        if (ret != EOK) {
+          MS_LOG(ERROR) << "memcpy error: " << ret;
+          return false;
+        }
+      }
+      return true;
+    };
+  }
+  return before_call_back;
+}
+
+KernelCallBack BiasCorrectionStrategy::GetAfterCallBack(bool int8_op) {
+  KernelCallBack after_call_back;
+  if (!int8_op) {
+    return GetFloatAfterCallBack();
+  }
+  return GetInt8AfterCallBack();
+}
+
+KernelCallBack BiasCorrectionStrategy::GetInt8AfterCallBack() {
+  KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
+                                          const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
+                                          const CallBackParam &callParam) -> bool {
+    if (callParam.node_type == kTypeConv2D) {
+      std::vector<float> fp32_op_output_ch_mean;
+      while (!OpOutputChMeanDataHandle(FETCH, callParam.node_name, &fp32_op_output_ch_mean)) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
+      }
+      auto tensor = afterOutputs[0];
+      MS_ASSERT(tensor != nullptr);
+      // op can be skipped.
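+      // Everything below dequantizes the int8 output back to float, averages it
+      // per channel, and accumulates (fp32 mean - dequantized int8 mean) into
+      // op_bias_diff_map_ for DoCNodeBiasCorrection to fold into the bias.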
+      if (tensor->data_type() != kNumberTypeInt8) {
+        MS_LOG(INFO) << "tensor type is " << tensor->data_type();
+        return true;
+      }
+      const int8_t *tensor_data = static_cast<const int8_t *>(tensor->data());
+      size_t elem_count = tensor->ElementsNum();
+      MS_CHECK_GT(elem_count, 0, false);
+      auto shapes = tensor->shape();
+      if (shapes.size() != DIMENSION_4D) {
+        MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
+        return false;
+      }
+      // suppose the format is NHWC
+      auto channels = shapes[FOURTH_INPUT];
+      if (channels == 0) {
+        MS_LOG(ERROR) << "unexpected channels: 0";
+        return false;
+      }
+      auto quant_params = tensor->quant_params();
+      if (quant_params.size() != 1) {
+        MS_LOG(ERROR) << "unexpected activation quant_params size: " << quant_params.size();
+        return false;
+      }
+      auto scale = quant_params[0].scale;
+      auto zp = quant_params[0].zeroPoint;
+      std::vector<float> dequant_op_output_ch_mean(channels);
+      auto one_filter_size = elem_count / channels;
+      for (int i = 0; i < channels; i++) {
+        float sum = 0;
+        for (size_t j = 0; j < one_filter_size; j++) {
+          auto index = j * channels + i;
+          if (index >= elem_count) {
+            MS_LOG(ERROR) << "overflow!";
+            return false;
+          }
+          // dequantize the activation
+          auto float_data = scale * (tensor_data[index] - zp);
+          sum += float_data;
+        }
+        if (one_filter_size == 0) {
+          MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
+          return false;
+        }
+        sum = sum / one_filter_size;
+        dequant_op_output_ch_mean[i] = sum;
+      }
+      std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(),
+                     dequant_op_output_ch_mean.begin(), std::minus<>());
+
+      if (op_bias_diff_map_.find(callParam.node_name) != op_bias_diff_map_.end()) {
+        auto &bias_diff = op_bias_diff_map_[callParam.node_name];
+        std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(),
+                       std::plus<>());
+      } else {
+        op_bias_diff_map_[callParam.node_name] = dequant_op_output_ch_mean;
+      }
+    }
+    return true;
+  };
+  return after_call_back;
+}
+
+KernelCallBack BiasCorrectionStrategy::GetFloatAfterCallBack() {
+  KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
+                                          const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
+                                          const CallBackParam &callParam) -> bool {
+    if (callParam.node_type == kTypeConv2D) {
+      if (CheckFp32TensorVec(callParam.node_name, afterOutputs) != RET_OK) {
+        return true;
+      }
+      auto tensor = afterOutputs[0];
+      MS_ASSERT(tensor != nullptr);
+      const auto *tensor_data = static_cast<const float *>(tensor->data());
+      size_t elem_count = tensor->ElementsNum();
+      MS_CHECK_GT(elem_count, 0, false);
+      auto shapes = tensor->shape();
+      if (shapes.size() != DIMENSION_4D) {
+        MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
+        return false;
+      }
+      // suppose the activation format is NHWC
+      auto channels = shapes[FOURTH_INPUT];
+      if (channels == 0) {
+        MS_LOG(ERROR) << "unexpected channels: 0";
+        return false;
+      }
+      std::vector<float> fp32_op_output_ch_mean(channels);
+      auto one_filter_size = elem_count / channels;
+      for (int i = 0; i < channels; i++) {
+        float sum = 0;
+        for (size_t j = 0; j < one_filter_size; j++) {
+          auto index = j * channels + i;
+          if (index >= elem_count) {
+            MS_LOG(ERROR) << "overflow!";
+            return false;
+          }
+          sum += tensor_data[index];
+        }
+        if (one_filter_size == 0) {
+          MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
+          return false;
+        }
+        sum = sum / one_filter_size;
+        fp32_op_output_ch_mean[i] = sum;
+      }
+      while (!OpOutputChMeanDataHandle(STORE, callParam.node_name, &fp32_op_output_ch_mean)) {
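+        // Spin until the int8 thread has fetched the previous entry for this op;
+        // OpOutputChMeanDataHandle(STORE, ...) returns false while it remains.
+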
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); + } + } + return true; + }; + return after_call_back; +} + +int BiasCorrectionStrategy::Int8Inference() { + // int8 inference + std::vector inputs = int8_session_->GetInputs(); + for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) { + for (size_t input_index = 0; input_index < inputs.size(); input_index++) { + int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]); + if (status != RET_OK) { + MS_LOG(ERROR) << "generate input data failed!"; + return RET_ERROR; + } + } + // before func + KernelCallBack before_call_back = GetBeforeCallBack(true); + // after func + KernelCallBack after_call_back = GetAfterCallBack(true); + int8_session_->BindThread(true); + auto status = int8_session_->RunGraph(before_call_back, after_call_back); + int8_session_->BindThread(false); + if (status != RET_OK) { + MS_LOG(ERROR) << "run model failed!"; + return RET_ERROR; + } + } // end for images + return RET_OK; +} + +int BiasCorrectionStrategy::DoBiasCorrection(const FuncGraphPtr &func_graph) { + // init in8 session + MS_LOG(INFO) << "create quant session"; + flags_.commonQuantParam.quant_type = schema::QuantType_QUANT_ALL; + auto int8_sm = CreateSessionByFuncGraph(func_graph, flags_, this->flags_.commonQuantParam.thread_num); + int8_session_ = int8_sm.session; + int8_model_ = int8_sm.model; + if (int8_session_ == nullptr || int8_model_ == nullptr) { + MS_LOG(ERROR) << "create session failed!"; + return RET_ERROR; + } + + std::future int8_inference = std::async(std::launch::async, &BiasCorrectionStrategy::Int8Inference, this); + // get input tensor + std::vector inputs = fp32_session_->GetInputs(); + // fp32 inference + for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) { + for (size_t input_index = 0; input_index < inputs.size(); input_index++) { + int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]); + if (status != RET_OK) { + MS_LOG(ERROR) << "generate input data from images failed!"; + return RET_ERROR; + } + } + // before func + KernelCallBack before_call_back = GetBeforeCallBack(false); + // after func + KernelCallBack after_call_back = GetAfterCallBack(false); + fp32_session_->BindThread(true); + auto status = fp32_session_->RunGraph(before_call_back, after_call_back); + fp32_session_->BindThread(false); + if (status != RET_OK) { + MS_LOG(ERROR) << "run model failed!"; + return RET_ERROR; + } + } // end for images + + int status = int8_inference.get(); + if (status != RET_OK) { + MS_LOG(ERROR) << "int8 inference failed!"; + return RET_ERROR; + } + if (calibrator_->GetBatchNum() == 0) { + MS_LOG(ERROR) << "divisor 'calibrate_size' cannot be 0."; + return RET_ERROR; + } + for (auto &key_value : op_bias_diff_map_) { + std::for_each(key_value.second.begin(), key_value.second.end(), + [this](float &data) { data = data / calibrator_->GetBatchNum(); }); + } + auto cnodes = func_graph->GetOrderedCnodes(); + for (auto &cnode : cnodes) { + auto op_name = cnode->fullname_with_scope(); + if (op_bias_diff_map_.find(op_name) == op_bias_diff_map_.end()) { + continue; + } + status = DoCNodeBiasCorrection(func_graph, cnode); + if (status != RET_OK) { + MS_LOG(ERROR) << "do node bias correct failed."; + break; + } + } + return status; +} + +int BiasCorrectionStrategy::DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { + auto op_name = cnode->fullname_with_scope(); + const auto &bias_diff = 
op_bias_diff_map_[op_name];
+  auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
+  if (primitive == nullptr) {
+    MS_LOG(ERROR) << op_name << " primitive is nullptr";
+    return RET_NULL_PTR;
+  }
+  auto quant_param_holder = GetCNodeQuantHolder(primitive);
+  MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr.");
+  auto input_quant_params = quant_param_holder->get_input_quant_params();
+  if (input_quant_params.size() == kHasBiasTensorSize) {
+    // compensate the existing bias
+    auto bias_quant_params = input_quant_params.at(THIRD_INPUT);
+    auto bias = cnode->input(THIRD_INPUT + 1);
+    auto bias_parameter_ptr = bias->cast<ParameterPtr>();
+    auto bias_default_param = bias_parameter_ptr->default_param();
+    auto bias_param = bias_default_param->cast<tensor::TensorPtr>();
+    int *bias_datas = static_cast<int *>(bias_param->data_c());
+
+    if (static_cast<size_t>(bias_param->DataSize()) != bias_diff.size()) {
+      MS_LOG(DEBUG) << op_name << " unexpected bias data count: " << bias_param->DataSize()
+                    << " not the same as bias_diff: " << bias_diff.size();
+      return RET_ERROR;
+    }
+    if (bias_quant_params.size() != bias_diff.size()) {
+      MS_LOG(ERROR) << op_name << " unexpected bias quant params size: " << bias_quant_params.size()
+                    << " not the same as bias_diff: " << bias_diff.size();
+      return RET_ERROR;
+    }
+    for (size_t i = 0; i < bias_param->DataSize(); i++) {
+      auto scale = bias_quant_params[i].scale;
+      if (fabs(scale) <= 0.0f) {
+        MS_LOG(ERROR) << op_name << " divisor 'scale' cannot be 0.";
+        return RET_ERROR;
+      }
+      double after_correct = std::round(bias_diff[i] / scale) + bias_datas[i];
+      constexpr int32_t corrected_bias_abs_limit = 0.6 * INT32_MAX;
+      if (after_correct > corrected_bias_abs_limit) {
+        MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too large: " << after_correct
+                        << " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i]
+                        << " scale: " << scale;
+        bias_datas[i] = static_cast<int>(corrected_bias_abs_limit);
+      } else if (after_correct < -corrected_bias_abs_limit) {
+        MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too small: " << after_correct
+                        << " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i]
+                        << " scale: " << scale;
+        bias_datas[i] = static_cast<int>(-corrected_bias_abs_limit);
+      } else {
+        auto diff = static_cast<int>(std::round(bias_diff[i] / scale));
+        bias_datas[i] += diff;
+      }
+    }
+  } else if (input_quant_params.size() == kHasBiasTensorSize - 1) {
+    MS_LOG(INFO) << op_name << " add bias input";
+    // need to add a bias input
+    auto parameter = func_graph->add_parameter();
+    if (parameter == nullptr) {
+      MS_LOG(ERROR) << "parameter is nullptr.";
+      return RET_NULL_PTR;
+    }
+    std::vector<int64_t> shape;
+    shape.push_back(bias_diff.size());
+
+    auto tensor_info = CreateTensorInfo(bias_diff.data(), sizeof(float) * bias_diff.size(), shape, kNumberTypeFloat32);
+    if (tensor_info == nullptr) {
+      MS_LOG(ERROR) << op_name << " create tensor info failed.";
+      return RET_ERROR;
+    }
+    auto status = InitParameterFromTensorInfo(parameter, tensor_info);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << op_name << " init parameter from tensor info failed";
+      return RET_ERROR;
+    }
+    parameter->set_name("added_" + op_name + "_bias");
+    cnode->add_input(parameter);
+    status = DoParameterBiasQuant(parameter, primitive);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << op_name << " Do bias quant failed.";
+      return RET_ERROR;
+    }
+  } else {
+    MS_LOG(WARNING) << op_name << " unexpected size: " << input_quant_params.size()
+                    << ", and shared weight tensor does not support bias 
correction temporarily."; + } + return RET_OK; +} +} // namespace mindspore::lite::quant diff --git a/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.h b/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.h new file mode 100644 index 00000000000..53e08ce8d80 --- /dev/null +++ b/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.h @@ -0,0 +1,83 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BIASCORRECTION_H +#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BIASCORRECTION_H +#include +#include +#include +#include +#include "base/base.h" +#include "ir/anf.h" +#include "tools/converter/quantizer/calibrator.h" + +namespace mindspore::lite::quant { +enum OperationType { + STORE, + FETCH, +}; + +class BiasCorrectionStrategy { + public: + BiasCorrectionStrategy(const converter::Flags &flags, const std::shared_ptr &calibrator, + session::LiteSession *fp32_session, Model *fp32_model, int activation_q_min, + int activation_q_max) + : flags_(flags), + calibrator_(calibrator), + fp32_session_(fp32_session), + fp32_model_(fp32_model), + activation_q_min_(activation_q_min), + activation_q_max_(activation_q_max) {} + ~BiasCorrectionStrategy() { + if (int8_session_ != nullptr) { + delete int8_session_; + } + if (int8_model_ != nullptr) { + delete int8_model_; + } + } + int DoBiasCorrection(const FuncGraphPtr &func_graph); + + private: + int DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode); + int Int8Inference(); + bool OpInputDataHandle(OperationType type, const string &op_name, std::vector *data); + bool OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector *data); + KernelCallBack GetBeforeCallBack(bool int8_op); + KernelCallBack GetAfterCallBack(bool int8_op); + KernelCallBack GetInt8AfterCallBack(); + KernelCallBack GetFloatAfterCallBack(); + int CheckFp32TensorVec(const std::string &node_name, const std::vector &tensor_vec); + + private: + converter::Flags flags_; + std::shared_ptr calibrator_{nullptr}; + session::LiteSession *fp32_session_{nullptr}; + Model *fp32_model_{nullptr}; + int activation_q_min_{INT8_MIN}; + int activation_q_max_{INT8_MAX}; + + session::LiteSession *int8_session_{nullptr}; + Model *int8_model_{nullptr}; + + std::map> fp32_op_input_map_; // concurrency + std::map> fp32_op_output_ch_mean_map_; // concurrency + std::map> op_bias_diff_map_; // only use by int8 model + std::mutex mutex_op_input_; + std::mutex mutex_op_output_; +}; +} // namespace mindspore::lite::quant +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BIASCORRECTION_H diff --git a/mindspore/lite/tools/converter/quantizer/bitpacking.h b/mindspore/lite/tools/converter/quantizer/bitpacking.h index be465e67a0f..9ff274dee29 100644 --- a/mindspore/lite/tools/converter/quantizer/bitpacking.h +++ b/mindspore/lite/tools/converter/quantizer/bitpacking.h @@ -14,9 +14,9 @@ * limitations under the 
License. */ -#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER__GENERAL_BITPACKING_H -#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER__GENERAL_BITPACKING_H -#include +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H +#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H +#include #include #include #include diff --git a/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.cc b/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.cc index 1de107a0ccb..0131df6162b 100644 --- a/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.cc +++ b/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.cc @@ -16,18 +16,11 @@ #include "tools/converter/quantizer/full_quant_quantizer.h" #include -#include #include #include #include -#include -#include -#include #include -#include #include -#include -#include "ops/fusion/full_connection.h" #include "ops/tuple_get_item.h" #include "src/tensor.h" #include "tools/converter/quantizer/quant_cast.h" @@ -41,6 +34,7 @@ #include "tools/common/node_util.h" #include "nnacl/op_base.h" #include "src/common/log_util.h" +#include "tools/converter/quantizer/bias_correction_strategy.h" using std::string; using std::vector; @@ -50,88 +44,10 @@ namespace { static const std::set has_bias_operator = {prim::kPrimConv2DFusion, prim::kPrimConv2dTransposeFusion, prim::kPrimMatMulFusion, prim::kPrimFullConnection, prim::kPrimLayerNormFusion}; -constexpr int kHasBiasTensorSize = 3; -constexpr int KBiasBitNum = 32; -const char *kTypeConv2D = schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion); } // namespace -namespace { -int ComputeBiasDataAndQuantParam(const std::vector &bias_scales, const std::vector &input_scales, - const float *raw_datas, const QuantParamHolderPtr &quant_param_holder, - std::vector *quant_params, std::vector *quant_datas) { - MS_ASSERT(raw_datas != nullptr && quant_param_holder != nullptr); - MS_ASSERT(quant_params != nullptr && quant_datas != nullptr); - double bias_scale_tmp; - const constexpr double quanted_bias_abs_limit = 0.5 * INT32_MAX; - MS_CHECK_TRUE_MSG(quant_param_holder->get_input_quant_params().size() > 1, RET_ERROR, "invalid access."); - auto weight_quant_params = quant_param_holder->get_input_quant_params().at(1); - auto shape_size = quant_datas->size(); - if (bias_scales.size() == shape_size) { - for (size_t i = 0; i < shape_size; i++) { - bias_scale_tmp = bias_scales[i]; - if (fabs(bias_scale_tmp) <= 0.0f) { - MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0."; - return RET_ERROR; - } - if (std::abs(raw_datas[i] / bias_scale_tmp) >= quanted_bias_abs_limit) { - MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[i].scale - << " is too small, need to update"; - // update filter scale and zp - double activate_scale = input_scales[0]; - double filter_scale = std::abs(raw_datas[i]) / (activate_scale * quanted_bias_abs_limit); - weight_quant_params[i].scale = filter_scale; - weight_quant_params[i].zeroPoint = 0; - quant_param_holder->set_input_quant_param(1, weight_quant_params); - bias_scale_tmp = std::abs(raw_datas[i]) / quanted_bias_abs_limit; - quant_params->at(i).scale = bias_scale_tmp; - MS_LOG(DEBUG) << "new filter scale: " << filter_scale; - } - auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp); - quant_datas->at(i) = quant_data; - } - return RET_OK; - } else if (bias_scales.size() == 1) { - // for fc, per tensor quant - bias_scale_tmp = quant_params->front().scale; - float max_raw_data = 0.0f; - for (size_t i 
= 0; i < shape_size; i++) { - if (std::abs(raw_datas[i]) > max_raw_data) { - max_raw_data = std::abs(raw_datas[i]); - } - } - if (fabs(bias_scale_tmp) <= 0.0f) { - MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0."; - return RET_ERROR; - } - if (std::abs(max_raw_data / bias_scale_tmp) >= quanted_bias_abs_limit) { - MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[0].scale - << " is too small, need to update"; - double activate_scale = input_scales[0]; - MS_CHECK_TRUE_MSG(activate_scale != 0, RET_ERROR, "activate_scale == 0"); - double filter_scale = std::abs(max_raw_data) / (activate_scale * quanted_bias_abs_limit); - weight_quant_params[0].scale = filter_scale; - weight_quant_params[0].zeroPoint = 0; - quant_param_holder->set_input_quant_param(1, weight_quant_params); - bias_scale_tmp = max_raw_data / quanted_bias_abs_limit; - quant_params->front().scale = bias_scale_tmp; - MS_LOG(DEBUG) << "new filter scale: " << filter_scale; - } - for (size_t i = 0; i < shape_size; i++) { - auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp); - quant_datas->at(i) = quant_data; - } - return RET_OK; - } - MS_LOG(ERROR) << "unexpected input_scales size: " << input_scales.size() - << " weight_scales size: " << weight_quant_params.size(); - return RET_ERROR; -} -} // namespace - FullQuantQuantizer::~FullQuantQuantizer() { delete fp32_session_; delete fp32_model_; - delete int8_session_; - delete int8_model_; } int FullQuantQuantizer::SetInOutQuantParam(const AnfNodePtr &input_node, const std::unique_ptr &info, @@ -206,94 +122,6 @@ int FullQuantQuantizer::DoValueNodeWeightQuant(const ValueNodePtr &weight, const return RET_OK; } -int FullQuantQuantizer::DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive) { - CHECK_NULL_RETURN(bias); - CHECK_NULL_RETURN(primitive); - auto bias_default_param = bias->default_param(); - auto bias_param = bias_default_param->cast(); - MS_ASSERT(bias_parameter != nullptr); - auto quant_param_holder = GetCNodeQuantHolder(primitive); - MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr."); - auto active_weight_quant_params = quant_param_holder->get_input_quant_params(); - - auto active_params = active_weight_quant_params.at(FIRST_INPUT); - auto weight_params = active_weight_quant_params.at(SECOND_INPUT); - - vector input_scales; - vector filter_scales; - vector bias_scales; - size_t sizeX = active_params.size(); - for (size_t i = 0; i < sizeX; i++) { - input_scales.emplace_back(active_params[i].scale); - } - size_t sizeY = weight_params.size(); - if (sizeX != sizeY) { - if (sizeX > 1 && sizeY > 1) { - MS_LOG(ERROR) << "input and filter's scale count cannot match!"; - return RET_ERROR; - } - } - for (size_t i = 0; i < sizeY; i++) { - filter_scales.emplace_back(weight_params[i].scale); - } - size_t size = std::max(sizeX, sizeY); - for (size_t i = 0; i < size; i++) { - auto scaleX = sizeX > 1 ? input_scales[i] : input_scales[0]; - auto scaleY = sizeY > 1 ? 
filter_scales[i] : filter_scales[0]; - bias_scales.push_back(scaleX * scaleY); - } - MS_ASSERT(!bias_scales.empty()); - size_t shape_size = bias_param->DataSize(); - - // set bias quant param - std::vector quant_params; - for (double bias_scale : bias_scales) { - schema::QuantParamT quant_param; - if (bias_scale == 0) { - MS_LOG(WARNING) << "bias_scale == 0"; - quant_param.scale = 1; - } else { - quant_param.scale = bias_scale; - } - quant_param.numBits = KBiasBitNum; - quant_param.zeroPoint = 0; - quant_param.inited = true; - quant_params.emplace_back(quant_param); - } - // quant bias data - std::vector quant_datas(shape_size); - - auto *raw_datas = static_cast(bias_param->data_c()); - if (ComputeBiasDataAndQuantParam(bias_scales, input_scales, raw_datas, quant_param_holder, &quant_params, - &quant_datas) != RET_OK) { - MS_LOG(ERROR) << "compute bias data failed."; - return RET_ERROR; - } - quant_param_holder->set_input_quant_param(THIRD_INPUT, quant_params); - auto ret = SetTensorData(bias_param, quant_datas.data(), shape_size * sizeof(int32_t)); - if (ret != RET_OK) { - MS_LOG(ERROR) << "set tensor data failed."; - return RET_ERROR; - } - // set dtype - auto abstractBase = bias->abstract(); - if (abstractBase == nullptr) { - MS_LOG(ERROR) << "Abstract of parameter is nullptr, " << bias->name(); - return RET_ERROR; - } - if (!utils::isa(abstractBase)) { - MS_LOG(ERROR) << "Abstract of parameter should be anstract tensor, " << bias->name(); - return RET_ERROR; - } - auto abstractTensor = utils::cast(abstractBase); - if (abstractTensor == nullptr || abstractTensor->element() == nullptr) { - MS_LOG(ERROR) << "abstractTensor is nullptr" << bias->name(); - return RET_NULL_PTR; - } - abstractTensor->element()->set_type(TypeIdToType(kNumberTypeInt32)); - return RET_OK; -} - int FullQuantQuantizer::IsSupportWeightQuant(const CNodePtr &cnode, const AnfNodePtr &input_node, size_t input_index) { auto primitive = GetValueNode(cnode->input(0)); if (primitive == nullptr) { @@ -628,17 +456,6 @@ int FullQuantQuantizer::MarkQuantNode(const FuncGraphPtr &func_graph) { MS_LOG(ERROR) << cnode->fullname_with_scope() << " add quantized op failed."; return ret; } - auto primitive = GetValueNode(cnode->input(0)); - if (primitive == nullptr) { - MS_LOG(ERROR) << cnode->fullname_with_scope() << " primitive is null"; - return RET_ERROR; - } - auto quant_param_holder = GetCNodeQuantHolder(primitive); - if (quant_param_holder == nullptr) { - MS_LOG(ERROR) << cnode->fullname_with_scope() << " quant_param_holder is null"; - return RET_ERROR; - } - quant_param_holder->ClearInputOutputQuantParam(); } } return RET_OK; @@ -658,7 +475,7 @@ int FullQuantQuantizer::PreProcess(const FuncGraphPtr &func_graph) { break; } InitQMinMax(); - calibrator_ = std::make_unique(this->bit_num_, activation_q_max_, activation_q_min_, + calibrator_ = std::make_shared(this->bit_num_, activation_q_max_, activation_q_min_, this->flags_.fullQuantParam.activation_quant_method, this->flags_.dataPreProcessParam, activation_symmetry_); MSLITE_CHECK_PTR(calibrator_); @@ -670,22 +487,6 @@ int FullQuantQuantizer::PreProcess(const FuncGraphPtr &func_graph) { return RET_OK; } -int FullQuantQuantizer::CheckFp32TensorVec(const std::string &node_name, - const std::vector &tensor_vec) { - if (tensor_vec.empty()) { - MS_LOG(ERROR) << "node: " << node_name << " input tensors is 0"; - return RET_ERROR; - } - auto *tensor = tensor_vec[0]; - CHECK_NULL_RETURN(tensor); - if (tensor->data_type() != kNumberTypeFloat32) { - MS_LOG(INFO) << "node: " << node_name << " 
will not quantize" - << " tensor data_type: " << tensor->data_type(); - return RET_ERROR; - } - return RET_OK; -} - int FullQuantQuantizer::DoInference(CollectType collect_type) { // get input tensor vector inputs = fp32_session_->GetInputs(); @@ -736,172 +537,6 @@ int FullQuantQuantizer::DoInference(CollectType collect_type) { return RET_OK; } -int FullQuantQuantizer::Int8Inference() { - // int8 inference - vector inputs = int8_session_->GetInputs(); - for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) { - for (size_t input_index = 0; input_index < inputs.size(); input_index++) { - int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]); - if (status != RET_OK) { - MS_LOG(ERROR) << "generate input data failed!"; - return RET_ERROR; - } - } - // before func - KernelCallBack before_call_back = GetBeforeCallBack(true); - // after func - KernelCallBack after_call_back = GetAfterCallBack(true); - int8_session_->BindThread(true); - auto status = int8_session_->RunGraph(before_call_back, after_call_back); - int8_session_->BindThread(false); - if (status != RET_OK) { - MS_LOG(ERROR) << "run model failed!"; - return RET_ERROR; - } - } // end for images - return RET_OK; -} - -int FullQuantQuantizer::BiasCorrection(const FuncGraphPtr &func_graph) { - std::future int8_inference = std::async(std::launch::async, &FullQuantQuantizer::Int8Inference, this); - // get input tensor - vector inputs = fp32_session_->GetInputs(); - // fp32 inference - for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) { - for (size_t input_index = 0; input_index < inputs.size(); input_index++) { - int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]); - if (status != RET_OK) { - MS_LOG(ERROR) << "generate input data from images failed!"; - return RET_ERROR; - } - } - // before func - KernelCallBack before_call_back = GetBeforeCallBack(false); - // after func - KernelCallBack after_call_back = GetAfterCallBack(false); - fp32_session_->BindThread(true); - auto status = fp32_session_->RunGraph(before_call_back, after_call_back); - fp32_session_->BindThread(false); - if (status != RET_OK) { - MS_LOG(ERROR) << "run model failed!"; - return RET_ERROR; - } - } // end for images - - int status = int8_inference.get(); - if (status != RET_OK) { - MS_LOG(ERROR) << "int8 inference failed!"; - return RET_ERROR; - } - if (calibrator_->GetBatchNum() == 0) { - MS_LOG(ERROR) << "divisor 'calibrate_size' cannot be 0."; - return RET_ERROR; - } - for (auto &key_value : op_bias_diff_map_) { - std::for_each(key_value.second.begin(), key_value.second.end(), - [this](float &data) { data = data / calibrator_->GetBatchNum(); }); - } - auto cnodes = func_graph->GetOrderedCnodes(); - for (auto &cnode : cnodes) { - auto op_name = cnode->fullname_with_scope(); - if (op_bias_diff_map_.find(op_name) == op_bias_diff_map_.end()) { - continue; - } - status = BiasCorrection(func_graph, cnode); - if (status != RET_OK) { - MS_LOG(ERROR) << "do node bias correct failed."; - break; - } - } - return status; -} - -int FullQuantQuantizer::BiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { - auto op_name = cnode->fullname_with_scope(); - const auto &bias_diff = op_bias_diff_map_[op_name]; - auto primitive = GetValueNode(cnode->input(0)); - if (primitive == nullptr) { - MS_LOG(ERROR) << op_name << " primitive is nullptr"; - return RET_NULL_PTR; - } - auto quant_param_holder = GetCNodeQuantHolder(primitive); - MS_CHECK_TRUE_MSG(quant_param_holder 
!= nullptr, RET_NULL_PTR, "quant_param_holder is nullptr."); - auto input_quant_params = quant_param_holder->get_input_quant_params(); - if (input_quant_params.size() == kHasBiasTensorSize) { - // compensate the existed - auto bias_quant_params = input_quant_params.at(THIRD_INPUT); - auto bias = cnode->input(THIRD_INPUT + 1); - auto bias_parameter_ptr = bias->cast(); - auto bias_default_param = bias_parameter_ptr->default_param(); - auto bias_param = bias_default_param->cast(); - int *bias_datas = static_cast(bias_param->data_c()); - - if (static_cast(bias_param->DataSize()) != bias_diff.size()) { - MS_LOG(DEBUG) << op_name << " unexpected bias data count: " << bias_param->DataSize() - << " not the same as bias_diff: " << bias_diff.size(); - return RET_ERROR; - } - if (bias_quant_params.size() != bias_diff.size()) { - MS_LOG(ERROR) << op_name << " unexpected bias quant params size: " << bias_quant_params.size() - << " not the same as bias_diff: " << bias_diff.size(); - return RET_ERROR; - } - for (size_t i = 0; i < bias_param->DataSize(); i++) { - auto scale = bias_quant_params[i].scale; - if (fabs(scale) <= 0.0f) { - MS_LOG(ERROR) << op_name << " divisor 'scale' cannot be 0."; - return RET_ERROR; - } - double after_correct = std::round(bias_diff[i] / scale) + bias_datas[i]; - const constexpr int32_t corrected_bias_abs_limit = 0.6 * INT32_MAX; - if (after_correct > corrected_bias_abs_limit) { - MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too large: " << after_correct - << " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i] << " scale: " << scale; - bias_datas[i] = static_cast(corrected_bias_abs_limit); - } else if (after_correct < -corrected_bias_abs_limit) { - MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too small: " << after_correct - << " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i] << " scale: " << scale; - bias_datas[i] = static_cast(-corrected_bias_abs_limit); - } else { - auto diff = static_cast(std::round(bias_diff[i] / scale)); - bias_datas[i] += diff; - } - } - } else if (input_quant_params.size() == kHasBiasTensorSize - 1) { - MS_LOG(INFO) << op_name << " add bias input"; - // need to add bias input - auto parameter = func_graph->add_parameter(); - if (parameter == nullptr) { - MS_LOG(ERROR) << "parameter is nullptr."; - return RET_NULL_PTR; - } - ShapeVector shape; - shape.push_back(bias_diff.size()); - - auto tensor_info = CreateTensorInfo(bias_diff.data(), sizeof(float) * bias_diff.size(), shape, kNumberTypeFloat32); - if (tensor_info == nullptr) { - MS_LOG(ERROR) << op_name << " create tensor info failed."; - return RET_ERROR; - } - auto status = InitParameterFromTensorInfo(parameter, tensor_info); - if (status != RET_OK) { - MS_LOG(ERROR) << op_name << " init parameter from tensor info failed"; - return RET_ERROR; - } - parameter->set_name("added_" + op_name + "_bias"); - cnode->add_input(parameter); - status = DoParameterBiasQuant(parameter, primitive); - if (status != RET_OK) { - MS_LOG(ERROR) << op_name << " Do bias quant failed."; - return RET_ERROR; - } - } else { - MS_LOG(WARNING) << op_name << " unexpected size: " << input_quant_params.size() - << ", and shared weight tensor does not support bias correction temporarily."; - } - return RET_OK; -} - int FullQuantQuantizer::DoQuantize(FuncGraphPtr func_graph) { MS_LOG(INFO) << "start to parse config file"; if (flags_.dataPreProcessParam.calibrate_path.empty()) { @@ -968,283 +603,17 @@ int FullQuantQuantizer::DoQuantize(FuncGraphPtr 
func_graph) { ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status); return RET_ERROR; } - SessionModel int8_sm; if (this->flags_.fullQuantParam.bias_correction) { - // init in8 session - MS_LOG(INFO) << "create quant session"; - flags_.commonQuantParam.quant_type = schema::QuantType_QUANT_ALL; - int8_sm = CreateSessionByFuncGraph(func_graph, flags_, this->flags_.commonQuantParam.thread_num); - int8_session_ = int8_sm.session; - int8_model_ = int8_sm.model; - if (int8_session_ == nullptr || int8_model_ == nullptr) { - MS_LOG(ERROR) << "create session failed!"; - return RET_ERROR; - } MS_LOG(INFO) << "do bias correction"; - status = BiasCorrection(func_graph); + BiasCorrectionStrategy strategy(flags_, calibrator_, fp32_session_, fp32_model_, activation_q_min_, + activation_q_max_); + status = strategy.DoBiasCorrection(func_graph); if (status != RET_OK) { - MS_LOG(ERROR) << "BiasCorrection failed."; + MS_LOG(ERROR) << "bias_correction failed."; return status; } } } return RET_OK; } - -bool FullQuantQuantizer::OpInputDataHandle(OperationType type, const string &op_name, std::vector *data) { - MS_ASSERT(data != nullptr); - std::lock_guard lg(mutex_op_input_); - if (type == STORE) { - if (fp32_op_input_map_.find(op_name) != fp32_op_input_map_.end()) { - // the data has not been fetched by int8 model - return false; - } - fp32_op_input_map_[op_name] = *data; - return true; - } else if (type == FETCH) { - if (fp32_op_input_map_.find(op_name) == fp32_op_input_map_.end()) { - // the data not generated by fp32 model yet - return false; - } - *data = fp32_op_input_map_[op_name]; - fp32_op_input_map_.erase(op_name); - return true; - } else { - MS_LOG(ERROR) << "unexpected type: " << type; - } - return false; -} - -bool FullQuantQuantizer::OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector *data) { - MS_ASSERT(data != nullptr); - std::lock_guard lg(mutex_op_output_); - if (type == STORE) { - if (fp32_op_output_ch_mean_map_.find(op_name) != fp32_op_output_ch_mean_map_.end()) { - // the data has not been fetched by int8 model - return false; - } - fp32_op_output_ch_mean_map_[op_name] = *data; - return true; - } else if (type == FETCH) { - if (fp32_op_output_ch_mean_map_.find(op_name) == fp32_op_output_ch_mean_map_.end()) { - // the data not generated by fp32 model yet - return false; - } - *data = fp32_op_output_ch_mean_map_[op_name]; - fp32_op_output_ch_mean_map_.erase(op_name); - return true; - } else { - MS_LOG(ERROR) << "unexpected type: " << type; - } - return false; -} - -KernelCallBack FullQuantQuantizer::GetBeforeCallBack(bool int8_op) { - KernelCallBack before_call_back; - if (!int8_op) { - before_call_back = [this](const std::vector &before_inputs, - const std::vector &before_outputs, - const CallBackParam &callParam) -> bool { - if (callParam.node_type == kTypeConv2D) { - if (FullQuantQuantizer::CheckFp32TensorVec(callParam.node_name, before_inputs) != RET_OK) { - return true; - } - auto tensor = before_inputs[0]; - MS_ASSERT(tensor != nullptr); - size_t elem_count = tensor->ElementsNum(); - MS_CHECK_GT(elem_count, 0, false); - std::vector fp32_op_input(elem_count); - auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size()); - if (ret != EOK) { - MS_LOG(ERROR) << "memcpy error: " << ret; - return false; - } - while (!OpInputDataHandle(STORE, callParam.node_name, &fp32_op_input)) { - std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); - } - } - return true; - }; - } else { - 
before_call_back = [this](const std::vector &before_inputs, - const std::vector &before_outputs, - const CallBackParam &callParam) -> bool { - if (callParam.node_type == kTypeConv2D) { - vector fp32_op_input; - while (!OpInputDataHandle(FETCH, callParam.node_name, &fp32_op_input)) { - std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); - } - auto tensor = before_inputs[0]; - MS_ASSERT(tensor != nullptr); - // op can be skipped. - if (tensor->data_type() != kNumberTypeInt8) { - MS_LOG(INFO) << "tensor type is " << tensor->data_type(); - return true; - } - // do quantization: activation is always per layer quantized - std::vector quant_datas; - auto quant_params = tensor->quant_params(); - if (quant_params.size() != 1) { - MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size(); - return false; - } - schema::QuantParamT quant_param_t; - quant_param_t.scale = quant_params[0].scale; - quant_param_t.zeroPoint = quant_params[0].zeroPoint; - for (auto float_data : fp32_op_input) { - auto quant_data = QuantizeData(float_data, &quant_param_t, activation_q_max_, activation_q_min_); - quant_datas.push_back(quant_data); - } - - if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) { - MS_LOG(ERROR) << "unexpected tensor size: " << quant_datas.size() - << " not the same with: " << quant_datas.size() * sizeof(int8_t); - return false; - } - - auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t)); - if (ret != EOK) { - MS_LOG(ERROR) << "memcpy error: " << ret; - return false; - } - } - return true; - }; - } - return before_call_back; -} - -KernelCallBack FullQuantQuantizer::GetAfterCallBack(bool int8_op) { - KernelCallBack after_call_back; - if (!int8_op) { - return GetFloatAfterCallBack(); - } - return GetInt8AfterCallBack(); -} - -KernelCallBack FullQuantQuantizer::GetInt8AfterCallBack() { - KernelCallBack after_call_back = [this](const std::vector &afterInputs, - const std::vector &afterOutputs, - const CallBackParam &callParam) -> bool { - if (callParam.node_type == kTypeConv2D) { - vector fp32_op_output_ch_mean; - while (!OpOutputChMeanDataHandle(FETCH, callParam.node_name, &fp32_op_output_ch_mean)) { - std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); - } - auto tensor = afterOutputs[0]; - MS_ASSERT(tensor != nullptr); - // op can be skipped. 
- if (tensor->data_type() != kNumberTypeInt8) { - MS_LOG(INFO) << "tensor type is " << tensor->data_type(); - return true; - } - const int8_t *tensor_data = static_cast(tensor->data()); - size_t elem_count = tensor->ElementsNum(); - MS_CHECK_GT(elem_count, 0, false); - auto shapes = tensor->shape(); - if (shapes.size() != DIMENSION_4D) { - MS_LOG(ERROR) << "unexpected shape size: " << shapes.size(); - return false; - } - // suppose the the format is NHWC - auto channels = shapes[FOURTH_INPUT]; - if (channels == 0) { - MS_LOG(ERROR) << "unexpected channels: 0"; - return false; - } - auto quant_params = tensor->quant_params(); - if (quant_params.size() != 1) { - MS_LOG(ERROR) << "unexpected activatation quant_params size: " << quant_params.size(); - return false; - } - auto scale = quant_params[0].scale; - auto zp = quant_params[0].zeroPoint; - std::vector dequant_op_output_ch_mean(channels); - auto one_filter_size = elem_count / channels; - for (int i = 0; i < channels; i++) { - float sum = 0; - for (size_t j = 0; j < one_filter_size; j++) { - auto index = j * channels + i; - if (index >= elem_count) { - MS_LOG(ERROR) << "over flow!"; - return false; - } - // deuqant activation - auto float_data = scale * (tensor_data[index] - zp); - sum += float_data; - } - if (one_filter_size == 0) { - MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0."; - return false; - } - sum = sum / one_filter_size; - dequant_op_output_ch_mean[i] = sum; - } - std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(), - dequant_op_output_ch_mean.begin(), std::minus<>()); - - if (op_bias_diff_map_.find(callParam.node_name) != op_bias_diff_map_.end()) { - auto &bias_diff = op_bias_diff_map_[callParam.node_name]; - std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(), - std::plus<>()); - } else { - op_bias_diff_map_[callParam.node_name] = dequant_op_output_ch_mean; - } - } - return true; - }; - return after_call_back; -} - -KernelCallBack FullQuantQuantizer::GetFloatAfterCallBack() { - KernelCallBack after_call_back = [this](const std::vector &afterInputs, - const std::vector &afterOutputs, - const CallBackParam &callParam) -> bool { - if (callParam.node_type == kTypeConv2D) { - if (FullQuantQuantizer::CheckFp32TensorVec(callParam.node_name, afterOutputs) != RET_OK) { - return true; - } - auto tensor = afterOutputs[0]; - MS_ASSERT(tensor != nullptr); - const auto *tensor_data = static_cast(tensor->data()); - size_t elem_count = tensor->ElementsNum(); - MS_CHECK_GT(elem_count, 0, false); - auto shapes = tensor->shape(); - if (shapes.size() != DIMENSION_4D) { - MS_LOG(ERROR) << "unexpected shape size: " << shapes.size(); - return false; - } - // suppose the activation format: NHWC - auto channels = shapes[FOURTH_INPUT]; - if (channels == 0) { - MS_LOG(ERROR) << "unexpected channels: 0"; - return false; - } - std::vector fp32_op_output_ch_mean(channels); - auto one_filter_size = elem_count / channels; - for (int i = 0; i < channels; i++) { - float sum = 0; - for (size_t j = 0; j < one_filter_size; j++) { - auto index = j * channels + i; - if (index >= elem_count) { - MS_LOG(ERROR) << "over flow!"; - return false; - } - sum += tensor_data[index]; - } - if (one_filter_size == 0) { - MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0."; - return false; - } - sum = sum / one_filter_size; - fp32_op_output_ch_mean[i] = sum; - } - while (!OpOutputChMeanDataHandle(STORE, callParam.node_name, &fp32_op_output_ch_mean)) { 
- std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); - } - } - return true; - }; - return after_call_back; -} } // namespace mindspore::lite::quant diff --git a/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.h b/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.h index 156f04f86ed..e46f4b2ce09 100644 --- a/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.h +++ b/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.h @@ -39,11 +39,6 @@ #include "src/common/quant_utils.h" namespace mindspore::lite::quant { -enum OperationType { - STORE, - FETCH, -}; - class FullQuantQuantizer : public Quantizer { public: explicit FullQuantQuantizer(const converter::Flags &flags) : Quantizer(flags) { @@ -55,45 +50,20 @@ class FullQuantQuantizer : public Quantizer { int DoQuantize(FuncGraphPtr func_graph) override; private: - bool OpInputDataHandle(OperationType type, const string &op_name, std::vector *data); - - bool OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector *data); - int PreProcess(const FuncGraphPtr &func_graph); - - int CheckFp32TensorVec(const std::string &node_name, const std::vector &tensor_vec); - int DoInference(CollectType collect_type); - int UpdateDivergeInterval(); - int QuantNodeSimpleOp(const CNodePtr &cnode); - int QuantNode(const FuncGraphPtr &func_graph); - int SetInOutQuantParam(const AnfNodePtr &input_node, const std::unique_ptr &info, const PrimitivePtr &primitive, bool is_input, size_t index) const; - int DoParameterWeightQuant(const ParameterPtr &weight, const PrimitivePtr &primitive, bool per_channel, int input_index) const; - int DoValueNodeWeightQuant(const ValueNodePtr &weight, const PrimitivePtr &primitive, bool per_channel, int input_index) const; - int DoParameterNodeQuant(const CNodePtr &cnode, const ParameterPtr &input_node, size_t input_index); - int DoValueNodeQuant(const CNodePtr &cnode, const ValueNodePtr &input_node, size_t input_index); - int IsSupportWeightQuant(const CNodePtr &cnode, const AnfNodePtr &input_node, size_t input_index); - - int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive); - int Int8Inference(); - int BiasCorrection(const FuncGraphPtr &func_graph); - int BiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode); - KernelCallBack GetBeforeCallBack(bool int8_op); - KernelCallBack GetAfterCallBack(bool int8_op); - KernelCallBack GetInt8AfterCallBack(); - KernelCallBack GetFloatAfterCallBack(); void InitQMinMax(); void InitCpuConfig(); void InitKirinConfig(); @@ -117,17 +87,9 @@ class FullQuantQuantizer : public Quantizer { std::set per_channel_ops_; std::set support_activation_; - std::unique_ptr calibrator_{nullptr}; + std::shared_ptr calibrator_{nullptr}; session::LiteSession *fp32_session_{nullptr}; Model *fp32_model_{nullptr}; - session::LiteSession *int8_session_{nullptr}; - Model *int8_model_{nullptr}; - - std::map> fp32_op_input_map_; // concurrency - std::map> fp32_op_output_ch_mean_map_; // concurrency - std::map> op_bias_diff_map_; // only use by int8 model - std::mutex mutex_op_input_; - std::mutex mutex_op_output_; // key is tensor_name std::map> weight_quant_params_bak; diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.cc b/mindspore/lite/tools/converter/quantizer/quantize_util.cc index 197997a07d8..97a92ce913d 100644 --- a/mindspore/lite/tools/converter/quantizer/quantize_util.cc +++ b/mindspore/lite/tools/converter/quantizer/quantize_util.cc @@ -46,7 +46,79 @@ constexpr int 
kSingleDirBiasTensorSize = 4; constexpr int kLstmBiasShapeSize = 2; constexpr int kLstmBiasIndex = 3; constexpr size_t kBitNumPerByte = 8; + +int ComputeBiasDataAndQuantParam(const std::vector &bias_scales, const std::vector &input_scales, + const float *raw_datas, const QuantParamHolderPtr &quant_param_holder, + std::vector *quant_params, std::vector *quant_datas) { + MS_ASSERT(raw_datas != nullptr && quant_param_holder != nullptr); + MS_ASSERT(quant_params != nullptr && quant_datas != nullptr); + double bias_scale_tmp; + const constexpr double quanted_bias_abs_limit = 0.5 * INT32_MAX; + MS_CHECK_TRUE_MSG(quant_param_holder->get_input_quant_params().size() > 1, RET_ERROR, "invalid access."); + auto weight_quant_params = quant_param_holder->get_input_quant_params().at(1); + auto shape_size = quant_datas->size(); + if (bias_scales.size() == shape_size) { + for (size_t i = 0; i < shape_size; i++) { + bias_scale_tmp = bias_scales[i]; + if (fabs(bias_scale_tmp) <= 0.0f) { + MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0."; + return RET_ERROR; + } + if (std::abs(raw_datas[i] / bias_scale_tmp) >= quanted_bias_abs_limit) { + MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[i].scale + << " is too small, need to update"; + // update filter scale and zp + double activate_scale = input_scales[0]; + double filter_scale = std::abs(raw_datas[i]) / (activate_scale * quanted_bias_abs_limit); + weight_quant_params[i].scale = filter_scale; + weight_quant_params[i].zeroPoint = 0; + quant_param_holder->set_input_quant_param(1, weight_quant_params); + bias_scale_tmp = std::abs(raw_datas[i]) / quanted_bias_abs_limit; + quant_params->at(i).scale = bias_scale_tmp; + MS_LOG(DEBUG) << "new filter scale: " << filter_scale; + } + auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp); + quant_datas->at(i) = quant_data; + } + return RET_OK; + } else if (bias_scales.size() == 1) { + // for fc, per tensor quant + bias_scale_tmp = quant_params->front().scale; + float max_raw_data = 0.0f; + for (size_t i = 0; i < shape_size; i++) { + if (std::abs(raw_datas[i]) > max_raw_data) { + max_raw_data = std::abs(raw_datas[i]); + } + } + if (fabs(bias_scale_tmp) <= 0.0f) { + MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0."; + return RET_ERROR; + } + if (std::abs(max_raw_data / bias_scale_tmp) >= quanted_bias_abs_limit) { + MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[0].scale + << " is too small, need to update"; + double activate_scale = input_scales[0]; + MS_CHECK_TRUE_MSG(activate_scale != 0, RET_ERROR, "activate_scale == 0"); + double filter_scale = std::abs(max_raw_data) / (activate_scale * quanted_bias_abs_limit); + weight_quant_params[0].scale = filter_scale; + weight_quant_params[0].zeroPoint = 0; + quant_param_holder->set_input_quant_param(1, weight_quant_params); + bias_scale_tmp = max_raw_data / quanted_bias_abs_limit; + quant_params->front().scale = bias_scale_tmp; + MS_LOG(DEBUG) << "new filter scale: " << filter_scale; + } + for (size_t i = 0; i < shape_size; i++) { + auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp); + quant_datas->at(i) = quant_data; + } + return RET_OK; + } + MS_LOG(ERROR) << "unexpected input_scales size: " << input_scales.size() + << " weight_scales size: " << weight_quant_params.size(); + return RET_ERROR; +} } // namespace + QuantParamHolderPtr GetCNodeQuantHolder(const PrimitivePtr &primitive) { MS_CHECK_TRUE_RET(primitive != nullptr, nullptr); 
 QuantParamHolderPtr quant_params_holder = nullptr;
@@ -459,4 +531,92 @@ std::string BoolVectorToString(const std::vector<bool> &bool_vec) {
   }
   return str;
 }
+
+int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive) {
+  CHECK_NULL_RETURN(bias);
+  CHECK_NULL_RETURN(primitive);
+  auto bias_default_param = bias->default_param();
+  auto bias_param = bias_default_param->cast<tensor::TensorPtr>();
+  MS_ASSERT(bias_param != nullptr);
+  auto quant_param_holder = GetCNodeQuantHolder(primitive);
+  MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr.");
+  auto active_weight_quant_params = quant_param_holder->get_input_quant_params();
+
+  auto active_params = active_weight_quant_params.at(FIRST_INPUT);
+  auto weight_params = active_weight_quant_params.at(SECOND_INPUT);
+
+  vector<double> input_scales;
+  vector<double> filter_scales;
+  vector<double> bias_scales;
+  size_t sizeX = active_params.size();
+  for (size_t i = 0; i < sizeX; i++) {
+    input_scales.emplace_back(active_params[i].scale);
+  }
+  size_t sizeY = weight_params.size();
+  if (sizeX != sizeY) {
+    if (sizeX > 1 && sizeY > 1) {
+      MS_LOG(ERROR) << "input and filter's scale count cannot match!";
+      return RET_ERROR;
+    }
+  }
+  for (size_t i = 0; i < sizeY; i++) {
+    filter_scales.emplace_back(weight_params[i].scale);
+  }
+  size_t size = std::max(sizeX, sizeY);
+  for (size_t i = 0; i < size; i++) {
+    auto scaleX = sizeX > 1 ? input_scales[i] : input_scales[0];
+    auto scaleY = sizeY > 1 ? filter_scales[i] : filter_scales[0];
+    bias_scales.push_back(scaleX * scaleY);
+  }
+  MS_ASSERT(!bias_scales.empty());
+  size_t shape_size = bias_param->DataSize();
+
+  // set bias quant param
+  std::vector<schema::QuantParamT> quant_params;
+  for (double bias_scale : bias_scales) {
+    schema::QuantParamT quant_param;
+    if (bias_scale == 0) {
+      MS_LOG(WARNING) << "bias_scale == 0";
+      quant_param.scale = 1;
+    } else {
+      quant_param.scale = bias_scale;
+    }
+    quant_param.numBits = k32Bit;
+    quant_param.zeroPoint = 0;
+    quant_param.inited = true;
+    quant_params.emplace_back(quant_param);
+  }
+  // quant bias data
+  std::vector<int32_t> quant_datas(shape_size);
+
+  auto *raw_datas = static_cast<float *>(bias_param->data_c());
+  if (ComputeBiasDataAndQuantParam(bias_scales, input_scales, raw_datas, quant_param_holder, &quant_params,
+                                   &quant_datas) != RET_OK) {
+    MS_LOG(ERROR) << "compute bias data failed.";
+    return RET_ERROR;
+  }
+  quant_param_holder->set_input_quant_param(THIRD_INPUT, quant_params);
+  auto ret = SetTensorData(bias_param, quant_datas.data(), shape_size * sizeof(int32_t));
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "set tensor data failed.";
+    return RET_ERROR;
+  }
+  // set dtype
+  auto abstractBase = bias->abstract();
+  if (abstractBase == nullptr) {
+    MS_LOG(ERROR) << "Abstract of parameter is nullptr, " << bias->name();
+    return RET_ERROR;
+  }
+  if (!utils::isa<abstract::AbstractTensorPtr>(abstractBase)) {
+    MS_LOG(ERROR) << "Abstract of parameter should be abstract tensor, " << bias->name();
+    return RET_ERROR;
+  }
+  auto abstractTensor = utils::cast<abstract::AbstractTensorPtr>(abstractBase);
+  if (abstractTensor == nullptr || abstractTensor->element() == nullptr) {
+    MS_LOG(ERROR) << "abstractTensor is nullptr, " << bias->name();
+    return RET_NULL_PTR;
+  }
+  abstractTensor->element()->set_type(TypeIdToType(kNumberTypeInt32));
+  return RET_OK;
+}
 }  // namespace mindspore::lite::quant
diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.h b/mindspore/lite/tools/converter/quantizer/quantize_util.h
index b320af594ab..e928d4401c3 100644
--- a/mindspore/lite/tools/converter/quantizer/quantize_util.h
+++ b/mindspore/lite/tools/converter/quantizer/quantize_util.h
@@ -96,6 +96,8 @@ int GetPreferredDim(const PrimitivePtr &primitive, int input_index, const std::vector<int> &dims)
 std::vector<int32_t> ConvertShapeVectorToInt32(const ShapeVector &dims);
 
+int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive);
+
 template <typename T>
 int FixedBitQuantFilter(const AnfNodePtr &parameter, const tensor::TensorPtr &weight, const PrimitivePtr &primitive,
                         QuantType quant_type, int quant_max, int quant_min, size_t bit_num,
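
Reviewer note: the arithmetic this patch moves into BiasCorrectionStrategy is compact enough to sketch in isolation. The sketch below is illustrative only and is not part of the patch; it assumes plain std::vector inputs, and the helper names ChannelMeans and CorrectBias are hypothetical. It mirrors the patch's flow: average each output channel over the N*H*W positions of an NHWC activation, take the calibration-averaged gap (fp32 mean minus dequantized int8 mean), convert it to int32 bias steps via the per-channel bias scale (input_scale * weight_scale), and clamp to 0.6 * INT32_MAX as DoCNodeBiasCorrection does.

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Per-channel mean of an NHWC activation: average over the N*H*W positions
    // of each channel, as the float/int8 after-callbacks do for each batch.
    std::vector<float> ChannelMeans(const std::vector<float> &data, size_t channels) {
      assert(channels != 0 && data.size() % channels == 0);
      const size_t rows = data.size() / channels;
      std::vector<float> means(channels, 0.0f);
      for (size_t j = 0; j < rows; ++j) {
        for (size_t c = 0; c < channels; ++c) {
          means[c] += data[j * channels + c];
        }
      }
      for (auto &m : means) {
        m /= rows;
      }
      return means;
    }

    // Fold the batch-averaged (fp32 mean - dequantized int8 mean) gap into the
    // int32 bias. bias_scales[i] is input_scale * weight_scale[i]; corrected
    // values are clamped to +/- 0.6 * INT32_MAX, matching the patch.
    void CorrectBias(const std::vector<float> &bias_diff, const std::vector<double> &bias_scales,
                     std::vector<int32_t> *bias) {
      constexpr int32_t kLimit = static_cast<int32_t>(0.6 * INT32_MAX);
      for (size_t i = 0; i < bias->size(); ++i) {
        const double step = std::round(bias_diff[i] / bias_scales[i]);
        const double corrected = step + (*bias)[i];
        if (corrected > kLimit) {
          (*bias)[i] = kLimit;
        } else if (corrected < -kLimit) {
          (*bias)[i] = -kLimit;
        } else {
          (*bias)[i] += static_cast<int32_t>(step);
        }
      }
    }

The 0.6 * INT32_MAX clamp presumably leaves headroom in the int32 accumulator; the patch does not state the rationale, so treat the constant as inherited behavior rather than a tuned value.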