Abstract bias correction

yeyunpeng2020 2021-12-27 16:07:23 +08:00
parent 300a1ade7c
commit 9ad4064750
8 changed files with 743 additions and 685 deletions

View File

@ -120,12 +120,6 @@ class QuantParamHolder : public Value {
std::vector<std::vector<schema::QuantParamT>> get_output_quant_params() const { return this->output_quant_params_; }
// deprecated
void ClearInputOutputQuantParam() {
input_quant_params_.clear();
output_quant_params_.clear();
}
bool IsInputQuantParamsInited() {
if (this->input_quant_params_.empty()) {
return false;

View File

@ -0,0 +1,488 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tools/converter/quantizer/bias_correction_strategy.h"
#include <dirent.h>
#include <future>
#include <set>
#include <memory>
#include <functional>
#include <numeric>
#include <string>
#include <thread>
#include <vector>
#include <algorithm>
#include "src/common/log_adapter.h"
#include "include/errorcode.h"
#include "mindapi/base/type_id.h"
#include "tools/common/tensor_util.h"
namespace mindspore::lite::quant {
namespace {
constexpr int kHasBiasTensorSize = 3;
const char *kTypeConv2D = schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion);
} // namespace
int BiasCorrectionStrategy::CheckFp32TensorVec(const std::string &node_name,
const std::vector<mindspore::tensor::MSTensor *> &tensor_vec) {
if (tensor_vec.empty()) {
MS_LOG(ERROR) << "node: " << node_name << " input tensors is 0";
return RET_ERROR;
}
auto *tensor = tensor_vec[0];
CHECK_NULL_RETURN(tensor);
if (tensor->data_type() != kNumberTypeFloat32) {
MS_LOG(INFO) << "node: " << node_name << " will not quantize"
<< " tensor data_type: " << tensor->data_type();
return RET_ERROR;
}
return RET_OK;
}
bool BiasCorrectionStrategy::OpInputDataHandle(OperationType type, const string &op_name, std::vector<float> *data) {
MS_ASSERT(data != nullptr);
std::lock_guard<std::mutex> lg(mutex_op_input_);
if (type == STORE) {
if (fp32_op_input_map_.find(op_name) != fp32_op_input_map_.end()) {
// the data has not been fetched by int8 model
return false;
}
fp32_op_input_map_[op_name] = *data;
return true;
} else if (type == FETCH) {
if (fp32_op_input_map_.find(op_name) == fp32_op_input_map_.end()) {
// the data has not been generated by the fp32 model yet
return false;
}
*data = fp32_op_input_map_[op_name];
fp32_op_input_map_.erase(op_name);
return true;
} else {
MS_LOG(ERROR) << "unexpected type: " << type;
}
return false;
}
bool BiasCorrectionStrategy::OpOutputChMeanDataHandle(OperationType type, const string &op_name,
std::vector<float> *data) {
MS_ASSERT(data != nullptr);
std::lock_guard<std::mutex> lg(mutex_op_output_);
if (type == STORE) {
if (fp32_op_output_ch_mean_map_.find(op_name) != fp32_op_output_ch_mean_map_.end()) {
// the data has not been fetched by int8 model
return false;
}
fp32_op_output_ch_mean_map_[op_name] = *data;
return true;
} else if (type == FETCH) {
if (fp32_op_output_ch_mean_map_.find(op_name) == fp32_op_output_ch_mean_map_.end()) {
// the data has not been generated by the fp32 model yet
return false;
}
*data = fp32_op_output_ch_mean_map_[op_name];
fp32_op_output_ch_mean_map_.erase(op_name);
return true;
} else {
MS_LOG(ERROR) << "unexpected type: " << type;
}
return false;
}
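// Note: the two handlers above form a single-slot, mutex-guarded handshake between the
// fp32 and int8 inference threads: the fp32 callbacks STORE each Conv2D input (and
// per-channel output mean), and the int8 callbacks FETCH and erase the entry. A caller
// that cannot proceed yet simply polls, as the callbacks below do:
//   while (!OpInputDataHandle(FETCH, callParam.node_name, &fp32_op_input)) {
//     std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
//   }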
KernelCallBack BiasCorrectionStrategy::GetBeforeCallBack(bool int8_op) {
KernelCallBack before_call_back;
if (!int8_op) {
before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
if (CheckFp32TensorVec(callParam.node_name, before_inputs) != RET_OK) {
return true;
}
auto tensor = before_inputs[0];
MS_ASSERT(tensor != nullptr);
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
std::vector<float> fp32_op_input(elem_count);
auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size());
if (ret != EOK) {
MS_LOG(ERROR) << "memcpy error: " << ret;
return false;
}
while (!OpInputDataHandle(STORE, callParam.node_name, &fp32_op_input)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
}
return true;
};
} else {
before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
std::vector<float> fp32_op_input;
while (!OpInputDataHandle(FETCH, callParam.node_name, &fp32_op_input)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
auto tensor = before_inputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeInt8) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
// do quantization: activations are always per-layer quantized
std::vector<int8_t> quant_datas;
auto quant_params = tensor->quant_params();
if (quant_params.size() != 1) {
MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size();
return false;
}
schema::QuantParamT quant_param_t;
quant_param_t.scale = quant_params[0].scale;
quant_param_t.zeroPoint = quant_params[0].zeroPoint;
for (auto float_data : fp32_op_input) {
auto quant_data = QuantizeData<int8_t>(float_data, &quant_param_t, activation_q_max_, activation_q_min_);
quant_datas.push_back(quant_data);
}
if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) {
MS_LOG(ERROR) << "unexpected tensor size: " << quant_datas.size()
<< " not the same with: " << quant_datas.size() * sizeof(int8_t);
return false;
}
auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t));
if (ret != EOK) {
MS_LOG(ERROR) << "memcpy error: " << ret;
return false;
}
}
return true;
};
}
return before_call_back;
}
KernelCallBack BiasCorrectionStrategy::GetAfterCallBack(bool int8_op) {
KernelCallBack after_call_back;
if (!int8_op) {
return GetFloatAfterCallBack();
}
return GetInt8AfterCallBack();
}
KernelCallBack BiasCorrectionStrategy::GetInt8AfterCallBack() {
KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
std::vector<float> fp32_op_output_ch_mean;
while (!OpOutputChMeanDataHandle(FETCH, callParam.node_name, &fp32_op_output_ch_mean)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
auto tensor = afterOutputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeInt8) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
const int8_t *tensor_data = static_cast<int8_t *>(tensor->data());
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
auto shapes = tensor->shape();
if (shapes.size() != DIMENSION_4D) {
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
return false;
}
// suppose the format is NHWC
auto channels = shapes[FOURTH_INPUT];
if (channels == 0) {
MS_LOG(ERROR) << "unexpected channels: 0";
return false;
}
auto quant_params = tensor->quant_params();
if (quant_params.size() != 1) {
MS_LOG(ERROR) << "unexpected activatation quant_params size: " << quant_params.size();
return false;
}
auto scale = quant_params[0].scale;
auto zp = quant_params[0].zeroPoint;
std::vector<float> dequant_op_output_ch_mean(channels);
auto one_filter_size = elem_count / channels;
for (int i = 0; i < channels; i++) {
float sum = 0;
for (size_t j = 0; j < one_filter_size; j++) {
auto index = j * channels + i;
if (index >= elem_count) {
MS_LOG(ERROR) << "over flow!";
return false;
}
// dequantize activation
auto float_data = scale * (tensor_data[index] - zp);
sum += float_data;
}
if (one_filter_size == 0) {
MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
return false;
}
sum = sum / one_filter_size;
dequant_op_output_ch_mean[i] = sum;
}
std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(),
dequant_op_output_ch_mean.begin(), std::minus<>());
if (op_bias_diff_map_.find(callParam.node_name) != op_bias_diff_map_.end()) {
auto &bias_diff = op_bias_diff_map_[callParam.node_name];
std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(),
std::plus<>());
} else {
op_bias_diff_map_[callParam.node_name] = dequant_op_output_ch_mean;
}
}
return true;
};
return after_call_back;
}
KernelCallBack BiasCorrectionStrategy::GetFloatAfterCallBack() {
KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
if (CheckFp32TensorVec(callParam.node_name, afterOutputs) != RET_OK) {
return true;
}
auto tensor = afterOutputs[0];
MS_ASSERT(tensor != nullptr);
const auto *tensor_data = static_cast<const float *>(tensor->data());
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
auto shapes = tensor->shape();
if (shapes.size() != DIMENSION_4D) {
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
return false;
}
// suppose the activation format: NHWC
auto channels = shapes[FOURTH_INPUT];
if (channels == 0) {
MS_LOG(ERROR) << "unexpected channels: 0";
return false;
}
std::vector<float> fp32_op_output_ch_mean(channels);
auto one_filter_size = elem_count / channels;
for (int i = 0; i < channels; i++) {
float sum = 0;
for (size_t j = 0; j < one_filter_size; j++) {
auto index = j * channels + i;
if (index >= elem_count) {
MS_LOG(ERROR) << "over flow!";
return false;
}
sum += tensor_data[index];
}
if (one_filter_size == 0) {
MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
return false;
}
sum = sum / one_filter_size;
fp32_op_output_ch_mean[i] = sum;
}
while (!OpOutputChMeanDataHandle(STORE, callParam.node_name, &fp32_op_output_ch_mean)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
}
return true;
};
return after_call_back;
}
int BiasCorrectionStrategy::Int8Inference() {
// int8 inference
std::vector<mindspore::tensor::MSTensor *> inputs = int8_session_->GetInputs();
for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) {
for (size_t input_index = 0; input_index < inputs.size(); input_index++) {
int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]);
if (status != RET_OK) {
MS_LOG(ERROR) << "generate input data failed!";
return RET_ERROR;
}
}
// before func
KernelCallBack before_call_back = GetBeforeCallBack(true);
// after func
KernelCallBack after_call_back = GetAfterCallBack(true);
int8_session_->BindThread(true);
auto status = int8_session_->RunGraph(before_call_back, after_call_back);
int8_session_->BindThread(false);
if (status != RET_OK) {
MS_LOG(ERROR) << "run model failed!";
return RET_ERROR;
}
} // end for images
return RET_OK;
}
int BiasCorrectionStrategy::DoBiasCorrection(const FuncGraphPtr &func_graph) {
// init int8 session
MS_LOG(INFO) << "create quant session";
flags_.commonQuantParam.quant_type = schema::QuantType_QUANT_ALL;
auto int8_sm = CreateSessionByFuncGraph(func_graph, flags_, this->flags_.commonQuantParam.thread_num);
int8_session_ = int8_sm.session;
int8_model_ = int8_sm.model;
if (int8_session_ == nullptr || int8_model_ == nullptr) {
MS_LOG(ERROR) << "create session failed!";
return RET_ERROR;
}
std::future<int> int8_inference = std::async(std::launch::async, &BiasCorrectionStrategy::Int8Inference, this);
// get input tensor
std::vector<mindspore::tensor::MSTensor *> inputs = fp32_session_->GetInputs();
// fp32 inference
for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) {
for (size_t input_index = 0; input_index < inputs.size(); input_index++) {
int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]);
if (status != RET_OK) {
MS_LOG(ERROR) << "generate input data from images failed!";
return RET_ERROR;
}
}
// before func
KernelCallBack before_call_back = GetBeforeCallBack(false);
// after func
KernelCallBack after_call_back = GetAfterCallBack(false);
fp32_session_->BindThread(true);
auto status = fp32_session_->RunGraph(before_call_back, after_call_back);
fp32_session_->BindThread(false);
if (status != RET_OK) {
MS_LOG(ERROR) << "run model failed!";
return RET_ERROR;
}
} // end for images
int status = int8_inference.get();
if (status != RET_OK) {
MS_LOG(ERROR) << "int8 inference failed!";
return RET_ERROR;
}
if (calibrator_->GetBatchNum() == 0) {
MS_LOG(ERROR) << "divisor 'calibrate_size' cannot be 0.";
return RET_ERROR;
}
for (auto &key_value : op_bias_diff_map_) {
std::for_each(key_value.second.begin(), key_value.second.end(),
[this](float &data) { data = data / calibrator_->GetBatchNum(); });
}
auto cnodes = func_graph->GetOrderedCnodes();
for (auto &cnode : cnodes) {
auto op_name = cnode->fullname_with_scope();
if (op_bias_diff_map_.find(op_name) == op_bias_diff_map_.end()) {
continue;
}
status = DoCNodeBiasCorrection(func_graph, cnode);
if (status != RET_OK) {
MS_LOG(ERROR) << "do node bias correct failed.";
break;
}
}
return status;
}
int BiasCorrectionStrategy::DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
auto op_name = cnode->fullname_with_scope();
const auto &bias_diff = op_bias_diff_map_[op_name];
auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
if (primitive == nullptr) {
MS_LOG(ERROR) << op_name << " primitive is nullptr";
return RET_NULL_PTR;
}
auto quant_param_holder = GetCNodeQuantHolder(primitive);
MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr.");
auto input_quant_params = quant_param_holder->get_input_quant_params();
if (input_quant_params.size() == kHasBiasTensorSize) {
// compensate the existing bias
auto bias_quant_params = input_quant_params.at(THIRD_INPUT);
auto bias = cnode->input(THIRD_INPUT + 1);
auto bias_parameter_ptr = bias->cast<ParameterPtr>();
auto bias_default_param = bias_parameter_ptr->default_param();
auto bias_param = bias_default_param->cast<tensor::TensorPtr>();
int *bias_datas = static_cast<int *>(bias_param->data_c());
if (static_cast<size_t>(bias_param->DataSize()) != bias_diff.size()) {
MS_LOG(DEBUG) << op_name << " unexpected bias data count: " << bias_param->DataSize()
<< " not the same as bias_diff: " << bias_diff.size();
return RET_ERROR;
}
if (bias_quant_params.size() != bias_diff.size()) {
MS_LOG(ERROR) << op_name << " unexpected bias quant params size: " << bias_quant_params.size()
<< " not the same as bias_diff: " << bias_diff.size();
return RET_ERROR;
}
for (size_t i = 0; i < bias_param->DataSize(); i++) {
auto scale = bias_quant_params[i].scale;
if (fabs(scale) <= 0.0f) {
MS_LOG(ERROR) << op_name << " divisor 'scale' cannot be 0.";
return RET_ERROR;
}
double after_correct = std::round(bias_diff[i] / scale) + bias_datas[i];
const constexpr int32_t corrected_bias_abs_limit = 0.6 * INT32_MAX;
if (after_correct > corrected_bias_abs_limit) {
MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too large: " << after_correct
<< " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i] << " scale: " << scale;
bias_datas[i] = static_cast<int>(corrected_bias_abs_limit);
} else if (after_correct < -corrected_bias_abs_limit) {
MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too small: " << after_correct
<< " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i] << " scale: " << scale;
bias_datas[i] = static_cast<int>(-corrected_bias_abs_limit);
} else {
auto diff = static_cast<int>(std::round(bias_diff[i] / scale));
bias_datas[i] += diff;
}
}
} else if (input_quant_params.size() == kHasBiasTensorSize - 1) {
MS_LOG(INFO) << op_name << " add bias input";
// need to add bias input
auto parameter = func_graph->add_parameter();
if (parameter == nullptr) {
MS_LOG(ERROR) << "parameter is nullptr.";
return RET_NULL_PTR;
}
std::vector<int64_t> shape;
shape.push_back(bias_diff.size());
auto tensor_info = CreateTensorInfo(bias_diff.data(), sizeof(float) * bias_diff.size(), shape, kNumberTypeFloat32);
if (tensor_info == nullptr) {
MS_LOG(ERROR) << op_name << " create tensor info failed.";
return RET_ERROR;
}
auto status = InitParameterFromTensorInfo(parameter, tensor_info);
if (status != RET_OK) {
MS_LOG(ERROR) << op_name << " init parameter from tensor info failed";
return RET_ERROR;
}
parameter->set_name("added_" + op_name + "_bias");
cnode->add_input(parameter);
status = DoParameterBiasQuant(parameter, primitive);
if (status != RET_OK) {
MS_LOG(ERROR) << op_name << " Do bias quant failed.";
return RET_ERROR;
}
} else {
MS_LOG(WARNING) << op_name << " unexpected input quant params size: " << input_quant_params.size()
<< "; bias correction is not yet supported for shared weight tensors.";
}
return RET_OK;
}
} // namespace mindspore::lite::quant
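In effect, the correction above averages, per output channel and over all calibration batches, the fp32 Conv2D output and the dequantized int8 output, then folds the difference into the int32 bias as round(diff / bias_scale), clamped to ±0.6 · INT32_MAX. A minimal standalone sketch of that arithmetic, assuming NHWC activations and per-channel bias quantization (helper names are illustrative, not part of the converter API):

#include <cmath>
#include <cstdint>
#include <vector>

// Per-channel mean of an NHWC activation buffer; assumes channels > 0 and
// that the buffer size is a multiple of channels.
std::vector<float> ChannelMeans(const std::vector<float> &nhwc, int channels) {
  std::vector<float> means(channels, 0.0f);
  size_t one_filter_size = nhwc.size() / channels;
  for (int c = 0; c < channels; ++c) {
    for (size_t j = 0; j < one_filter_size; ++j) {
      means[c] += nhwc[j * channels + c];
    }
    means[c] /= one_filter_size;
  }
  return means;
}

// Fold the fp32-vs-dequantized-int8 output drift into the already-quantized int32 bias.
void ApplyBiasDiff(const std::vector<float> &bias_diff, const std::vector<double> &bias_scales,
                   std::vector<int32_t> *bias) {
  for (size_t c = 0; c < bias->size(); ++c) {
    (*bias)[c] += static_cast<int32_t>(std::round(bias_diff[c] / bias_scales[c]));
  }
}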

View File

@ -0,0 +1,83 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BIASCORRECTION_H
#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BIASCORRECTION_H
#include <memory>
#include <map>
#include <string>
#include <vector>
#include "base/base.h"
#include "ir/anf.h"
#include "tools/converter/quantizer/calibrator.h"
namespace mindspore::lite::quant {
enum OperationType {
STORE,
FETCH,
};
class BiasCorrectionStrategy {
public:
BiasCorrectionStrategy(const converter::Flags &flags, const std::shared_ptr<Calibrator> &calibrator,
session::LiteSession *fp32_session, Model *fp32_model, int activation_q_min,
int activation_q_max)
: flags_(flags),
calibrator_(calibrator),
fp32_session_(fp32_session),
fp32_model_(fp32_model),
activation_q_min_(activation_q_min),
activation_q_max_(activation_q_max) {}
~BiasCorrectionStrategy() {
if (int8_session_ != nullptr) {
delete int8_session_;
}
if (int8_model_ != nullptr) {
delete int8_model_;
}
}
int DoBiasCorrection(const FuncGraphPtr &func_graph);
private:
int DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode);
int Int8Inference();
bool OpInputDataHandle(OperationType type, const string &op_name, std::vector<float> *data);
bool OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector<float> *data);
KernelCallBack GetBeforeCallBack(bool int8_op);
KernelCallBack GetAfterCallBack(bool int8_op);
KernelCallBack GetInt8AfterCallBack();
KernelCallBack GetFloatAfterCallBack();
int CheckFp32TensorVec(const std::string &node_name, const std::vector<mindspore::tensor::MSTensor *> &tensor_vec);
private:
converter::Flags flags_;
std::shared_ptr<Calibrator> calibrator_{nullptr};
session::LiteSession *fp32_session_{nullptr};
Model *fp32_model_{nullptr};
int activation_q_min_{INT8_MIN};
int activation_q_max_{INT8_MAX};
session::LiteSession *int8_session_{nullptr};
Model *int8_model_{nullptr};
std::map<std::string, std::vector<float>> fp32_op_input_map_; // concurrency
std::map<std::string, std::vector<float>> fp32_op_output_ch_mean_map_; // concurrency
std::map<std::string, std::vector<float>> op_bias_diff_map_; // only used by the int8 model
std::mutex mutex_op_input_;
std::mutex mutex_op_output_;
};
} // namespace mindspore::lite::quant
#endif // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BIASCORRECTION_H
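As the full_quant_quantizer.cc hunk further down shows, the quantizer now only needs to construct the strategy and call DoBiasCorrection; a condensed sketch of that call site (error handling largely elided):

BiasCorrectionStrategy strategy(flags_, calibrator_, fp32_session_, fp32_model_,
                                activation_q_min_, activation_q_max_);
auto status = strategy.DoBiasCorrection(func_graph);
if (status != RET_OK) {
  MS_LOG(ERROR) << "bias_correction failed.";
}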

View File

@ -14,9 +14,9 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER__GENERAL_BITPACKING_H
#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER__GENERAL_BITPACKING_H
#include <stdint.h>
#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H
#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H
#include <cstdint>
#include <stack>
#include <queue>
#include <vector>

View File

@ -16,18 +16,11 @@
#include "tools/converter/quantizer/full_quant_quantizer.h"
#include <dirent.h>
#include <future>
#include <set>
#include <memory>
#include <unordered_map>
#include <functional>
#include <numeric>
#include <utility>
#include <string>
#include <thread>
#include <vector>
#include <algorithm>
#include "ops/fusion/full_connection.h"
#include "ops/tuple_get_item.h"
#include "src/tensor.h"
#include "tools/converter/quantizer/quant_cast.h"
@ -41,6 +34,7 @@
#include "tools/common/node_util.h"
#include "nnacl/op_base.h"
#include "src/common/log_util.h"
#include "tools/converter/quantizer/bias_correction_strategy.h"
using std::string;
using std::vector;
@ -50,88 +44,10 @@ namespace {
static const std::set<PrimitivePtr> has_bias_operator = {prim::kPrimConv2DFusion, prim::kPrimConv2dTransposeFusion,
prim::kPrimMatMulFusion, prim::kPrimFullConnection,
prim::kPrimLayerNormFusion};
constexpr int kHasBiasTensorSize = 3;
constexpr int KBiasBitNum = 32;
const char *kTypeConv2D = schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion);
} // namespace
namespace {
int ComputeBiasDataAndQuantParam(const std::vector<double> &bias_scales, const std::vector<double> &input_scales,
const float *raw_datas, const QuantParamHolderPtr &quant_param_holder,
std::vector<schema::QuantParamT> *quant_params, std::vector<int32_t> *quant_datas) {
MS_ASSERT(raw_datas != nullptr && quant_param_holder != nullptr);
MS_ASSERT(quant_params != nullptr && quant_datas != nullptr);
double bias_scale_tmp;
const constexpr double quanted_bias_abs_limit = 0.5 * INT32_MAX;
MS_CHECK_TRUE_MSG(quant_param_holder->get_input_quant_params().size() > 1, RET_ERROR, "invalid access.");
auto weight_quant_params = quant_param_holder->get_input_quant_params().at(1);
auto shape_size = quant_datas->size();
if (bias_scales.size() == shape_size) {
for (size_t i = 0; i < shape_size; i++) {
bias_scale_tmp = bias_scales[i];
if (fabs(bias_scale_tmp) <= 0.0f) {
MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0.";
return RET_ERROR;
}
if (std::abs(raw_datas[i] / bias_scale_tmp) >= quanted_bias_abs_limit) {
MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[i].scale
<< " is too small, need to update";
// update filter scale and zp
double activate_scale = input_scales[0];
double filter_scale = std::abs(raw_datas[i]) / (activate_scale * quanted_bias_abs_limit);
weight_quant_params[i].scale = filter_scale;
weight_quant_params[i].zeroPoint = 0;
quant_param_holder->set_input_quant_param(1, weight_quant_params);
bias_scale_tmp = std::abs(raw_datas[i]) / quanted_bias_abs_limit;
quant_params->at(i).scale = bias_scale_tmp;
MS_LOG(DEBUG) << "new filter scale: " << filter_scale;
}
auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp);
quant_datas->at(i) = quant_data;
}
return RET_OK;
} else if (bias_scales.size() == 1) {
// for fc, per tensor quant
bias_scale_tmp = quant_params->front().scale;
float max_raw_data = 0.0f;
for (size_t i = 0; i < shape_size; i++) {
if (std::abs(raw_datas[i]) > max_raw_data) {
max_raw_data = std::abs(raw_datas[i]);
}
}
if (fabs(bias_scale_tmp) <= 0.0f) {
MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0.";
return RET_ERROR;
}
if (std::abs(max_raw_data / bias_scale_tmp) >= quanted_bias_abs_limit) {
MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[0].scale
<< " is too small, need to update";
double activate_scale = input_scales[0];
MS_CHECK_TRUE_MSG(activate_scale != 0, RET_ERROR, "activate_scale == 0");
double filter_scale = std::abs(max_raw_data) / (activate_scale * quanted_bias_abs_limit);
weight_quant_params[0].scale = filter_scale;
weight_quant_params[0].zeroPoint = 0;
quant_param_holder->set_input_quant_param(1, weight_quant_params);
bias_scale_tmp = max_raw_data / quanted_bias_abs_limit;
quant_params->front().scale = bias_scale_tmp;
MS_LOG(DEBUG) << "new filter scale: " << filter_scale;
}
for (size_t i = 0; i < shape_size; i++) {
auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp);
quant_datas->at(i) = quant_data;
}
return RET_OK;
}
MS_LOG(ERROR) << "unexpected input_scales size: " << input_scales.size()
<< " weight_scales size: " << weight_quant_params.size();
return RET_ERROR;
}
} // namespace
FullQuantQuantizer::~FullQuantQuantizer() {
delete fp32_session_;
delete fp32_model_;
delete int8_session_;
delete int8_model_;
}
int FullQuantQuantizer::SetInOutQuantParam(const AnfNodePtr &input_node, const std::unique_ptr<DataDistribution> &info,
@ -206,94 +122,6 @@ int FullQuantQuantizer::DoValueNodeWeightQuant(const ValueNodePtr &weight, const
return RET_OK;
}
int FullQuantQuantizer::DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive) {
CHECK_NULL_RETURN(bias);
CHECK_NULL_RETURN(primitive);
auto bias_default_param = bias->default_param();
auto bias_param = bias_default_param->cast<tensor::TensorPtr>();
MS_ASSERT(bias_parameter != nullptr);
auto quant_param_holder = GetCNodeQuantHolder(primitive);
MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr.");
auto active_weight_quant_params = quant_param_holder->get_input_quant_params();
auto active_params = active_weight_quant_params.at(FIRST_INPUT);
auto weight_params = active_weight_quant_params.at(SECOND_INPUT);
vector<double> input_scales;
vector<double> filter_scales;
vector<double> bias_scales;
size_t sizeX = active_params.size();
for (size_t i = 0; i < sizeX; i++) {
input_scales.emplace_back(active_params[i].scale);
}
size_t sizeY = weight_params.size();
if (sizeX != sizeY) {
if (sizeX > 1 && sizeY > 1) {
MS_LOG(ERROR) << "input and filter's scale count cannot match!";
return RET_ERROR;
}
}
for (size_t i = 0; i < sizeY; i++) {
filter_scales.emplace_back(weight_params[i].scale);
}
size_t size = std::max(sizeX, sizeY);
for (size_t i = 0; i < size; i++) {
auto scaleX = sizeX > 1 ? input_scales[i] : input_scales[0];
auto scaleY = sizeY > 1 ? filter_scales[i] : filter_scales[0];
bias_scales.push_back(scaleX * scaleY);
}
MS_ASSERT(!bias_scales.empty());
size_t shape_size = bias_param->DataSize();
// set bias quant param
std::vector<schema::QuantParamT> quant_params;
for (double bias_scale : bias_scales) {
schema::QuantParamT quant_param;
if (bias_scale == 0) {
MS_LOG(WARNING) << "bias_scale == 0";
quant_param.scale = 1;
} else {
quant_param.scale = bias_scale;
}
quant_param.numBits = KBiasBitNum;
quant_param.zeroPoint = 0;
quant_param.inited = true;
quant_params.emplace_back(quant_param);
}
// quant bias data
std::vector<int32_t> quant_datas(shape_size);
auto *raw_datas = static_cast<float *>(bias_param->data_c());
if (ComputeBiasDataAndQuantParam(bias_scales, input_scales, raw_datas, quant_param_holder, &quant_params,
&quant_datas) != RET_OK) {
MS_LOG(ERROR) << "compute bias data failed.";
return RET_ERROR;
}
quant_param_holder->set_input_quant_param(THIRD_INPUT, quant_params);
auto ret = SetTensorData(bias_param, quant_datas.data(), shape_size * sizeof(int32_t));
if (ret != RET_OK) {
MS_LOG(ERROR) << "set tensor data failed.";
return RET_ERROR;
}
// set dtype
auto abstractBase = bias->abstract();
if (abstractBase == nullptr) {
MS_LOG(ERROR) << "Abstract of parameter is nullptr, " << bias->name();
return RET_ERROR;
}
if (!utils::isa<abstract::AbstractTensorPtr>(abstractBase)) {
MS_LOG(ERROR) << "Abstract of parameter should be anstract tensor, " << bias->name();
return RET_ERROR;
}
auto abstractTensor = utils::cast<abstract::AbstractTensorPtr>(abstractBase);
if (abstractTensor == nullptr || abstractTensor->element() == nullptr) {
MS_LOG(ERROR) << "abstractTensor is nullptr" << bias->name();
return RET_NULL_PTR;
}
abstractTensor->element()->set_type(TypeIdToType(kNumberTypeInt32));
return RET_OK;
}
int FullQuantQuantizer::IsSupportWeightQuant(const CNodePtr &cnode, const AnfNodePtr &input_node, size_t input_index) {
auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
if (primitive == nullptr) {
@ -628,17 +456,6 @@ int FullQuantQuantizer::MarkQuantNode(const FuncGraphPtr &func_graph) {
MS_LOG(ERROR) << cnode->fullname_with_scope() << " add quantized op failed.";
return ret;
}
auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
if (primitive == nullptr) {
MS_LOG(ERROR) << cnode->fullname_with_scope() << " primitive is null";
return RET_ERROR;
}
auto quant_param_holder = GetCNodeQuantHolder(primitive);
if (quant_param_holder == nullptr) {
MS_LOG(ERROR) << cnode->fullname_with_scope() << " quant_param_holder is null";
return RET_ERROR;
}
quant_param_holder->ClearInputOutputQuantParam();
}
}
return RET_OK;
@ -658,7 +475,7 @@ int FullQuantQuantizer::PreProcess(const FuncGraphPtr &func_graph) {
break;
}
InitQMinMax();
calibrator_ = std::make_unique<Calibrator>(this->bit_num_, activation_q_max_, activation_q_min_,
calibrator_ = std::make_shared<Calibrator>(this->bit_num_, activation_q_max_, activation_q_min_,
this->flags_.fullQuantParam.activation_quant_method,
this->flags_.dataPreProcessParam, activation_symmetry_);
MSLITE_CHECK_PTR(calibrator_);
@ -670,22 +487,6 @@ int FullQuantQuantizer::PreProcess(const FuncGraphPtr &func_graph) {
return RET_OK;
}
int FullQuantQuantizer::CheckFp32TensorVec(const std::string &node_name,
const std::vector<mindspore::tensor::MSTensor *> &tensor_vec) {
if (tensor_vec.empty()) {
MS_LOG(ERROR) << "node: " << node_name << " input tensors is 0";
return RET_ERROR;
}
auto *tensor = tensor_vec[0];
CHECK_NULL_RETURN(tensor);
if (tensor->data_type() != kNumberTypeFloat32) {
MS_LOG(INFO) << "node: " << node_name << " will not quantize"
<< " tensor data_type: " << tensor->data_type();
return RET_ERROR;
}
return RET_OK;
}
int FullQuantQuantizer::DoInference(CollectType collect_type) {
// get input tensor
vector<mindspore::tensor::MSTensor *> inputs = fp32_session_->GetInputs();
@ -736,172 +537,6 @@ int FullQuantQuantizer::DoInference(CollectType collect_type) {
return RET_OK;
}
int FullQuantQuantizer::Int8Inference() {
// int8 inference
vector<mindspore::tensor::MSTensor *> inputs = int8_session_->GetInputs();
for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) {
for (size_t input_index = 0; input_index < inputs.size(); input_index++) {
int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]);
if (status != RET_OK) {
MS_LOG(ERROR) << "generate input data failed!";
return RET_ERROR;
}
}
// before func
KernelCallBack before_call_back = GetBeforeCallBack(true);
// after func
KernelCallBack after_call_back = GetAfterCallBack(true);
int8_session_->BindThread(true);
auto status = int8_session_->RunGraph(before_call_back, after_call_back);
int8_session_->BindThread(false);
if (status != RET_OK) {
MS_LOG(ERROR) << "run model failed!";
return RET_ERROR;
}
} // end for images
return RET_OK;
}
int FullQuantQuantizer::BiasCorrection(const FuncGraphPtr &func_graph) {
std::future<int> int8_inference = std::async(std::launch::async, &FullQuantQuantizer::Int8Inference, this);
// get input tensor
vector<mindspore::tensor::MSTensor *> inputs = fp32_session_->GetInputs();
// fp32 inference
for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) {
for (size_t input_index = 0; input_index < inputs.size(); input_index++) {
int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]);
if (status != RET_OK) {
MS_LOG(ERROR) << "generate input data from images failed!";
return RET_ERROR;
}
}
// before func
KernelCallBack before_call_back = GetBeforeCallBack(false);
// after func
KernelCallBack after_call_back = GetAfterCallBack(false);
fp32_session_->BindThread(true);
auto status = fp32_session_->RunGraph(before_call_back, after_call_back);
fp32_session_->BindThread(false);
if (status != RET_OK) {
MS_LOG(ERROR) << "run model failed!";
return RET_ERROR;
}
} // end for images
int status = int8_inference.get();
if (status != RET_OK) {
MS_LOG(ERROR) << "int8 inference failed!";
return RET_ERROR;
}
if (calibrator_->GetBatchNum() == 0) {
MS_LOG(ERROR) << "divisor 'calibrate_size' cannot be 0.";
return RET_ERROR;
}
for (auto &key_value : op_bias_diff_map_) {
std::for_each(key_value.second.begin(), key_value.second.end(),
[this](float &data) { data = data / calibrator_->GetBatchNum(); });
}
auto cnodes = func_graph->GetOrderedCnodes();
for (auto &cnode : cnodes) {
auto op_name = cnode->fullname_with_scope();
if (op_bias_diff_map_.find(op_name) == op_bias_diff_map_.end()) {
continue;
}
status = BiasCorrection(func_graph, cnode);
if (status != RET_OK) {
MS_LOG(ERROR) << "do node bias correct failed.";
break;
}
}
return status;
}
int FullQuantQuantizer::BiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
auto op_name = cnode->fullname_with_scope();
const auto &bias_diff = op_bias_diff_map_[op_name];
auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
if (primitive == nullptr) {
MS_LOG(ERROR) << op_name << " primitive is nullptr";
return RET_NULL_PTR;
}
auto quant_param_holder = GetCNodeQuantHolder(primitive);
MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr.");
auto input_quant_params = quant_param_holder->get_input_quant_params();
if (input_quant_params.size() == kHasBiasTensorSize) {
// compensate the existed
auto bias_quant_params = input_quant_params.at(THIRD_INPUT);
auto bias = cnode->input(THIRD_INPUT + 1);
auto bias_parameter_ptr = bias->cast<ParameterPtr>();
auto bias_default_param = bias_parameter_ptr->default_param();
auto bias_param = bias_default_param->cast<tensor::TensorPtr>();
int *bias_datas = static_cast<int *>(bias_param->data_c());
if (static_cast<size_t>(bias_param->DataSize()) != bias_diff.size()) {
MS_LOG(DEBUG) << op_name << " unexpected bias data count: " << bias_param->DataSize()
<< " not the same as bias_diff: " << bias_diff.size();
return RET_ERROR;
}
if (bias_quant_params.size() != bias_diff.size()) {
MS_LOG(ERROR) << op_name << " unexpected bias quant params size: " << bias_quant_params.size()
<< " not the same as bias_diff: " << bias_diff.size();
return RET_ERROR;
}
for (size_t i = 0; i < bias_param->DataSize(); i++) {
auto scale = bias_quant_params[i].scale;
if (fabs(scale) <= 0.0f) {
MS_LOG(ERROR) << op_name << " divisor 'scale' cannot be 0.";
return RET_ERROR;
}
double after_correct = std::round(bias_diff[i] / scale) + bias_datas[i];
const constexpr int32_t corrected_bias_abs_limit = 0.6 * INT32_MAX;
if (after_correct > corrected_bias_abs_limit) {
MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too large: " << after_correct
<< " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i] << " scale: " << scale;
bias_datas[i] = static_cast<int>(corrected_bias_abs_limit);
} else if (after_correct < -corrected_bias_abs_limit) {
MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too small: " << after_correct
<< " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i] << " scale: " << scale;
bias_datas[i] = static_cast<int>(-corrected_bias_abs_limit);
} else {
auto diff = static_cast<int>(std::round(bias_diff[i] / scale));
bias_datas[i] += diff;
}
}
} else if (input_quant_params.size() == kHasBiasTensorSize - 1) {
MS_LOG(INFO) << op_name << " add bias input";
// need to add bias input
auto parameter = func_graph->add_parameter();
if (parameter == nullptr) {
MS_LOG(ERROR) << "parameter is nullptr.";
return RET_NULL_PTR;
}
ShapeVector shape;
shape.push_back(bias_diff.size());
auto tensor_info = CreateTensorInfo(bias_diff.data(), sizeof(float) * bias_diff.size(), shape, kNumberTypeFloat32);
if (tensor_info == nullptr) {
MS_LOG(ERROR) << op_name << " create tensor info failed.";
return RET_ERROR;
}
auto status = InitParameterFromTensorInfo(parameter, tensor_info);
if (status != RET_OK) {
MS_LOG(ERROR) << op_name << " init parameter from tensor info failed";
return RET_ERROR;
}
parameter->set_name("added_" + op_name + "_bias");
cnode->add_input(parameter);
status = DoParameterBiasQuant(parameter, primitive);
if (status != RET_OK) {
MS_LOG(ERROR) << op_name << " Do bias quant failed.";
return RET_ERROR;
}
} else {
MS_LOG(WARNING) << op_name << " unexpected size: " << input_quant_params.size()
<< ", and shared weight tensor does not support bias correction temporarily.";
}
return RET_OK;
}
int FullQuantQuantizer::DoQuantize(FuncGraphPtr func_graph) {
MS_LOG(INFO) << "start to parse config file";
if (flags_.dataPreProcessParam.calibrate_path.empty()) {
@ -968,283 +603,17 @@ int FullQuantQuantizer::DoQuantize(FuncGraphPtr func_graph) {
ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status);
return RET_ERROR;
}
SessionModel int8_sm;
if (this->flags_.fullQuantParam.bias_correction) {
// init in8 session
MS_LOG(INFO) << "create quant session";
flags_.commonQuantParam.quant_type = schema::QuantType_QUANT_ALL;
int8_sm = CreateSessionByFuncGraph(func_graph, flags_, this->flags_.commonQuantParam.thread_num);
int8_session_ = int8_sm.session;
int8_model_ = int8_sm.model;
if (int8_session_ == nullptr || int8_model_ == nullptr) {
MS_LOG(ERROR) << "create session failed!";
return RET_ERROR;
}
MS_LOG(INFO) << "do bias correction";
status = BiasCorrection(func_graph);
BiasCorrectionStrategy strategy(flags_, calibrator_, fp32_session_, fp32_model_, activation_q_min_,
activation_q_max_);
status = strategy.DoBiasCorrection(func_graph);
if (status != RET_OK) {
MS_LOG(ERROR) << "BiasCorrection failed.";
MS_LOG(ERROR) << "bias_correction failed.";
return status;
}
}
}
return RET_OK;
}
bool FullQuantQuantizer::OpInputDataHandle(OperationType type, const string &op_name, std::vector<float> *data) {
MS_ASSERT(data != nullptr);
std::lock_guard<std::mutex> lg(mutex_op_input_);
if (type == STORE) {
if (fp32_op_input_map_.find(op_name) != fp32_op_input_map_.end()) {
// the data has not been fetched by int8 model
return false;
}
fp32_op_input_map_[op_name] = *data;
return true;
} else if (type == FETCH) {
if (fp32_op_input_map_.find(op_name) == fp32_op_input_map_.end()) {
// the data not generated by fp32 model yet
return false;
}
*data = fp32_op_input_map_[op_name];
fp32_op_input_map_.erase(op_name);
return true;
} else {
MS_LOG(ERROR) << "unexpected type: " << type;
}
return false;
}
bool FullQuantQuantizer::OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector<float> *data) {
MS_ASSERT(data != nullptr);
std::lock_guard<std::mutex> lg(mutex_op_output_);
if (type == STORE) {
if (fp32_op_output_ch_mean_map_.find(op_name) != fp32_op_output_ch_mean_map_.end()) {
// the data has not been fetched by int8 model
return false;
}
fp32_op_output_ch_mean_map_[op_name] = *data;
return true;
} else if (type == FETCH) {
if (fp32_op_output_ch_mean_map_.find(op_name) == fp32_op_output_ch_mean_map_.end()) {
// the data not generated by fp32 model yet
return false;
}
*data = fp32_op_output_ch_mean_map_[op_name];
fp32_op_output_ch_mean_map_.erase(op_name);
return true;
} else {
MS_LOG(ERROR) << "unexpected type: " << type;
}
return false;
}
KernelCallBack FullQuantQuantizer::GetBeforeCallBack(bool int8_op) {
KernelCallBack before_call_back;
if (!int8_op) {
before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
if (FullQuantQuantizer::CheckFp32TensorVec(callParam.node_name, before_inputs) != RET_OK) {
return true;
}
auto tensor = before_inputs[0];
MS_ASSERT(tensor != nullptr);
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
std::vector<float> fp32_op_input(elem_count);
auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size());
if (ret != EOK) {
MS_LOG(ERROR) << "memcpy error: " << ret;
return false;
}
while (!OpInputDataHandle(STORE, callParam.node_name, &fp32_op_input)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
}
return true;
};
} else {
before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
vector<float> fp32_op_input;
while (!OpInputDataHandle(FETCH, callParam.node_name, &fp32_op_input)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
auto tensor = before_inputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeInt8) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
// do quantization: activation is always per layer quantized
std::vector<int8_t> quant_datas;
auto quant_params = tensor->quant_params();
if (quant_params.size() != 1) {
MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size();
return false;
}
schema::QuantParamT quant_param_t;
quant_param_t.scale = quant_params[0].scale;
quant_param_t.zeroPoint = quant_params[0].zeroPoint;
for (auto float_data : fp32_op_input) {
auto quant_data = QuantizeData<int8_t>(float_data, &quant_param_t, activation_q_max_, activation_q_min_);
quant_datas.push_back(quant_data);
}
if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) {
MS_LOG(ERROR) << "unexpected tensor size: " << quant_datas.size()
<< " not the same with: " << quant_datas.size() * sizeof(int8_t);
return false;
}
auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t));
if (ret != EOK) {
MS_LOG(ERROR) << "memcpy error: " << ret;
return false;
}
}
return true;
};
}
return before_call_back;
}
KernelCallBack FullQuantQuantizer::GetAfterCallBack(bool int8_op) {
KernelCallBack after_call_back;
if (!int8_op) {
return GetFloatAfterCallBack();
}
return GetInt8AfterCallBack();
}
KernelCallBack FullQuantQuantizer::GetInt8AfterCallBack() {
KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
vector<float> fp32_op_output_ch_mean;
while (!OpOutputChMeanDataHandle(FETCH, callParam.node_name, &fp32_op_output_ch_mean)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
auto tensor = afterOutputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeInt8) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
const int8_t *tensor_data = static_cast<int8_t *>(tensor->data());
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
auto shapes = tensor->shape();
if (shapes.size() != DIMENSION_4D) {
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
return false;
}
// suppose the the format is NHWC
auto channels = shapes[FOURTH_INPUT];
if (channels == 0) {
MS_LOG(ERROR) << "unexpected channels: 0";
return false;
}
auto quant_params = tensor->quant_params();
if (quant_params.size() != 1) {
MS_LOG(ERROR) << "unexpected activatation quant_params size: " << quant_params.size();
return false;
}
auto scale = quant_params[0].scale;
auto zp = quant_params[0].zeroPoint;
std::vector<float> dequant_op_output_ch_mean(channels);
auto one_filter_size = elem_count / channels;
for (int i = 0; i < channels; i++) {
float sum = 0;
for (size_t j = 0; j < one_filter_size; j++) {
auto index = j * channels + i;
if (index >= elem_count) {
MS_LOG(ERROR) << "over flow!";
return false;
}
// deuqant activation
auto float_data = scale * (tensor_data[index] - zp);
sum += float_data;
}
if (one_filter_size == 0) {
MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
return false;
}
sum = sum / one_filter_size;
dequant_op_output_ch_mean[i] = sum;
}
std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(),
dequant_op_output_ch_mean.begin(), std::minus<>());
if (op_bias_diff_map_.find(callParam.node_name) != op_bias_diff_map_.end()) {
auto &bias_diff = op_bias_diff_map_[callParam.node_name];
std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(),
std::plus<>());
} else {
op_bias_diff_map_[callParam.node_name] = dequant_op_output_ch_mean;
}
}
return true;
};
return after_call_back;
}
KernelCallBack FullQuantQuantizer::GetFloatAfterCallBack() {
KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
if (FullQuantQuantizer::CheckFp32TensorVec(callParam.node_name, afterOutputs) != RET_OK) {
return true;
}
auto tensor = afterOutputs[0];
MS_ASSERT(tensor != nullptr);
const auto *tensor_data = static_cast<const float *>(tensor->data());
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
auto shapes = tensor->shape();
if (shapes.size() != DIMENSION_4D) {
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
return false;
}
// suppose the activation format: NHWC
auto channels = shapes[FOURTH_INPUT];
if (channels == 0) {
MS_LOG(ERROR) << "unexpected channels: 0";
return false;
}
std::vector<float> fp32_op_output_ch_mean(channels);
auto one_filter_size = elem_count / channels;
for (int i = 0; i < channels; i++) {
float sum = 0;
for (size_t j = 0; j < one_filter_size; j++) {
auto index = j * channels + i;
if (index >= elem_count) {
MS_LOG(ERROR) << "over flow!";
return false;
}
sum += tensor_data[index];
}
if (one_filter_size == 0) {
MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
return false;
}
sum = sum / one_filter_size;
fp32_op_output_ch_mean[i] = sum;
}
while (!OpOutputChMeanDataHandle(STORE, callParam.node_name, &fp32_op_output_ch_mean)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
}
return true;
};
return after_call_back;
}
} // namespace mindspore::lite::quant

View File

@ -39,11 +39,6 @@
#include "src/common/quant_utils.h"
namespace mindspore::lite::quant {
enum OperationType {
STORE,
FETCH,
};
class FullQuantQuantizer : public Quantizer {
public:
explicit FullQuantQuantizer(const converter::Flags &flags) : Quantizer(flags) {
@ -55,45 +50,20 @@ class FullQuantQuantizer : public Quantizer {
int DoQuantize(FuncGraphPtr func_graph) override;
private:
bool OpInputDataHandle(OperationType type, const string &op_name, std::vector<float> *data);
bool OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector<float> *data);
int PreProcess(const FuncGraphPtr &func_graph);
int CheckFp32TensorVec(const std::string &node_name, const std::vector<mindspore::tensor::MSTensor *> &tensor_vec);
int DoInference(CollectType collect_type);
int UpdateDivergeInterval();
int QuantNodeSimpleOp(const CNodePtr &cnode);
int QuantNode(const FuncGraphPtr &func_graph);
int SetInOutQuantParam(const AnfNodePtr &input_node, const std::unique_ptr<DataDistribution> &info,
const PrimitivePtr &primitive, bool is_input, size_t index) const;
int DoParameterWeightQuant(const ParameterPtr &weight, const PrimitivePtr &primitive, bool per_channel,
int input_index) const;
int DoValueNodeWeightQuant(const ValueNodePtr &weight, const PrimitivePtr &primitive, bool per_channel,
int input_index) const;
int DoParameterNodeQuant(const CNodePtr &cnode, const ParameterPtr &input_node, size_t input_index);
int DoValueNodeQuant(const CNodePtr &cnode, const ValueNodePtr &input_node, size_t input_index);
int IsSupportWeightQuant(const CNodePtr &cnode, const AnfNodePtr &input_node, size_t input_index);
int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive);
int Int8Inference();
int BiasCorrection(const FuncGraphPtr &func_graph);
int BiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode);
KernelCallBack GetBeforeCallBack(bool int8_op);
KernelCallBack GetAfterCallBack(bool int8_op);
KernelCallBack GetInt8AfterCallBack();
KernelCallBack GetFloatAfterCallBack();
void InitQMinMax();
void InitCpuConfig();
void InitKirinConfig();
@ -117,17 +87,9 @@ class FullQuantQuantizer : public Quantizer {
std::set<PrimitivePtr> per_channel_ops_;
std::set<mindspore::ActivationType> support_activation_;
std::unique_ptr<Calibrator> calibrator_{nullptr};
std::shared_ptr<Calibrator> calibrator_{nullptr};
session::LiteSession *fp32_session_{nullptr};
Model *fp32_model_{nullptr};
session::LiteSession *int8_session_{nullptr};
Model *int8_model_{nullptr};
std::map<std::string, std::vector<float>> fp32_op_input_map_; // concurrency
std::map<std::string, std::vector<float>> fp32_op_output_ch_mean_map_; // concurrency
std::map<std::string, std::vector<float>> op_bias_diff_map_; // only use by int8 model
std::mutex mutex_op_input_;
std::mutex mutex_op_output_;
// key is tensor_name
std::map<std::string, std::vector<schema::QuantParamT>> weight_quant_params_bak;

View File

@ -46,7 +46,79 @@ constexpr int kSingleDirBiasTensorSize = 4;
constexpr int kLstmBiasShapeSize = 2;
constexpr int kLstmBiasIndex = 3;
constexpr size_t kBitNumPerByte = 8;
int ComputeBiasDataAndQuantParam(const std::vector<double> &bias_scales, const std::vector<double> &input_scales,
const float *raw_datas, const QuantParamHolderPtr &quant_param_holder,
std::vector<schema::QuantParamT> *quant_params, std::vector<int32_t> *quant_datas) {
MS_ASSERT(raw_datas != nullptr && quant_param_holder != nullptr);
MS_ASSERT(quant_params != nullptr && quant_datas != nullptr);
double bias_scale_tmp;
const constexpr double quanted_bias_abs_limit = 0.5 * INT32_MAX;
MS_CHECK_TRUE_MSG(quant_param_holder->get_input_quant_params().size() > 1, RET_ERROR, "invalid access.");
auto weight_quant_params = quant_param_holder->get_input_quant_params().at(1);
auto shape_size = quant_datas->size();
if (bias_scales.size() == shape_size) {
for (size_t i = 0; i < shape_size; i++) {
bias_scale_tmp = bias_scales[i];
if (fabs(bias_scale_tmp) <= 0.0f) {
MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0.";
return RET_ERROR;
}
if (std::abs(raw_datas[i] / bias_scale_tmp) >= quanted_bias_abs_limit) {
MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[i].scale
<< " is too small, need to update";
// update filter scale and zp
double activate_scale = input_scales[0];
double filter_scale = std::abs(raw_datas[i]) / (activate_scale * quanted_bias_abs_limit);
weight_quant_params[i].scale = filter_scale;
weight_quant_params[i].zeroPoint = 0;
quant_param_holder->set_input_quant_param(1, weight_quant_params);
bias_scale_tmp = std::abs(raw_datas[i]) / quanted_bias_abs_limit;
quant_params->at(i).scale = bias_scale_tmp;
MS_LOG(DEBUG) << "new filter scale: " << filter_scale;
}
auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp);
quant_datas->at(i) = quant_data;
}
return RET_OK;
} else if (bias_scales.size() == 1) {
// for fc, per tensor quant
bias_scale_tmp = quant_params->front().scale;
float max_raw_data = 0.0f;
for (size_t i = 0; i < shape_size; i++) {
if (std::abs(raw_datas[i]) > max_raw_data) {
max_raw_data = std::abs(raw_datas[i]);
}
}
if (fabs(bias_scale_tmp) <= 0.0f) {
MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0.";
return RET_ERROR;
}
if (std::abs(max_raw_data / bias_scale_tmp) >= quanted_bias_abs_limit) {
MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[0].scale
<< " is too small, need to update";
double activate_scale = input_scales[0];
MS_CHECK_TRUE_MSG(activate_scale != 0, RET_ERROR, "activate_scale == 0");
double filter_scale = std::abs(max_raw_data) / (activate_scale * quanted_bias_abs_limit);
weight_quant_params[0].scale = filter_scale;
weight_quant_params[0].zeroPoint = 0;
quant_param_holder->set_input_quant_param(1, weight_quant_params);
bias_scale_tmp = max_raw_data / quanted_bias_abs_limit;
quant_params->front().scale = bias_scale_tmp;
MS_LOG(DEBUG) << "new filter scale: " << filter_scale;
}
for (size_t i = 0; i < shape_size; i++) {
auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp);
quant_datas->at(i) = quant_data;
}
return RET_OK;
}
MS_LOG(ERROR) << "unexpected input_scales size: " << input_scales.size()
<< " weight_scales size: " << weight_quant_params.size();
return RET_ERROR;
}
} // namespace
QuantParamHolderPtr GetCNodeQuantHolder(const PrimitivePtr &primitive) {
MS_CHECK_TRUE_RET(primitive != nullptr, nullptr);
QuantParamHolderPtr quant_params_holder = nullptr;
@ -459,4 +531,92 @@ std::string BoolVectorToString(const std::vector<bool> &bool_vec) {
}
return str;
}
int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive) {
CHECK_NULL_RETURN(bias);
CHECK_NULL_RETURN(primitive);
auto bias_default_param = bias->default_param();
auto bias_param = bias_default_param->cast<tensor::TensorPtr>();
MS_ASSERT(bias_param != nullptr);
auto quant_param_holder = GetCNodeQuantHolder(primitive);
MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr.");
auto active_weight_quant_params = quant_param_holder->get_input_quant_params();
auto active_params = active_weight_quant_params.at(FIRST_INPUT);
auto weight_params = active_weight_quant_params.at(SECOND_INPUT);
vector<double> input_scales;
vector<double> filter_scales;
vector<double> bias_scales;
size_t sizeX = active_params.size();
for (size_t i = 0; i < sizeX; i++) {
input_scales.emplace_back(active_params[i].scale);
}
size_t sizeY = weight_params.size();
if (sizeX != sizeY) {
if (sizeX > 1 && sizeY > 1) {
MS_LOG(ERROR) << "input and filter's scale count cannot match!";
return RET_ERROR;
}
}
for (size_t i = 0; i < sizeY; i++) {
filter_scales.emplace_back(weight_params[i].scale);
}
size_t size = std::max(sizeX, sizeY);
for (size_t i = 0; i < size; i++) {
auto scaleX = sizeX > 1 ? input_scales[i] : input_scales[0];
auto scaleY = sizeY > 1 ? filter_scales[i] : filter_scales[0];
bias_scales.push_back(scaleX * scaleY);
}
MS_ASSERT(!bias_scales.empty());
size_t shape_size = bias_param->DataSize();
// set bias quant param
std::vector<schema::QuantParamT> quant_params;
for (double bias_scale : bias_scales) {
schema::QuantParamT quant_param;
if (bias_scale == 0) {
MS_LOG(WARNING) << "bias_scale == 0";
quant_param.scale = 1;
} else {
quant_param.scale = bias_scale;
}
quant_param.numBits = k32Bit;
quant_param.zeroPoint = 0;
quant_param.inited = true;
quant_params.emplace_back(quant_param);
}
// quant bias data
std::vector<int32_t> quant_datas(shape_size);
auto *raw_datas = static_cast<float *>(bias_param->data_c());
if (ComputeBiasDataAndQuantParam(bias_scales, input_scales, raw_datas, quant_param_holder, &quant_params,
&quant_datas) != RET_OK) {
MS_LOG(ERROR) << "compute bias data failed.";
return RET_ERROR;
}
quant_param_holder->set_input_quant_param(THIRD_INPUT, quant_params);
auto ret = SetTensorData(bias_param, quant_datas.data(), shape_size * sizeof(int32_t));
if (ret != RET_OK) {
MS_LOG(ERROR) << "set tensor data failed.";
return RET_ERROR;
}
// set dtype
auto abstractBase = bias->abstract();
if (abstractBase == nullptr) {
MS_LOG(ERROR) << "Abstract of parameter is nullptr, " << bias->name();
return RET_ERROR;
}
if (!utils::isa<abstract::AbstractTensorPtr>(abstractBase)) {
MS_LOG(ERROR) << "Abstract of parameter should be anstract tensor, " << bias->name();
return RET_ERROR;
}
auto abstractTensor = utils::cast<abstract::AbstractTensorPtr>(abstractBase);
if (abstractTensor == nullptr || abstractTensor->element() == nullptr) {
MS_LOG(ERROR) << "abstractTensor is nullptr" << bias->name();
return RET_NULL_PTR;
}
abstractTensor->element()->set_type(TypeIdToType(kNumberTypeInt32));
return RET_OK;
}
} // namespace mindspore::lite::quant
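The bias scale used by DoParameterBiasQuant is the product of the activation scale and the (per-channel or per-tensor) weight scale; only when the resulting int32 value would reach 0.5 · INT32_MAX does ComputeBiasDataAndQuantParam enlarge the weight scale so the quantized bias still fits. A tiny sketch of the normal (no-overflow) path, with illustrative numbers:

#include <cmath>
#include <cstdint>

// bias_scale = input_scale * weight_scale; quantized bias = round(raw / bias_scale).
int32_t QuantizeBias(float raw_bias, double input_scale, double weight_scale) {
  double bias_scale = input_scale * weight_scale;
  return static_cast<int32_t>(std::round(raw_bias / bias_scale));
}
// e.g. QuantizeBias(0.37f, 0.02, 0.005) == 3700, far below the 0.5 * INT32_MAX limit.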

View File

@ -96,6 +96,8 @@ int GetPreferredDim(const PrimitivePtr &primitive, int input_index, const std::v
std::vector<int> ConvertShapeVectorToInt32(const ShapeVector &dims);
int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive);
template <typename T>
int FixedBitQuantFilter(const AnfNodePtr &parameter, const tensor::TensorPtr &weight, const PrimitivePtr &primitive,
QuantType quant_type, int quant_max, int quant_min, size_t bit_num,