From 9ad4064750c5b3ff09bcb744aaa6f50875d2ec9e Mon Sep 17 00:00:00 2001 From: yeyunpeng2020 Date: Mon, 27 Dec 2021 16:07:23 +0800 Subject: [PATCH] Abstract bias correction --- .../lite/tools/converter/quant_param_holder.h | 6 - .../quantizer/bias_correction_strategy.cc | 488 +++++++++++++ .../quantizer/bias_correction_strategy.h | 83 +++ .../tools/converter/quantizer/bitpacking.h | 6 +- .../quantizer/full_quant_quantizer.cc | 643 +----------------- .../quantizer/full_quant_quantizer.h | 40 +- .../converter/quantizer/quantize_util.cc | 160 +++++ .../tools/converter/quantizer/quantize_util.h | 2 + 8 files changed, 743 insertions(+), 685 deletions(-) create mode 100644 mindspore/lite/tools/converter/quantizer/bias_correction_strategy.cc create mode 100644 mindspore/lite/tools/converter/quantizer/bias_correction_strategy.h diff --git a/mindspore/lite/tools/converter/quant_param_holder.h b/mindspore/lite/tools/converter/quant_param_holder.h index 9b7186de0e1..307a76588f9 100644 --- a/mindspore/lite/tools/converter/quant_param_holder.h +++ b/mindspore/lite/tools/converter/quant_param_holder.h @@ -120,12 +120,6 @@ class QuantParamHolder : public Value { std::vector> get_output_quant_params() const { return this->output_quant_params_; } - // deprecated - void ClearInputOutputQuantParam() { - input_quant_params_.clear(); - output_quant_params_.clear(); - } - bool IsInputQuantParamsInited() { if (this->input_quant_params_.empty()) { return false; diff --git a/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.cc b/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.cc new file mode 100644 index 00000000000..9a66c400787 --- /dev/null +++ b/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.cc @@ -0,0 +1,488 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tools/converter/quantizer/bias_correction_strategy.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" +#include "mindapi/base/type_id.h" +#include "tools/common/tensor_util.h" + +namespace mindspore::lite::quant { +namespace { +constexpr int kHasBiasTensorSize = 3; +const char *kTypeConv2D = schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion); +} // namespace +int BiasCorrectionStrategy::CheckFp32TensorVec(const std::string &node_name, + const std::vector &tensor_vec) { + if (tensor_vec.empty()) { + MS_LOG(ERROR) << "node: " << node_name << " input tensors is 0"; + return RET_ERROR; + } + auto *tensor = tensor_vec[0]; + CHECK_NULL_RETURN(tensor); + if (tensor->data_type() != kNumberTypeFloat32) { + MS_LOG(INFO) << "node: " << node_name << " will not quantize" + << " tensor data_type: " << tensor->data_type(); + return RET_ERROR; + } + return RET_OK; +} + +bool BiasCorrectionStrategy::OpInputDataHandle(OperationType type, const string &op_name, std::vector *data) { + MS_ASSERT(data != nullptr); + std::lock_guard lg(mutex_op_input_); + if (type == STORE) { + if (fp32_op_input_map_.find(op_name) != fp32_op_input_map_.end()) { + // the data has not been fetched by int8 model + return false; + } + fp32_op_input_map_[op_name] = *data; + return true; + } else if (type == FETCH) { + if (fp32_op_input_map_.find(op_name) == fp32_op_input_map_.end()) { + // the data not generated by fp32 model yet + return false; + } + *data = fp32_op_input_map_[op_name]; + fp32_op_input_map_.erase(op_name); + return true; + } else { + MS_LOG(ERROR) << "unexpected type: " << type; + } + return false; +} + +bool BiasCorrectionStrategy::OpOutputChMeanDataHandle(OperationType type, const string &op_name, + std::vector *data) { + MS_ASSERT(data != nullptr); + std::lock_guard lg(mutex_op_output_); + if (type == STORE) { + if (fp32_op_output_ch_mean_map_.find(op_name) != fp32_op_output_ch_mean_map_.end()) { + // the data has not been fetched by int8 model + return false; + } + fp32_op_output_ch_mean_map_[op_name] = *data; + return true; + } else if (type == FETCH) { + if (fp32_op_output_ch_mean_map_.find(op_name) == fp32_op_output_ch_mean_map_.end()) { + // the data not generated by fp32 model yet + return false; + } + *data = fp32_op_output_ch_mean_map_[op_name]; + fp32_op_output_ch_mean_map_.erase(op_name); + return true; + } else { + MS_LOG(ERROR) << "unexpected type: " << type; + } + return false; +} + +KernelCallBack BiasCorrectionStrategy::GetBeforeCallBack(bool int8_op) { + KernelCallBack before_call_back; + if (!int8_op) { + before_call_back = [this](const std::vector &before_inputs, + const std::vector &before_outputs, + const CallBackParam &callParam) -> bool { + if (callParam.node_type == kTypeConv2D) { + if (CheckFp32TensorVec(callParam.node_name, before_inputs) != RET_OK) { + return true; + } + auto tensor = before_inputs[0]; + MS_ASSERT(tensor != nullptr); + size_t elem_count = tensor->ElementsNum(); + MS_CHECK_GT(elem_count, 0, false); + std::vector fp32_op_input(elem_count); + auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size()); + if (ret != EOK) { + MS_LOG(ERROR) << "memcpy error: " << ret; + return false; + } + while (!OpInputDataHandle(STORE, callParam.node_name, &fp32_op_input)) { + std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); + } + } + return true; + }; + } else { 
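+    // int8 branch: fetch the fp32 activation captured above, re-quantize it
+    // with this op's activation quant params, and overwrite the int8 input
+    // tensor so both sessions consume identical calibration data.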
+    before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
+                              const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
+                              const CallBackParam &callParam) -> bool {
+      if (callParam.node_type == kTypeConv2D) {
+        std::vector<float> fp32_op_input;
+        while (!OpInputDataHandle(FETCH, callParam.node_name, &fp32_op_input)) {
+          std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
+        }
+        auto tensor = before_inputs[0];
+        MS_ASSERT(tensor != nullptr);
+        // op can be skipped.
+        if (tensor->data_type() != kNumberTypeInt8) {
+          MS_LOG(INFO) << "tensor type is " << tensor->data_type();
+          return true;
+        }
+        // do quantization: activation is always per-layer quantized
+        std::vector<int8_t> quant_datas;
+        auto quant_params = tensor->quant_params();
+        if (quant_params.size() != 1) {
+          MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size();
+          return false;
+        }
+        schema::QuantParamT quant_param_t;
+        quant_param_t.scale = quant_params[0].scale;
+        quant_param_t.zeroPoint = quant_params[0].zeroPoint;
+        for (auto float_data : fp32_op_input) {
+          auto quant_data = QuantizeData<int8_t>(float_data, &quant_param_t, activation_q_max_, activation_q_min_);
+          quant_datas.push_back(quant_data);
+        }
+
+        if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) {
+          MS_LOG(ERROR) << "unexpected tensor size: " << tensor->Size()
+                        << " not the same as: " << quant_datas.size() * sizeof(int8_t);
+          return false;
+        }
+
+        auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t));
+        if (ret != EOK) {
+          MS_LOG(ERROR) << "memcpy error: " << ret;
+          return false;
+        }
+      }
+      return true;
+    };
+  }
+  return before_call_back;
+}
+
+KernelCallBack BiasCorrectionStrategy::GetAfterCallBack(bool int8_op) {
+  KernelCallBack after_call_back;
+  if (!int8_op) {
+    return GetFloatAfterCallBack();
+  }
+  return GetInt8AfterCallBack();
+}
+
+KernelCallBack BiasCorrectionStrategy::GetInt8AfterCallBack() {
+  KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
+                                          const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
+                                          const CallBackParam &callParam) -> bool {
+    if (callParam.node_type == kTypeConv2D) {
+      std::vector<float> fp32_op_output_ch_mean;
+      while (!OpOutputChMeanDataHandle(FETCH, callParam.node_name, &fp32_op_output_ch_mean)) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
+      }
+      auto tensor = afterOutputs[0];
+      MS_ASSERT(tensor != nullptr);
+      // op can be skipped.
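+      // Everything below dequantizes the int8 output back to float, averages it
+      // per channel, and accumulates (fp32 mean - dequantized int8 mean) into
+      // op_bias_diff_map_ for DoCNodeBiasCorrection to fold into the bias.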
+      if (tensor->data_type() != kNumberTypeInt8) {
+        MS_LOG(INFO) << "tensor type is " << tensor->data_type();
+        return true;
+      }
+      const int8_t *tensor_data = static_cast<const int8_t *>(tensor->data());
+      size_t elem_count = tensor->ElementsNum();
+      MS_CHECK_GT(elem_count, 0, false);
+      auto shapes = tensor->shape();
+      if (shapes.size() != DIMENSION_4D) {
+        MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
+        return false;
+      }
+      // suppose the format is NHWC
+      auto channels = shapes[FOURTH_INPUT];
+      if (channels == 0) {
+        MS_LOG(ERROR) << "unexpected channels: 0";
+        return false;
+      }
+      auto quant_params = tensor->quant_params();
+      if (quant_params.size() != 1) {
+        MS_LOG(ERROR) << "unexpected activation quant_params size: " << quant_params.size();
+        return false;
+      }
+      auto scale = quant_params[0].scale;
+      auto zp = quant_params[0].zeroPoint;
+      std::vector<float> dequant_op_output_ch_mean(channels);
+      auto one_filter_size = elem_count / channels;
+      for (int i = 0; i < channels; i++) {
+        float sum = 0;
+        for (size_t j = 0; j < one_filter_size; j++) {
+          auto index = j * channels + i;
+          if (index >= elem_count) {
+            MS_LOG(ERROR) << "overflow!";
+            return false;
+          }
+          // dequantize the activation
+          auto float_data = scale * (tensor_data[index] - zp);
+          sum += float_data;
+        }
+        if (one_filter_size == 0) {
+          MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
+          return false;
+        }
+        sum = sum / one_filter_size;
+        dequant_op_output_ch_mean[i] = sum;
+      }
+      std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(),
+                     dequant_op_output_ch_mean.begin(), std::minus<>());
+
+      if (op_bias_diff_map_.find(callParam.node_name) != op_bias_diff_map_.end()) {
+        auto &bias_diff = op_bias_diff_map_[callParam.node_name];
+        std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(),
+                       std::plus<>());
+      } else {
+        op_bias_diff_map_[callParam.node_name] = dequant_op_output_ch_mean;
+      }
+    }
+    return true;
+  };
+  return after_call_back;
+}
+
+KernelCallBack BiasCorrectionStrategy::GetFloatAfterCallBack() {
+  KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
+                                          const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
+                                          const CallBackParam &callParam) -> bool {
+    if (callParam.node_type == kTypeConv2D) {
+      if (CheckFp32TensorVec(callParam.node_name, afterOutputs) != RET_OK) {
+        return true;
+      }
+      auto tensor = afterOutputs[0];
+      MS_ASSERT(tensor != nullptr);
+      const auto *tensor_data = static_cast<const float *>(tensor->data());
+      size_t elem_count = tensor->ElementsNum();
+      MS_CHECK_GT(elem_count, 0, false);
+      auto shapes = tensor->shape();
+      if (shapes.size() != DIMENSION_4D) {
+        MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
+        return false;
+      }
+      // suppose the activation format is NHWC
+      auto channels = shapes[FOURTH_INPUT];
+      if (channels == 0) {
+        MS_LOG(ERROR) << "unexpected channels: 0";
+        return false;
+      }
+      std::vector<float> fp32_op_output_ch_mean(channels);
+      auto one_filter_size = elem_count / channels;
+      for (int i = 0; i < channels; i++) {
+        float sum = 0;
+        for (size_t j = 0; j < one_filter_size; j++) {
+          auto index = j * channels + i;
+          if (index >= elem_count) {
+            MS_LOG(ERROR) << "overflow!";
+            return false;
+          }
+          sum += tensor_data[index];
+        }
+        if (one_filter_size == 0) {
+          MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
+          return false;
+        }
+        sum = sum / one_filter_size;
+        fp32_op_output_ch_mean[i] = sum;
+      }
+      while (!OpOutputChMeanDataHandle(STORE, callParam.node_name, &fp32_op_output_ch_mean)) {
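+        // Spin until the int8 thread has fetched the previous entry for this op;
+        // OpOutputChMeanDataHandle(STORE, ...) returns false while it remains.
+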
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); + } + } + return true; + }; + return after_call_back; +} + +int BiasCorrectionStrategy::Int8Inference() { + // int8 inference + std::vector inputs = int8_session_->GetInputs(); + for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) { + for (size_t input_index = 0; input_index < inputs.size(); input_index++) { + int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]); + if (status != RET_OK) { + MS_LOG(ERROR) << "generate input data failed!"; + return RET_ERROR; + } + } + // before func + KernelCallBack before_call_back = GetBeforeCallBack(true); + // after func + KernelCallBack after_call_back = GetAfterCallBack(true); + int8_session_->BindThread(true); + auto status = int8_session_->RunGraph(before_call_back, after_call_back); + int8_session_->BindThread(false); + if (status != RET_OK) { + MS_LOG(ERROR) << "run model failed!"; + return RET_ERROR; + } + } // end for images + return RET_OK; +} + +int BiasCorrectionStrategy::DoBiasCorrection(const FuncGraphPtr &func_graph) { + // init in8 session + MS_LOG(INFO) << "create quant session"; + flags_.commonQuantParam.quant_type = schema::QuantType_QUANT_ALL; + auto int8_sm = CreateSessionByFuncGraph(func_graph, flags_, this->flags_.commonQuantParam.thread_num); + int8_session_ = int8_sm.session; + int8_model_ = int8_sm.model; + if (int8_session_ == nullptr || int8_model_ == nullptr) { + MS_LOG(ERROR) << "create session failed!"; + return RET_ERROR; + } + + std::future int8_inference = std::async(std::launch::async, &BiasCorrectionStrategy::Int8Inference, this); + // get input tensor + std::vector inputs = fp32_session_->GetInputs(); + // fp32 inference + for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) { + for (size_t input_index = 0; input_index < inputs.size(); input_index++) { + int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]); + if (status != RET_OK) { + MS_LOG(ERROR) << "generate input data from images failed!"; + return RET_ERROR; + } + } + // before func + KernelCallBack before_call_back = GetBeforeCallBack(false); + // after func + KernelCallBack after_call_back = GetAfterCallBack(false); + fp32_session_->BindThread(true); + auto status = fp32_session_->RunGraph(before_call_back, after_call_back); + fp32_session_->BindThread(false); + if (status != RET_OK) { + MS_LOG(ERROR) << "run model failed!"; + return RET_ERROR; + } + } // end for images + + int status = int8_inference.get(); + if (status != RET_OK) { + MS_LOG(ERROR) << "int8 inference failed!"; + return RET_ERROR; + } + if (calibrator_->GetBatchNum() == 0) { + MS_LOG(ERROR) << "divisor 'calibrate_size' cannot be 0."; + return RET_ERROR; + } + for (auto &key_value : op_bias_diff_map_) { + std::for_each(key_value.second.begin(), key_value.second.end(), + [this](float &data) { data = data / calibrator_->GetBatchNum(); }); + } + auto cnodes = func_graph->GetOrderedCnodes(); + for (auto &cnode : cnodes) { + auto op_name = cnode->fullname_with_scope(); + if (op_bias_diff_map_.find(op_name) == op_bias_diff_map_.end()) { + continue; + } + status = DoCNodeBiasCorrection(func_graph, cnode); + if (status != RET_OK) { + MS_LOG(ERROR) << "do node bias correct failed."; + break; + } + } + return status; +} + +int BiasCorrectionStrategy::DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { + auto op_name = cnode->fullname_with_scope(); + const auto &bias_diff = 
op_bias_diff_map_[op_name];
+  auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
+  if (primitive == nullptr) {
+    MS_LOG(ERROR) << op_name << " primitive is nullptr";
+    return RET_NULL_PTR;
+  }
+  auto quant_param_holder = GetCNodeQuantHolder(primitive);
+  MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr.");
+  auto input_quant_params = quant_param_holder->get_input_quant_params();
+  if (input_quant_params.size() == kHasBiasTensorSize) {
+    // compensate the existing bias
+    auto bias_quant_params = input_quant_params.at(THIRD_INPUT);
+    auto bias = cnode->input(THIRD_INPUT + 1);
+    auto bias_parameter_ptr = bias->cast<ParameterPtr>();
+    auto bias_default_param = bias_parameter_ptr->default_param();
+    auto bias_param = bias_default_param->cast<tensor::TensorPtr>();
+    int *bias_datas = static_cast<int *>(bias_param->data_c());
+
+    if (static_cast<size_t>(bias_param->DataSize()) != bias_diff.size()) {
+      MS_LOG(DEBUG) << op_name << " unexpected bias data count: " << bias_param->DataSize()
+                    << " not the same as bias_diff: " << bias_diff.size();
+      return RET_ERROR;
+    }
+    if (bias_quant_params.size() != bias_diff.size()) {
+      MS_LOG(ERROR) << op_name << " unexpected bias quant params size: " << bias_quant_params.size()
+                    << " not the same as bias_diff: " << bias_diff.size();
+      return RET_ERROR;
+    }
+    for (size_t i = 0; i < bias_param->DataSize(); i++) {
+      auto scale = bias_quant_params[i].scale;
+      if (fabs(scale) <= 0.0f) {
+        MS_LOG(ERROR) << op_name << " divisor 'scale' cannot be 0.";
+        return RET_ERROR;
+      }
+      double after_correct = std::round(bias_diff[i] / scale) + bias_datas[i];
+      constexpr int32_t corrected_bias_abs_limit = 0.6 * INT32_MAX;
+      if (after_correct > corrected_bias_abs_limit) {
+        MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too large: " << after_correct
+                        << " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i]
+                        << " scale: " << scale;
+        bias_datas[i] = static_cast<int>(corrected_bias_abs_limit);
+      } else if (after_correct < -corrected_bias_abs_limit) {
+        MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too small: " << after_correct
+                        << " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i]
+                        << " scale: " << scale;
+        bias_datas[i] = static_cast<int>(-corrected_bias_abs_limit);
+      } else {
+        auto diff = static_cast<int>(std::round(bias_diff[i] / scale));
+        bias_datas[i] += diff;
+      }
+    }
+  } else if (input_quant_params.size() == kHasBiasTensorSize - 1) {
+    MS_LOG(INFO) << op_name << " add bias input";
+    // need to add a bias input
+    auto parameter = func_graph->add_parameter();
+    if (parameter == nullptr) {
+      MS_LOG(ERROR) << "parameter is nullptr.";
+      return RET_NULL_PTR;
+    }
+    std::vector<int64_t> shape;
+    shape.push_back(bias_diff.size());
+
+    auto tensor_info = CreateTensorInfo(bias_diff.data(), sizeof(float) * bias_diff.size(), shape, kNumberTypeFloat32);
+    if (tensor_info == nullptr) {
+      MS_LOG(ERROR) << op_name << " create tensor info failed.";
+      return RET_ERROR;
+    }
+    auto status = InitParameterFromTensorInfo(parameter, tensor_info);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << op_name << " init parameter from tensor info failed";
+      return RET_ERROR;
+    }
+    parameter->set_name("added_" + op_name + "_bias");
+    cnode->add_input(parameter);
+    status = DoParameterBiasQuant(parameter, primitive);
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << op_name << " Do bias quant failed.";
+      return RET_ERROR;
+    }
+  } else {
+    MS_LOG(WARNING) << op_name << " unexpected size: " << input_quant_params.size()
+                    << ", and shared weight tensor does not support bias 
correction temporarily."; + } + return RET_OK; +} +} // namespace mindspore::lite::quant diff --git a/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.h b/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.h new file mode 100644 index 00000000000..53e08ce8d80 --- /dev/null +++ b/mindspore/lite/tools/converter/quantizer/bias_correction_strategy.h @@ -0,0 +1,83 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BIASCORRECTION_H +#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BIASCORRECTION_H +#include +#include +#include +#include +#include "base/base.h" +#include "ir/anf.h" +#include "tools/converter/quantizer/calibrator.h" + +namespace mindspore::lite::quant { +enum OperationType { + STORE, + FETCH, +}; + +class BiasCorrectionStrategy { + public: + BiasCorrectionStrategy(const converter::Flags &flags, const std::shared_ptr &calibrator, + session::LiteSession *fp32_session, Model *fp32_model, int activation_q_min, + int activation_q_max) + : flags_(flags), + calibrator_(calibrator), + fp32_session_(fp32_session), + fp32_model_(fp32_model), + activation_q_min_(activation_q_min), + activation_q_max_(activation_q_max) {} + ~BiasCorrectionStrategy() { + if (int8_session_ != nullptr) { + delete int8_session_; + } + if (int8_model_ != nullptr) { + delete int8_model_; + } + } + int DoBiasCorrection(const FuncGraphPtr &func_graph); + + private: + int DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode); + int Int8Inference(); + bool OpInputDataHandle(OperationType type, const string &op_name, std::vector *data); + bool OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector *data); + KernelCallBack GetBeforeCallBack(bool int8_op); + KernelCallBack GetAfterCallBack(bool int8_op); + KernelCallBack GetInt8AfterCallBack(); + KernelCallBack GetFloatAfterCallBack(); + int CheckFp32TensorVec(const std::string &node_name, const std::vector &tensor_vec); + + private: + converter::Flags flags_; + std::shared_ptr calibrator_{nullptr}; + session::LiteSession *fp32_session_{nullptr}; + Model *fp32_model_{nullptr}; + int activation_q_min_{INT8_MIN}; + int activation_q_max_{INT8_MAX}; + + session::LiteSession *int8_session_{nullptr}; + Model *int8_model_{nullptr}; + + std::map> fp32_op_input_map_; // concurrency + std::map> fp32_op_output_ch_mean_map_; // concurrency + std::map> op_bias_diff_map_; // only use by int8 model + std::mutex mutex_op_input_; + std::mutex mutex_op_output_; +}; +} // namespace mindspore::lite::quant +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BIASCORRECTION_H diff --git a/mindspore/lite/tools/converter/quantizer/bitpacking.h b/mindspore/lite/tools/converter/quantizer/bitpacking.h index be465e67a0f..9ff274dee29 100644 --- a/mindspore/lite/tools/converter/quantizer/bitpacking.h +++ b/mindspore/lite/tools/converter/quantizer/bitpacking.h @@ -14,9 +14,9 @@ * limitations under the 
License. */ -#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER__GENERAL_BITPACKING_H -#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER__GENERAL_BITPACKING_H -#include +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H +#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H +#include #include #include #include diff --git a/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.cc b/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.cc index 1de107a0ccb..0131df6162b 100644 --- a/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.cc +++ b/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.cc @@ -16,18 +16,11 @@ #include "tools/converter/quantizer/full_quant_quantizer.h" #include -#include #include #include #include -#include -#include -#include #include -#include #include -#include -#include "ops/fusion/full_connection.h" #include "ops/tuple_get_item.h" #include "src/tensor.h" #include "tools/converter/quantizer/quant_cast.h" @@ -41,6 +34,7 @@ #include "tools/common/node_util.h" #include "nnacl/op_base.h" #include "src/common/log_util.h" +#include "tools/converter/quantizer/bias_correction_strategy.h" using std::string; using std::vector; @@ -50,88 +44,10 @@ namespace { static const std::set has_bias_operator = {prim::kPrimConv2DFusion, prim::kPrimConv2dTransposeFusion, prim::kPrimMatMulFusion, prim::kPrimFullConnection, prim::kPrimLayerNormFusion}; -constexpr int kHasBiasTensorSize = 3; -constexpr int KBiasBitNum = 32; -const char *kTypeConv2D = schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion); } // namespace -namespace { -int ComputeBiasDataAndQuantParam(const std::vector &bias_scales, const std::vector &input_scales, - const float *raw_datas, const QuantParamHolderPtr &quant_param_holder, - std::vector *quant_params, std::vector *quant_datas) { - MS_ASSERT(raw_datas != nullptr && quant_param_holder != nullptr); - MS_ASSERT(quant_params != nullptr && quant_datas != nullptr); - double bias_scale_tmp; - const constexpr double quanted_bias_abs_limit = 0.5 * INT32_MAX; - MS_CHECK_TRUE_MSG(quant_param_holder->get_input_quant_params().size() > 1, RET_ERROR, "invalid access."); - auto weight_quant_params = quant_param_holder->get_input_quant_params().at(1); - auto shape_size = quant_datas->size(); - if (bias_scales.size() == shape_size) { - for (size_t i = 0; i < shape_size; i++) { - bias_scale_tmp = bias_scales[i]; - if (fabs(bias_scale_tmp) <= 0.0f) { - MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0."; - return RET_ERROR; - } - if (std::abs(raw_datas[i] / bias_scale_tmp) >= quanted_bias_abs_limit) { - MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[i].scale - << " is too small, need to update"; - // update filter scale and zp - double activate_scale = input_scales[0]; - double filter_scale = std::abs(raw_datas[i]) / (activate_scale * quanted_bias_abs_limit); - weight_quant_params[i].scale = filter_scale; - weight_quant_params[i].zeroPoint = 0; - quant_param_holder->set_input_quant_param(1, weight_quant_params); - bias_scale_tmp = std::abs(raw_datas[i]) / quanted_bias_abs_limit; - quant_params->at(i).scale = bias_scale_tmp; - MS_LOG(DEBUG) << "new filter scale: " << filter_scale; - } - auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp); - quant_datas->at(i) = quant_data; - } - return RET_OK; - } else if (bias_scales.size() == 1) { - // for fc, per tensor quant - bias_scale_tmp = quant_params->front().scale; - float max_raw_data = 0.0f; - for (size_t i 
= 0; i < shape_size; i++) { - if (std::abs(raw_datas[i]) > max_raw_data) { - max_raw_data = std::abs(raw_datas[i]); - } - } - if (fabs(bias_scale_tmp) <= 0.0f) { - MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0."; - return RET_ERROR; - } - if (std::abs(max_raw_data / bias_scale_tmp) >= quanted_bias_abs_limit) { - MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[0].scale - << " is too small, need to update"; - double activate_scale = input_scales[0]; - MS_CHECK_TRUE_MSG(activate_scale != 0, RET_ERROR, "activate_scale == 0"); - double filter_scale = std::abs(max_raw_data) / (activate_scale * quanted_bias_abs_limit); - weight_quant_params[0].scale = filter_scale; - weight_quant_params[0].zeroPoint = 0; - quant_param_holder->set_input_quant_param(1, weight_quant_params); - bias_scale_tmp = max_raw_data / quanted_bias_abs_limit; - quant_params->front().scale = bias_scale_tmp; - MS_LOG(DEBUG) << "new filter scale: " << filter_scale; - } - for (size_t i = 0; i < shape_size; i++) { - auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp); - quant_datas->at(i) = quant_data; - } - return RET_OK; - } - MS_LOG(ERROR) << "unexpected input_scales size: " << input_scales.size() - << " weight_scales size: " << weight_quant_params.size(); - return RET_ERROR; -} -} // namespace - FullQuantQuantizer::~FullQuantQuantizer() { delete fp32_session_; delete fp32_model_; - delete int8_session_; - delete int8_model_; } int FullQuantQuantizer::SetInOutQuantParam(const AnfNodePtr &input_node, const std::unique_ptr &info, @@ -206,94 +122,6 @@ int FullQuantQuantizer::DoValueNodeWeightQuant(const ValueNodePtr &weight, const return RET_OK; } -int FullQuantQuantizer::DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive) { - CHECK_NULL_RETURN(bias); - CHECK_NULL_RETURN(primitive); - auto bias_default_param = bias->default_param(); - auto bias_param = bias_default_param->cast(); - MS_ASSERT(bias_parameter != nullptr); - auto quant_param_holder = GetCNodeQuantHolder(primitive); - MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr."); - auto active_weight_quant_params = quant_param_holder->get_input_quant_params(); - - auto active_params = active_weight_quant_params.at(FIRST_INPUT); - auto weight_params = active_weight_quant_params.at(SECOND_INPUT); - - vector input_scales; - vector filter_scales; - vector bias_scales; - size_t sizeX = active_params.size(); - for (size_t i = 0; i < sizeX; i++) { - input_scales.emplace_back(active_params[i].scale); - } - size_t sizeY = weight_params.size(); - if (sizeX != sizeY) { - if (sizeX > 1 && sizeY > 1) { - MS_LOG(ERROR) << "input and filter's scale count cannot match!"; - return RET_ERROR; - } - } - for (size_t i = 0; i < sizeY; i++) { - filter_scales.emplace_back(weight_params[i].scale); - } - size_t size = std::max(sizeX, sizeY); - for (size_t i = 0; i < size; i++) { - auto scaleX = sizeX > 1 ? input_scales[i] : input_scales[0]; - auto scaleY = sizeY > 1 ? 
filter_scales[i] : filter_scales[0]; - bias_scales.push_back(scaleX * scaleY); - } - MS_ASSERT(!bias_scales.empty()); - size_t shape_size = bias_param->DataSize(); - - // set bias quant param - std::vector quant_params; - for (double bias_scale : bias_scales) { - schema::QuantParamT quant_param; - if (bias_scale == 0) { - MS_LOG(WARNING) << "bias_scale == 0"; - quant_param.scale = 1; - } else { - quant_param.scale = bias_scale; - } - quant_param.numBits = KBiasBitNum; - quant_param.zeroPoint = 0; - quant_param.inited = true; - quant_params.emplace_back(quant_param); - } - // quant bias data - std::vector quant_datas(shape_size); - - auto *raw_datas = static_cast(bias_param->data_c()); - if (ComputeBiasDataAndQuantParam(bias_scales, input_scales, raw_datas, quant_param_holder, &quant_params, - &quant_datas) != RET_OK) { - MS_LOG(ERROR) << "compute bias data failed."; - return RET_ERROR; - } - quant_param_holder->set_input_quant_param(THIRD_INPUT, quant_params); - auto ret = SetTensorData(bias_param, quant_datas.data(), shape_size * sizeof(int32_t)); - if (ret != RET_OK) { - MS_LOG(ERROR) << "set tensor data failed."; - return RET_ERROR; - } - // set dtype - auto abstractBase = bias->abstract(); - if (abstractBase == nullptr) { - MS_LOG(ERROR) << "Abstract of parameter is nullptr, " << bias->name(); - return RET_ERROR; - } - if (!utils::isa(abstractBase)) { - MS_LOG(ERROR) << "Abstract of parameter should be anstract tensor, " << bias->name(); - return RET_ERROR; - } - auto abstractTensor = utils::cast(abstractBase); - if (abstractTensor == nullptr || abstractTensor->element() == nullptr) { - MS_LOG(ERROR) << "abstractTensor is nullptr" << bias->name(); - return RET_NULL_PTR; - } - abstractTensor->element()->set_type(TypeIdToType(kNumberTypeInt32)); - return RET_OK; -} - int FullQuantQuantizer::IsSupportWeightQuant(const CNodePtr &cnode, const AnfNodePtr &input_node, size_t input_index) { auto primitive = GetValueNode(cnode->input(0)); if (primitive == nullptr) { @@ -628,17 +456,6 @@ int FullQuantQuantizer::MarkQuantNode(const FuncGraphPtr &func_graph) { MS_LOG(ERROR) << cnode->fullname_with_scope() << " add quantized op failed."; return ret; } - auto primitive = GetValueNode(cnode->input(0)); - if (primitive == nullptr) { - MS_LOG(ERROR) << cnode->fullname_with_scope() << " primitive is null"; - return RET_ERROR; - } - auto quant_param_holder = GetCNodeQuantHolder(primitive); - if (quant_param_holder == nullptr) { - MS_LOG(ERROR) << cnode->fullname_with_scope() << " quant_param_holder is null"; - return RET_ERROR; - } - quant_param_holder->ClearInputOutputQuantParam(); } } return RET_OK; @@ -658,7 +475,7 @@ int FullQuantQuantizer::PreProcess(const FuncGraphPtr &func_graph) { break; } InitQMinMax(); - calibrator_ = std::make_unique(this->bit_num_, activation_q_max_, activation_q_min_, + calibrator_ = std::make_shared(this->bit_num_, activation_q_max_, activation_q_min_, this->flags_.fullQuantParam.activation_quant_method, this->flags_.dataPreProcessParam, activation_symmetry_); MSLITE_CHECK_PTR(calibrator_); @@ -670,22 +487,6 @@ int FullQuantQuantizer::PreProcess(const FuncGraphPtr &func_graph) { return RET_OK; } -int FullQuantQuantizer::CheckFp32TensorVec(const std::string &node_name, - const std::vector &tensor_vec) { - if (tensor_vec.empty()) { - MS_LOG(ERROR) << "node: " << node_name << " input tensors is 0"; - return RET_ERROR; - } - auto *tensor = tensor_vec[0]; - CHECK_NULL_RETURN(tensor); - if (tensor->data_type() != kNumberTypeFloat32) { - MS_LOG(INFO) << "node: " << node_name << " 
will not quantize" - << " tensor data_type: " << tensor->data_type(); - return RET_ERROR; - } - return RET_OK; -} - int FullQuantQuantizer::DoInference(CollectType collect_type) { // get input tensor vector inputs = fp32_session_->GetInputs(); @@ -736,172 +537,6 @@ int FullQuantQuantizer::DoInference(CollectType collect_type) { return RET_OK; } -int FullQuantQuantizer::Int8Inference() { - // int8 inference - vector inputs = int8_session_->GetInputs(); - for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) { - for (size_t input_index = 0; input_index < inputs.size(); input_index++) { - int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]); - if (status != RET_OK) { - MS_LOG(ERROR) << "generate input data failed!"; - return RET_ERROR; - } - } - // before func - KernelCallBack before_call_back = GetBeforeCallBack(true); - // after func - KernelCallBack after_call_back = GetAfterCallBack(true); - int8_session_->BindThread(true); - auto status = int8_session_->RunGraph(before_call_back, after_call_back); - int8_session_->BindThread(false); - if (status != RET_OK) { - MS_LOG(ERROR) << "run model failed!"; - return RET_ERROR; - } - } // end for images - return RET_OK; -} - -int FullQuantQuantizer::BiasCorrection(const FuncGraphPtr &func_graph) { - std::future int8_inference = std::async(std::launch::async, &FullQuantQuantizer::Int8Inference, this); - // get input tensor - vector inputs = fp32_session_->GetInputs(); - // fp32 inference - for (size_t i = 0; i < calibrator_->GetBatchNum(); i++) { - for (size_t input_index = 0; input_index < inputs.size(); input_index++) { - int status = calibrator_->GenerateInputData(inputs[input_index]->tensor_name(), i, inputs[input_index]); - if (status != RET_OK) { - MS_LOG(ERROR) << "generate input data from images failed!"; - return RET_ERROR; - } - } - // before func - KernelCallBack before_call_back = GetBeforeCallBack(false); - // after func - KernelCallBack after_call_back = GetAfterCallBack(false); - fp32_session_->BindThread(true); - auto status = fp32_session_->RunGraph(before_call_back, after_call_back); - fp32_session_->BindThread(false); - if (status != RET_OK) { - MS_LOG(ERROR) << "run model failed!"; - return RET_ERROR; - } - } // end for images - - int status = int8_inference.get(); - if (status != RET_OK) { - MS_LOG(ERROR) << "int8 inference failed!"; - return RET_ERROR; - } - if (calibrator_->GetBatchNum() == 0) { - MS_LOG(ERROR) << "divisor 'calibrate_size' cannot be 0."; - return RET_ERROR; - } - for (auto &key_value : op_bias_diff_map_) { - std::for_each(key_value.second.begin(), key_value.second.end(), - [this](float &data) { data = data / calibrator_->GetBatchNum(); }); - } - auto cnodes = func_graph->GetOrderedCnodes(); - for (auto &cnode : cnodes) { - auto op_name = cnode->fullname_with_scope(); - if (op_bias_diff_map_.find(op_name) == op_bias_diff_map_.end()) { - continue; - } - status = BiasCorrection(func_graph, cnode); - if (status != RET_OK) { - MS_LOG(ERROR) << "do node bias correct failed."; - break; - } - } - return status; -} - -int FullQuantQuantizer::BiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode) { - auto op_name = cnode->fullname_with_scope(); - const auto &bias_diff = op_bias_diff_map_[op_name]; - auto primitive = GetValueNode(cnode->input(0)); - if (primitive == nullptr) { - MS_LOG(ERROR) << op_name << " primitive is nullptr"; - return RET_NULL_PTR; - } - auto quant_param_holder = GetCNodeQuantHolder(primitive); - MS_CHECK_TRUE_MSG(quant_param_holder 
!= nullptr, RET_NULL_PTR, "quant_param_holder is nullptr."); - auto input_quant_params = quant_param_holder->get_input_quant_params(); - if (input_quant_params.size() == kHasBiasTensorSize) { - // compensate the existed - auto bias_quant_params = input_quant_params.at(THIRD_INPUT); - auto bias = cnode->input(THIRD_INPUT + 1); - auto bias_parameter_ptr = bias->cast(); - auto bias_default_param = bias_parameter_ptr->default_param(); - auto bias_param = bias_default_param->cast(); - int *bias_datas = static_cast(bias_param->data_c()); - - if (static_cast(bias_param->DataSize()) != bias_diff.size()) { - MS_LOG(DEBUG) << op_name << " unexpected bias data count: " << bias_param->DataSize() - << " not the same as bias_diff: " << bias_diff.size(); - return RET_ERROR; - } - if (bias_quant_params.size() != bias_diff.size()) { - MS_LOG(ERROR) << op_name << " unexpected bias quant params size: " << bias_quant_params.size() - << " not the same as bias_diff: " << bias_diff.size(); - return RET_ERROR; - } - for (size_t i = 0; i < bias_param->DataSize(); i++) { - auto scale = bias_quant_params[i].scale; - if (fabs(scale) <= 0.0f) { - MS_LOG(ERROR) << op_name << " divisor 'scale' cannot be 0."; - return RET_ERROR; - } - double after_correct = std::round(bias_diff[i] / scale) + bias_datas[i]; - const constexpr int32_t corrected_bias_abs_limit = 0.6 * INT32_MAX; - if (after_correct > corrected_bias_abs_limit) { - MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too large: " << after_correct - << " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i] << " scale: " << scale; - bias_datas[i] = static_cast(corrected_bias_abs_limit); - } else if (after_correct < -corrected_bias_abs_limit) { - MS_LOG(WARNING) << op_name << " ch: " << i << " bias after_corrected too small: " << after_correct - << " origin value: " << bias_datas[i] << " bias_diff: " << bias_diff[i] << " scale: " << scale; - bias_datas[i] = static_cast(-corrected_bias_abs_limit); - } else { - auto diff = static_cast(std::round(bias_diff[i] / scale)); - bias_datas[i] += diff; - } - } - } else if (input_quant_params.size() == kHasBiasTensorSize - 1) { - MS_LOG(INFO) << op_name << " add bias input"; - // need to add bias input - auto parameter = func_graph->add_parameter(); - if (parameter == nullptr) { - MS_LOG(ERROR) << "parameter is nullptr."; - return RET_NULL_PTR; - } - ShapeVector shape; - shape.push_back(bias_diff.size()); - - auto tensor_info = CreateTensorInfo(bias_diff.data(), sizeof(float) * bias_diff.size(), shape, kNumberTypeFloat32); - if (tensor_info == nullptr) { - MS_LOG(ERROR) << op_name << " create tensor info failed."; - return RET_ERROR; - } - auto status = InitParameterFromTensorInfo(parameter, tensor_info); - if (status != RET_OK) { - MS_LOG(ERROR) << op_name << " init parameter from tensor info failed"; - return RET_ERROR; - } - parameter->set_name("added_" + op_name + "_bias"); - cnode->add_input(parameter); - status = DoParameterBiasQuant(parameter, primitive); - if (status != RET_OK) { - MS_LOG(ERROR) << op_name << " Do bias quant failed."; - return RET_ERROR; - } - } else { - MS_LOG(WARNING) << op_name << " unexpected size: " << input_quant_params.size() - << ", and shared weight tensor does not support bias correction temporarily."; - } - return RET_OK; -} - int FullQuantQuantizer::DoQuantize(FuncGraphPtr func_graph) { MS_LOG(INFO) << "start to parse config file"; if (flags_.dataPreProcessParam.calibrate_path.empty()) { @@ -968,283 +603,17 @@ int FullQuantQuantizer::DoQuantize(FuncGraphPtr 
func_graph) { ReturnCode::GetSingleReturnCode()->UpdateReturnCode(status); return RET_ERROR; } - SessionModel int8_sm; if (this->flags_.fullQuantParam.bias_correction) { - // init in8 session - MS_LOG(INFO) << "create quant session"; - flags_.commonQuantParam.quant_type = schema::QuantType_QUANT_ALL; - int8_sm = CreateSessionByFuncGraph(func_graph, flags_, this->flags_.commonQuantParam.thread_num); - int8_session_ = int8_sm.session; - int8_model_ = int8_sm.model; - if (int8_session_ == nullptr || int8_model_ == nullptr) { - MS_LOG(ERROR) << "create session failed!"; - return RET_ERROR; - } MS_LOG(INFO) << "do bias correction"; - status = BiasCorrection(func_graph); + BiasCorrectionStrategy strategy(flags_, calibrator_, fp32_session_, fp32_model_, activation_q_min_, + activation_q_max_); + status = strategy.DoBiasCorrection(func_graph); if (status != RET_OK) { - MS_LOG(ERROR) << "BiasCorrection failed."; + MS_LOG(ERROR) << "bias_correction failed."; return status; } } } return RET_OK; } - -bool FullQuantQuantizer::OpInputDataHandle(OperationType type, const string &op_name, std::vector *data) { - MS_ASSERT(data != nullptr); - std::lock_guard lg(mutex_op_input_); - if (type == STORE) { - if (fp32_op_input_map_.find(op_name) != fp32_op_input_map_.end()) { - // the data has not been fetched by int8 model - return false; - } - fp32_op_input_map_[op_name] = *data; - return true; - } else if (type == FETCH) { - if (fp32_op_input_map_.find(op_name) == fp32_op_input_map_.end()) { - // the data not generated by fp32 model yet - return false; - } - *data = fp32_op_input_map_[op_name]; - fp32_op_input_map_.erase(op_name); - return true; - } else { - MS_LOG(ERROR) << "unexpected type: " << type; - } - return false; -} - -bool FullQuantQuantizer::OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector *data) { - MS_ASSERT(data != nullptr); - std::lock_guard lg(mutex_op_output_); - if (type == STORE) { - if (fp32_op_output_ch_mean_map_.find(op_name) != fp32_op_output_ch_mean_map_.end()) { - // the data has not been fetched by int8 model - return false; - } - fp32_op_output_ch_mean_map_[op_name] = *data; - return true; - } else if (type == FETCH) { - if (fp32_op_output_ch_mean_map_.find(op_name) == fp32_op_output_ch_mean_map_.end()) { - // the data not generated by fp32 model yet - return false; - } - *data = fp32_op_output_ch_mean_map_[op_name]; - fp32_op_output_ch_mean_map_.erase(op_name); - return true; - } else { - MS_LOG(ERROR) << "unexpected type: " << type; - } - return false; -} - -KernelCallBack FullQuantQuantizer::GetBeforeCallBack(bool int8_op) { - KernelCallBack before_call_back; - if (!int8_op) { - before_call_back = [this](const std::vector &before_inputs, - const std::vector &before_outputs, - const CallBackParam &callParam) -> bool { - if (callParam.node_type == kTypeConv2D) { - if (FullQuantQuantizer::CheckFp32TensorVec(callParam.node_name, before_inputs) != RET_OK) { - return true; - } - auto tensor = before_inputs[0]; - MS_ASSERT(tensor != nullptr); - size_t elem_count = tensor->ElementsNum(); - MS_CHECK_GT(elem_count, 0, false); - std::vector fp32_op_input(elem_count); - auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size()); - if (ret != EOK) { - MS_LOG(ERROR) << "memcpy error: " << ret; - return false; - } - while (!OpInputDataHandle(STORE, callParam.node_name, &fp32_op_input)) { - std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); - } - } - return true; - }; - } else { - 
before_call_back = [this](const std::vector &before_inputs, - const std::vector &before_outputs, - const CallBackParam &callParam) -> bool { - if (callParam.node_type == kTypeConv2D) { - vector fp32_op_input; - while (!OpInputDataHandle(FETCH, callParam.node_name, &fp32_op_input)) { - std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); - } - auto tensor = before_inputs[0]; - MS_ASSERT(tensor != nullptr); - // op can be skipped. - if (tensor->data_type() != kNumberTypeInt8) { - MS_LOG(INFO) << "tensor type is " << tensor->data_type(); - return true; - } - // do quantization: activation is always per layer quantized - std::vector quant_datas; - auto quant_params = tensor->quant_params(); - if (quant_params.size() != 1) { - MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size(); - return false; - } - schema::QuantParamT quant_param_t; - quant_param_t.scale = quant_params[0].scale; - quant_param_t.zeroPoint = quant_params[0].zeroPoint; - for (auto float_data : fp32_op_input) { - auto quant_data = QuantizeData(float_data, &quant_param_t, activation_q_max_, activation_q_min_); - quant_datas.push_back(quant_data); - } - - if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) { - MS_LOG(ERROR) << "unexpected tensor size: " << quant_datas.size() - << " not the same with: " << quant_datas.size() * sizeof(int8_t); - return false; - } - - auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t)); - if (ret != EOK) { - MS_LOG(ERROR) << "memcpy error: " << ret; - return false; - } - } - return true; - }; - } - return before_call_back; -} - -KernelCallBack FullQuantQuantizer::GetAfterCallBack(bool int8_op) { - KernelCallBack after_call_back; - if (!int8_op) { - return GetFloatAfterCallBack(); - } - return GetInt8AfterCallBack(); -} - -KernelCallBack FullQuantQuantizer::GetInt8AfterCallBack() { - KernelCallBack after_call_back = [this](const std::vector &afterInputs, - const std::vector &afterOutputs, - const CallBackParam &callParam) -> bool { - if (callParam.node_type == kTypeConv2D) { - vector fp32_op_output_ch_mean; - while (!OpOutputChMeanDataHandle(FETCH, callParam.node_name, &fp32_op_output_ch_mean)) { - std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); - } - auto tensor = afterOutputs[0]; - MS_ASSERT(tensor != nullptr); - // op can be skipped. 
- if (tensor->data_type() != kNumberTypeInt8) { - MS_LOG(INFO) << "tensor type is " << tensor->data_type(); - return true; - } - const int8_t *tensor_data = static_cast(tensor->data()); - size_t elem_count = tensor->ElementsNum(); - MS_CHECK_GT(elem_count, 0, false); - auto shapes = tensor->shape(); - if (shapes.size() != DIMENSION_4D) { - MS_LOG(ERROR) << "unexpected shape size: " << shapes.size(); - return false; - } - // suppose the the format is NHWC - auto channels = shapes[FOURTH_INPUT]; - if (channels == 0) { - MS_LOG(ERROR) << "unexpected channels: 0"; - return false; - } - auto quant_params = tensor->quant_params(); - if (quant_params.size() != 1) { - MS_LOG(ERROR) << "unexpected activatation quant_params size: " << quant_params.size(); - return false; - } - auto scale = quant_params[0].scale; - auto zp = quant_params[0].zeroPoint; - std::vector dequant_op_output_ch_mean(channels); - auto one_filter_size = elem_count / channels; - for (int i = 0; i < channels; i++) { - float sum = 0; - for (size_t j = 0; j < one_filter_size; j++) { - auto index = j * channels + i; - if (index >= elem_count) { - MS_LOG(ERROR) << "over flow!"; - return false; - } - // deuqant activation - auto float_data = scale * (tensor_data[index] - zp); - sum += float_data; - } - if (one_filter_size == 0) { - MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0."; - return false; - } - sum = sum / one_filter_size; - dequant_op_output_ch_mean[i] = sum; - } - std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(), - dequant_op_output_ch_mean.begin(), std::minus<>()); - - if (op_bias_diff_map_.find(callParam.node_name) != op_bias_diff_map_.end()) { - auto &bias_diff = op_bias_diff_map_[callParam.node_name]; - std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(), - std::plus<>()); - } else { - op_bias_diff_map_[callParam.node_name] = dequant_op_output_ch_mean; - } - } - return true; - }; - return after_call_back; -} - -KernelCallBack FullQuantQuantizer::GetFloatAfterCallBack() { - KernelCallBack after_call_back = [this](const std::vector &afterInputs, - const std::vector &afterOutputs, - const CallBackParam &callParam) -> bool { - if (callParam.node_type == kTypeConv2D) { - if (FullQuantQuantizer::CheckFp32TensorVec(callParam.node_name, afterOutputs) != RET_OK) { - return true; - } - auto tensor = afterOutputs[0]; - MS_ASSERT(tensor != nullptr); - const auto *tensor_data = static_cast(tensor->data()); - size_t elem_count = tensor->ElementsNum(); - MS_CHECK_GT(elem_count, 0, false); - auto shapes = tensor->shape(); - if (shapes.size() != DIMENSION_4D) { - MS_LOG(ERROR) << "unexpected shape size: " << shapes.size(); - return false; - } - // suppose the activation format: NHWC - auto channels = shapes[FOURTH_INPUT]; - if (channels == 0) { - MS_LOG(ERROR) << "unexpected channels: 0"; - return false; - } - std::vector fp32_op_output_ch_mean(channels); - auto one_filter_size = elem_count / channels; - for (int i = 0; i < channels; i++) { - float sum = 0; - for (size_t j = 0; j < one_filter_size; j++) { - auto index = j * channels + i; - if (index >= elem_count) { - MS_LOG(ERROR) << "over flow!"; - return false; - } - sum += tensor_data[index]; - } - if (one_filter_size == 0) { - MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0."; - return false; - } - sum = sum / one_filter_size; - fp32_op_output_ch_mean[i] = sum; - } - while (!OpOutputChMeanDataHandle(STORE, callParam.node_name, &fp32_op_output_ch_mean)) { 
- std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase)); - } - } - return true; - }; - return after_call_back; -} } // namespace mindspore::lite::quant diff --git a/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.h b/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.h index 156f04f86ed..e46f4b2ce09 100644 --- a/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.h +++ b/mindspore/lite/tools/converter/quantizer/full_quant_quantizer.h @@ -39,11 +39,6 @@ #include "src/common/quant_utils.h" namespace mindspore::lite::quant { -enum OperationType { - STORE, - FETCH, -}; - class FullQuantQuantizer : public Quantizer { public: explicit FullQuantQuantizer(const converter::Flags &flags) : Quantizer(flags) { @@ -55,45 +50,20 @@ class FullQuantQuantizer : public Quantizer { int DoQuantize(FuncGraphPtr func_graph) override; private: - bool OpInputDataHandle(OperationType type, const string &op_name, std::vector *data); - - bool OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector *data); - int PreProcess(const FuncGraphPtr &func_graph); - - int CheckFp32TensorVec(const std::string &node_name, const std::vector &tensor_vec); - int DoInference(CollectType collect_type); - int UpdateDivergeInterval(); - int QuantNodeSimpleOp(const CNodePtr &cnode); - int QuantNode(const FuncGraphPtr &func_graph); - int SetInOutQuantParam(const AnfNodePtr &input_node, const std::unique_ptr &info, const PrimitivePtr &primitive, bool is_input, size_t index) const; - int DoParameterWeightQuant(const ParameterPtr &weight, const PrimitivePtr &primitive, bool per_channel, int input_index) const; - int DoValueNodeWeightQuant(const ValueNodePtr &weight, const PrimitivePtr &primitive, bool per_channel, int input_index) const; - int DoParameterNodeQuant(const CNodePtr &cnode, const ParameterPtr &input_node, size_t input_index); - int DoValueNodeQuant(const CNodePtr &cnode, const ValueNodePtr &input_node, size_t input_index); - int IsSupportWeightQuant(const CNodePtr &cnode, const AnfNodePtr &input_node, size_t input_index); - - int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive); - int Int8Inference(); - int BiasCorrection(const FuncGraphPtr &func_graph); - int BiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode); - KernelCallBack GetBeforeCallBack(bool int8_op); - KernelCallBack GetAfterCallBack(bool int8_op); - KernelCallBack GetInt8AfterCallBack(); - KernelCallBack GetFloatAfterCallBack(); void InitQMinMax(); void InitCpuConfig(); void InitKirinConfig(); @@ -117,17 +87,9 @@ class FullQuantQuantizer : public Quantizer { std::set per_channel_ops_; std::set support_activation_; - std::unique_ptr calibrator_{nullptr}; + std::shared_ptr calibrator_{nullptr}; session::LiteSession *fp32_session_{nullptr}; Model *fp32_model_{nullptr}; - session::LiteSession *int8_session_{nullptr}; - Model *int8_model_{nullptr}; - - std::map> fp32_op_input_map_; // concurrency - std::map> fp32_op_output_ch_mean_map_; // concurrency - std::map> op_bias_diff_map_; // only use by int8 model - std::mutex mutex_op_input_; - std::mutex mutex_op_output_; // key is tensor_name std::map> weight_quant_params_bak; diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.cc b/mindspore/lite/tools/converter/quantizer/quantize_util.cc index 197997a07d8..97a92ce913d 100644 --- a/mindspore/lite/tools/converter/quantizer/quantize_util.cc +++ b/mindspore/lite/tools/converter/quantizer/quantize_util.cc @@ -46,7 +46,79 @@ constexpr int 
kSingleDirBiasTensorSize = 4; constexpr int kLstmBiasShapeSize = 2; constexpr int kLstmBiasIndex = 3; constexpr size_t kBitNumPerByte = 8; + +int ComputeBiasDataAndQuantParam(const std::vector &bias_scales, const std::vector &input_scales, + const float *raw_datas, const QuantParamHolderPtr &quant_param_holder, + std::vector *quant_params, std::vector *quant_datas) { + MS_ASSERT(raw_datas != nullptr && quant_param_holder != nullptr); + MS_ASSERT(quant_params != nullptr && quant_datas != nullptr); + double bias_scale_tmp; + const constexpr double quanted_bias_abs_limit = 0.5 * INT32_MAX; + MS_CHECK_TRUE_MSG(quant_param_holder->get_input_quant_params().size() > 1, RET_ERROR, "invalid access."); + auto weight_quant_params = quant_param_holder->get_input_quant_params().at(1); + auto shape_size = quant_datas->size(); + if (bias_scales.size() == shape_size) { + for (size_t i = 0; i < shape_size; i++) { + bias_scale_tmp = bias_scales[i]; + if (fabs(bias_scale_tmp) <= 0.0f) { + MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0."; + return RET_ERROR; + } + if (std::abs(raw_datas[i] / bias_scale_tmp) >= quanted_bias_abs_limit) { + MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[i].scale + << " is too small, need to update"; + // update filter scale and zp + double activate_scale = input_scales[0]; + double filter_scale = std::abs(raw_datas[i]) / (activate_scale * quanted_bias_abs_limit); + weight_quant_params[i].scale = filter_scale; + weight_quant_params[i].zeroPoint = 0; + quant_param_holder->set_input_quant_param(1, weight_quant_params); + bias_scale_tmp = std::abs(raw_datas[i]) / quanted_bias_abs_limit; + quant_params->at(i).scale = bias_scale_tmp; + MS_LOG(DEBUG) << "new filter scale: " << filter_scale; + } + auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp); + quant_datas->at(i) = quant_data; + } + return RET_OK; + } else if (bias_scales.size() == 1) { + // for fc, per tensor quant + bias_scale_tmp = quant_params->front().scale; + float max_raw_data = 0.0f; + for (size_t i = 0; i < shape_size; i++) { + if (std::abs(raw_datas[i]) > max_raw_data) { + max_raw_data = std::abs(raw_datas[i]); + } + } + if (fabs(bias_scale_tmp) <= 0.0f) { + MS_LOG(ERROR) << "divisor 'bias_scale_tmp' cannot be 0."; + return RET_ERROR; + } + if (std::abs(max_raw_data / bias_scale_tmp) >= quanted_bias_abs_limit) { + MS_LOG(DEBUG) << "quanted bias over flow, maybe the scale of weight: " << weight_quant_params[0].scale + << " is too small, need to update"; + double activate_scale = input_scales[0]; + MS_CHECK_TRUE_MSG(activate_scale != 0, RET_ERROR, "activate_scale == 0"); + double filter_scale = std::abs(max_raw_data) / (activate_scale * quanted_bias_abs_limit); + weight_quant_params[0].scale = filter_scale; + weight_quant_params[0].zeroPoint = 0; + quant_param_holder->set_input_quant_param(1, weight_quant_params); + bias_scale_tmp = max_raw_data / quanted_bias_abs_limit; + quant_params->front().scale = bias_scale_tmp; + MS_LOG(DEBUG) << "new filter scale: " << filter_scale; + } + for (size_t i = 0; i < shape_size; i++) { + auto quant_data = (int32_t)std::round(raw_datas[i] / bias_scale_tmp); + quant_datas->at(i) = quant_data; + } + return RET_OK; + } + MS_LOG(ERROR) << "unexpected input_scales size: " << input_scales.size() + << " weight_scales size: " << weight_quant_params.size(); + return RET_ERROR; +} } // namespace + QuantParamHolderPtr GetCNodeQuantHolder(const PrimitivePtr &primitive) { MS_CHECK_TRUE_RET(primitive != nullptr, nullptr); 
 QuantParamHolderPtr quant_params_holder = nullptr;
@@ -459,4 +531,92 @@ std::string BoolVectorToString(const std::vector<bool> &bool_vec) {
   }
   return str;
 }
+
+int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive) {
+  CHECK_NULL_RETURN(bias);
+  CHECK_NULL_RETURN(primitive);
+  auto bias_default_param = bias->default_param();
+  auto bias_param = bias_default_param->cast<tensor::TensorPtr>();
+  MS_ASSERT(bias_param != nullptr);
+  auto quant_param_holder = GetCNodeQuantHolder(primitive);
+  MS_CHECK_TRUE_MSG(quant_param_holder != nullptr, RET_NULL_PTR, "quant_param_holder is nullptr.");
+  auto active_weight_quant_params = quant_param_holder->get_input_quant_params();
+
+  auto active_params = active_weight_quant_params.at(FIRST_INPUT);
+  auto weight_params = active_weight_quant_params.at(SECOND_INPUT);
+
+  vector<double> input_scales;
+  vector<double> filter_scales;
+  vector<double> bias_scales;
+  size_t sizeX = active_params.size();
+  for (size_t i = 0; i < sizeX; i++) {
+    input_scales.emplace_back(active_params[i].scale);
+  }
+  size_t sizeY = weight_params.size();
+  if (sizeX != sizeY) {
+    if (sizeX > 1 && sizeY > 1) {
+      MS_LOG(ERROR) << "input and filter's scale count cannot match!";
+      return RET_ERROR;
+    }
+  }
+  for (size_t i = 0; i < sizeY; i++) {
+    filter_scales.emplace_back(weight_params[i].scale);
+  }
+  size_t size = std::max(sizeX, sizeY);
+  for (size_t i = 0; i < size; i++) {
+    auto scaleX = sizeX > 1 ? input_scales[i] : input_scales[0];
+    auto scaleY = sizeY > 1 ? filter_scales[i] : filter_scales[0];
+    bias_scales.push_back(scaleX * scaleY);
+  }
+  MS_ASSERT(!bias_scales.empty());
+  size_t shape_size = bias_param->DataSize();
+
+  // set bias quant param
+  std::vector<schema::QuantParamT> quant_params;
+  for (double bias_scale : bias_scales) {
+    schema::QuantParamT quant_param;
+    if (bias_scale == 0) {
+      MS_LOG(WARNING) << "bias_scale == 0";
+      quant_param.scale = 1;
+    } else {
+      quant_param.scale = bias_scale;
+    }
+    quant_param.numBits = k32Bit;
+    quant_param.zeroPoint = 0;
+    quant_param.inited = true;
+    quant_params.emplace_back(quant_param);
+  }
+  // quant bias data
+  std::vector<int32_t> quant_datas(shape_size);
+
+  auto *raw_datas = static_cast<float *>(bias_param->data_c());
+  if (ComputeBiasDataAndQuantParam(bias_scales, input_scales, raw_datas, quant_param_holder, &quant_params,
+                                   &quant_datas) != RET_OK) {
+    MS_LOG(ERROR) << "compute bias data failed.";
+    return RET_ERROR;
+  }
+  quant_param_holder->set_input_quant_param(THIRD_INPUT, quant_params);
+  auto ret = SetTensorData(bias_param, quant_datas.data(), shape_size * sizeof(int32_t));
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "set tensor data failed.";
+    return RET_ERROR;
+  }
+  // set dtype
+  auto abstractBase = bias->abstract();
+  if (abstractBase == nullptr) {
+    MS_LOG(ERROR) << "Abstract of parameter is nullptr, " << bias->name();
+    return RET_ERROR;
+  }
+  if (!utils::isa<abstract::AbstractTensorPtr>(abstractBase)) {
+    MS_LOG(ERROR) << "Abstract of parameter should be abstract tensor, " << bias->name();
+    return RET_ERROR;
+  }
+  auto abstractTensor = utils::cast<abstract::AbstractTensorPtr>(abstractBase);
+  if (abstractTensor == nullptr || abstractTensor->element() == nullptr) {
+    MS_LOG(ERROR) << "abstractTensor is nullptr, " << bias->name();
+    return RET_NULL_PTR;
+  }
+  abstractTensor->element()->set_type(TypeIdToType(kNumberTypeInt32));
+  return RET_OK;
+}
 }  // namespace mindspore::lite::quant
diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.h b/mindspore/lite/tools/converter/quantizer/quantize_util.h
index b320af594ab..e928d4401c3 100644
--- a/mindspore/lite/tools/converter/quantizer/quantize_util.h
+++ b/mindspore/lite/tools/converter/quantizer/quantize_util.h
@@ -96,6 +96,8 @@ int GetPreferredDim(const PrimitivePtr &primitive, int input_index, const std::vector<int> &dims)
 std::vector<int32_t> ConvertShapeVectorToInt32(const ShapeVector &dims);
 
+int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive);
+
 template <typename T>
 int FixedBitQuantFilter(const AnfNodePtr &parameter, const tensor::TensorPtr &weight, const PrimitivePtr &primitive,
                         QuantType quant_type, int quant_max, int quant_min, size_t bit_num,
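
Reviewer note: the arithmetic this patch moves into BiasCorrectionStrategy is compact enough to sketch in isolation. The sketch below is illustrative only and is not part of the patch; it assumes plain std::vector inputs, and the helper names ChannelMeans and CorrectBias are hypothetical. It mirrors the patch's flow: average each output channel over the N*H*W positions of an NHWC activation, take the calibration-averaged gap (fp32 mean minus dequantized int8 mean), convert it to int32 bias steps via the per-channel bias scale (input_scale * weight_scale), and clamp to 0.6 * INT32_MAX as DoCNodeBiasCorrection does.

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Per-channel mean of an NHWC activation: average over the N*H*W positions
    // of each channel, as the float/int8 after-callbacks do for each batch.
    std::vector<float> ChannelMeans(const std::vector<float> &data, size_t channels) {
      assert(channels != 0 && data.size() % channels == 0);
      const size_t rows = data.size() / channels;
      std::vector<float> means(channels, 0.0f);
      for (size_t j = 0; j < rows; ++j) {
        for (size_t c = 0; c < channels; ++c) {
          means[c] += data[j * channels + c];
        }
      }
      for (auto &m : means) {
        m /= rows;
      }
      return means;
    }

    // Fold the batch-averaged (fp32 mean - dequantized int8 mean) gap into the
    // int32 bias. bias_scales[i] is input_scale * weight_scale[i]; corrected
    // values are clamped to +/- 0.6 * INT32_MAX, matching the patch.
    void CorrectBias(const std::vector<float> &bias_diff, const std::vector<double> &bias_scales,
                     std::vector<int32_t> *bias) {
      constexpr int32_t kLimit = static_cast<int32_t>(0.6 * INT32_MAX);
      for (size_t i = 0; i < bias->size(); ++i) {
        const double step = std::round(bias_diff[i] / bias_scales[i]);
        const double corrected = step + (*bias)[i];
        if (corrected > kLimit) {
          (*bias)[i] = kLimit;
        } else if (corrected < -kLimit) {
          (*bias)[i] = -kLimit;
        } else {
          (*bias)[i] += static_cast<int32_t>(step);
        }
      }
    }

The 0.6 * INT32_MAX clamp presumably leaves headroom in the int32 accumulator; the patch does not state the rationale, so treat the constant as inherited behavior rather than a tuned value.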