!28283 Fix low-bit support bug in full quantization & optimize bias correction

Merge pull request !28283 from yeyunpeng2020/quant
i-robot 2021-12-29 02:29:49 +00:00 committed by Gitee
commit 195868c646
10 changed files with 289 additions and 263 deletions


@ -465,7 +465,7 @@ int MatmulBaseInt8CPUKernel::RunArm64Sdot() {
batch_input_ptr_ = a_ptr + i * param_->row_ * param_->deep_;
auto ret = ParallelLaunch(this->ms_context_, Arm64SdotPreRun, this, op_parameter_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "RunArm64Sdot error: [" << ret << "]";
MS_LOG(ERROR) << "Arm64SdotPreRun error: [" << ret << "]";
return ret;
}
@ -476,7 +476,7 @@ int MatmulBaseInt8CPUKernel::RunArm64Sdot() {
ret = ParallelLaunch(this->ms_context_, Arm64SdotRun, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "RunArm64Sdot error: [" << ret << "]";
MS_LOG(ERROR) << "Arm64SdotRun error: [" << ret << "]";
return ret;
}
}


@ -146,9 +146,9 @@ static STATUS CompressTensor(schema::TensorT *tensor_input, const std::unique_pt
repetition_packed = quant::PackRepetition<int16_t>(bit_num, tensor_input);
}
}
if (bit_num != kBitNum8 && bit_num != kBitNum16 && !repetition_packed &&
if (!tensor_input->data.empty() && bit_num != kBitNum8 && bit_num != kBitNum16 && !repetition_packed &&
dst_node->quantType != schema::QuantType_QUANT_NONE) {
auto status = DoBitPack(bit_num, tensor_input);
auto status = quant::DoBitPack(bit_num, tensor_input);
if (status != RET_OK) {
MS_LOG(ERROR) << "do bit pack failed. " << status;
return RET_ERROR;


@ -33,7 +33,6 @@
namespace mindspore {
namespace lite {
namespace {
enum QuantBitNum { QuantBitNum_INT8 = 8, QuantBitNum_INT16 = 16 };
const int kZeroPointGap = 128;
} // namespace
int SetFuncGraphOutput(const FuncGraphPtr &graph, const std::vector<AnfNodePtr> &outputs) {
@ -114,47 +113,6 @@ STATUS ReplaceTensorOfNode(schema::MetaGraphT *graphT, uint32_t nodeIdx, uint32_
return RET_OK;
}
int DoBitPack(const int &bit_num, schema::TensorT *tensor_input) {
if (bit_num > 0 && bit_num < 8) {
std::vector<int8_t> origin_data(tensor_input->data.size());
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int8_t), tensor_input->data.data(),
tensor_input->data.size() * sizeof(uint8_t));
if (status != EOK) {
MS_LOG(ERROR) << "memcpy failed. " << status;
return RET_ERROR;
}
std::vector<uint8_t> pack_data{};
BitPack::BitPacking<int8_t, uint8_t>(bit_num, origin_data, &pack_data);
tensor_input->data.resize(pack_data.size() * sizeof(uint8_t));
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
pack_data.size() * sizeof(uint8_t));
if (status != EOK) {
MS_LOG(ERROR) << "memcpy_s failed. " << status;
return RET_ERROR;
}
} else if (bit_num > QuantBitNum_INT8 && bit_num < QuantBitNum_INT16) {
auto shape_size =
std::accumulate(tensor_input->dims.begin(), tensor_input->dims.end(), size_t(1), std::multiplies<size_t>());
std::vector<int16_t> origin_data(shape_size);
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int16_t), tensor_input->data.data(),
tensor_input->data.size() * sizeof(uint8_t));
if (status != EOK) {
MS_LOG(ERROR) << "memcpy failed. " << status;
return RET_ERROR;
}
std::vector<uint16_t> pack_data{};
BitPack::BitPacking<int16_t, uint16_t>(bit_num, origin_data, &pack_data);
tensor_input->data.resize(pack_data.size() * sizeof(uint16_t));
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
pack_data.size() * sizeof(uint16_t));
if (status != EOK) {
MS_LOG(ERROR) << "memcpy_s failed. " << status;
return RET_ERROR;
}
}
return RET_OK;
}
NodeIter InsertNode(schema::MetaGraphT *graphT, uint32_t existNodeIdx, InsertPlace place, size_t inoutIndex,
std::unique_ptr<CNodeT> toAddNode, STATUS *errorCode, int *insert_num,
const OpDefCopyer &opDefCopyer) {


@ -56,8 +56,6 @@ STATUS AddTensor2Node(schema::MetaGraphT *graphT, uint32_t nodeIdx, std::unique_
STATUS ReplaceTensorOfNode(schema::MetaGraphT *graphT, uint32_t nodeIdx, uint32_t inTensorIdx,
std::unique_ptr<schema::TensorT> tensor);
int DoBitPack(const int &bit_num, schema::TensorT *tensor_input);
NodeIter InsertNode(schema::MetaGraphT *graphT, uint32_t existNodeIdx, InsertPlace place, size_t inoutIndex,
std::unique_ptr<schema::CNodeT> toAddNode, STATUS *errorCode, int *insert_num,
const OpDefCopyer &opDefCopyer = GetSimpleOpCopyer());


@ -199,7 +199,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
return RET_ERROR;
}
int bit_num = tensor->quantParams.front()->numBits;
if (DoBitPack(bit_num, tensor.get()) != RET_OK) {
if (quant::DoBitPack(bit_num, tensor.get()) != RET_OK) {
MS_LOG(ERROR) << "bit pack failed.";
return RET_ERROR;
}
@ -212,7 +212,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
quantParam->dstDtype == TypeId::kNumberTypeFloat32 || quantParam->dstDtype == TypeId::kNumberTypeFloat) {
status = ComputeDataToInt8(tensor);
int bit_num = tensor->quantParams.front()->numBits;
if (DoBitPack(bit_num, tensor.get()) != RET_OK) {
if (quant::DoBitPack(bit_num, tensor.get()) != RET_OK) {
MS_LOG(ERROR) << "bit pack failed.";
return RET_ERROR;
}


@ -33,24 +33,9 @@
namespace mindspore::lite::quant {
namespace {
constexpr int kHasBiasTensorSize = 3;
const char *kTypeConv2D = schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion);
const std::set<std::string> kSupportBiasCorrectionNode = {
schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion)};
} // namespace
int BiasCorrectionStrategy::CheckFp32TensorVec(const std::string &node_name,
const std::vector<mindspore::tensor::MSTensor *> &tensor_vec) {
if (tensor_vec.empty()) {
MS_LOG(ERROR) << "node: " << node_name << " input tensors is 0";
return RET_ERROR;
}
auto *tensor = tensor_vec[0];
CHECK_NULL_RETURN(tensor);
if (tensor->data_type() != kNumberTypeFloat32) {
MS_LOG(INFO) << "node: " << node_name << " will not quantize"
<< " tensor data_type: " << tensor->data_type();
return RET_ERROR;
}
return RET_OK;
}
bool BiasCorrectionStrategy::OpInputDataHandle(OperationType type, const string &op_name, std::vector<float> *data) {
MS_ASSERT(data != nullptr);
std::lock_guard<std::mutex> lg(mutex_op_input_);
@ -103,76 +88,9 @@ bool BiasCorrectionStrategy::OpOutputChMeanDataHandle(OperationType type, const
KernelCallBack BiasCorrectionStrategy::GetBeforeCallBack(bool int8_op) {
KernelCallBack before_call_back;
if (!int8_op) {
before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
if (CheckFp32TensorVec(callParam.node_name, before_inputs) != RET_OK) {
return true;
}
auto tensor = before_inputs[0];
MS_ASSERT(tensor != nullptr);
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
std::vector<float> fp32_op_input(elem_count);
auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size());
if (ret != EOK) {
MS_LOG(ERROR) << "memcpy error: " << ret;
return false;
}
while (!OpInputDataHandle(STORE, callParam.node_name, &fp32_op_input)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
}
return true;
};
} else {
before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
std::vector<float> fp32_op_input;
while (!OpInputDataHandle(FETCH, callParam.node_name, &fp32_op_input)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
auto tensor = before_inputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeInt8) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
// do quantization: activation is always per layer quantized
std::vector<int8_t> quant_datas;
auto quant_params = tensor->quant_params();
if (quant_params.size() != 1) {
MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size();
return false;
}
schema::QuantParamT quant_param_t;
quant_param_t.scale = quant_params[0].scale;
quant_param_t.zeroPoint = quant_params[0].zeroPoint;
for (auto float_data : fp32_op_input) {
auto quant_data = QuantizeData<int8_t>(float_data, &quant_param_t, activation_q_max_, activation_q_min_);
quant_datas.push_back(quant_data);
}
if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) {
MS_LOG(ERROR) << "unexpected tensor size: " << quant_datas.size()
<< " not the same with: " << quant_datas.size() * sizeof(int8_t);
return false;
}
auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t));
if (ret != EOK) {
MS_LOG(ERROR) << "memcpy error: " << ret;
return false;
}
}
return true;
};
return GetFloatBeforeCallBack();
}
return before_call_back;
return GetInt8BeforeCallBack();
}
KernelCallBack BiasCorrectionStrategy::GetAfterCallBack(bool int8_op) {
@ -183,74 +101,134 @@ KernelCallBack BiasCorrectionStrategy::GetAfterCallBack(bool int8_op) {
return GetInt8AfterCallBack();
}
KernelCallBack BiasCorrectionStrategy::GetInt8AfterCallBack() {
KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
std::vector<float> fp32_op_output_ch_mean;
while (!OpOutputChMeanDataHandle(FETCH, callParam.node_name, &fp32_op_output_ch_mean)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
auto tensor = afterOutputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeInt8) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
const int8_t *tensor_data = static_cast<int8_t *>(tensor->data());
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
auto shapes = tensor->shape();
if (shapes.size() != DIMENSION_4D) {
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
return false;
}
// suppose the format is NHWC
auto channels = shapes[FOURTH_INPUT];
if (channels == 0) {
MS_LOG(ERROR) << "unexpected channels: 0";
return false;
}
auto quant_params = tensor->quant_params();
if (quant_params.size() != 1) {
MS_LOG(ERROR) << "unexpected activatation quant_params size: " << quant_params.size();
return false;
}
auto scale = quant_params[0].scale;
auto zp = quant_params[0].zeroPoint;
std::vector<float> dequant_op_output_ch_mean(channels);
auto one_filter_size = elem_count / channels;
for (int i = 0; i < channels; i++) {
float sum = 0;
for (size_t j = 0; j < one_filter_size; j++) {
auto index = j * channels + i;
if (index >= elem_count) {
MS_LOG(ERROR) << "over flow!";
return false;
}
// dequant activation
auto float_data = scale * (tensor_data[index] - zp);
sum += float_data;
}
if (one_filter_size == 0) {
MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
return false;
}
sum = sum / one_filter_size;
dequant_op_output_ch_mean[i] = sum;
}
std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(),
dequant_op_output_ch_mean.begin(), std::minus<>());
KernelCallBack BiasCorrectionStrategy::GetFloatBeforeCallBack() {
auto before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &call_param) -> bool {
if (kSupportBiasCorrectionNode.find(call_param.node_type) == kSupportBiasCorrectionNode.end()) {
return true;
}
auto tensor = before_inputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeFloat32) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
std::vector<float> fp32_op_input(elem_count);
auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size());
if (ret != EOK) {
MS_LOG(ERROR) << "memcpy error: " << ret;
return false;
}
while (!OpInputDataHandle(STORE, call_param.node_name, &fp32_op_input)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
return true;
};
return before_call_back;
}
if (op_bias_diff_map_.find(callParam.node_name) != op_bias_diff_map_.end()) {
auto &bias_diff = op_bias_diff_map_[callParam.node_name];
std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(),
std::plus<>());
} else {
op_bias_diff_map_[callParam.node_name] = dequant_op_output_ch_mean;
}
KernelCallBack BiasCorrectionStrategy::GetInt8BeforeCallBack() {
auto before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &call_param) -> bool {
if (kSupportBiasCorrectionNode.find(call_param.node_type) == kSupportBiasCorrectionNode.end()) {
return true;
}
auto tensor = before_inputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeInt8) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
// Get origin data
std::vector<float> fp32_op_input;
while (!OpInputDataHandle(FETCH, call_param.node_name, &fp32_op_input)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
// do quantization: activation is always per layer quantized
std::vector<int8_t> quant_datas;
auto quant_params = tensor->quant_params();
if (quant_params.size() != 1) {
MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size();
return false;
}
schema::QuantParamT quant_param_t;
quant_param_t.scale = quant_params[0].scale;
quant_param_t.zeroPoint = quant_params[0].zeroPoint;
for (auto float_data : fp32_op_input) {
auto quant_data = QuantizeData<int8_t>(float_data, &quant_param_t, activation_q_max_, activation_q_min_);
quant_datas.push_back(quant_data);
}
if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) {
MS_LOG(ERROR) << "unexpected tensor size: " << quant_datas.size()
<< " not the same with: " << quant_datas.size() * sizeof(int8_t);
return false;
}
auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t));
if (ret != EOK) {
MS_LOG(ERROR) << "memcpy error: " << ret;
return false;
}
return true;
};
return before_call_back;
}
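For reference, the per-layer activation quantization invoked through QuantizeData above is the usual affine scheme. A minimal sketch of that formula, assuming QuantizeData rounds to nearest and clamps to [activation_q_min_, activation_q_max_] (the helper itself is not part of this diff):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Editorial sketch of per-layer affine quantization; not the project's QuantizeData.
inline int8_t QuantizeOneValue(float x, double scale, int zero_point, int q_min, int q_max) {
  int q = static_cast<int>(std::round(x / scale)) + zero_point;
  q = std::min(q_max, std::max(q_min, q));  // clamp to the activation range
  return static_cast<int8_t>(q);
}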
KernelCallBack BiasCorrectionStrategy::GetInt8AfterCallBack() {
auto after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
const CallBackParam &call_param) -> bool {
if (kSupportBiasCorrectionNode.find(call_param.node_type) == kSupportBiasCorrectionNode.end()) {
return true;
}
auto tensor = after_outputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeInt8) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
std::vector<float> fp32_op_output_ch_mean;
while (!OpOutputChMeanDataHandle(FETCH, call_param.node_name, &fp32_op_output_ch_mean)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
// Calculate the difference between original and quantized outputs
// DeQuant Data
std::vector<double> dequant_data;
auto ret = DeQuantData(tensor, &dequant_data);
if (ret != RET_OK) {
MS_LOG(ERROR) << "DeQuant data failed.";
return false;
}
std::vector<float> dequant_op_output_ch_mean;
// Calculate output per channel means
ret = CalculatePerChannelMeans<double>(dequant_data.data(), dequant_data.size(), tensor->shape(),
&dequant_op_output_ch_mean);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Calculate Per channel means failed.";
return false;
}
// Calculate current layer diff
std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(),
dequant_op_output_ch_mean.begin(), std::minus<>());
// Accumulate the diff of all rounds
if (op_bias_diff_sum_map_.find(call_param.node_name) != op_bias_diff_sum_map_.end()) {
auto &bias_diff = op_bias_diff_sum_map_[call_param.node_name];
std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(),
std::plus<>());
} else {
op_bias_diff_sum_map_[call_param.node_name] = dequant_op_output_ch_mean;
}
return true;
};
@ -258,51 +236,29 @@ KernelCallBack BiasCorrectionStrategy::GetInt8AfterCallBack() {
}
KernelCallBack BiasCorrectionStrategy::GetFloatAfterCallBack() {
KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
const CallBackParam &callParam) -> bool {
if (callParam.node_type == kTypeConv2D) {
if (CheckFp32TensorVec(callParam.node_name, afterOutputs) != RET_OK) {
return true;
}
auto tensor = afterOutputs[0];
MS_ASSERT(tensor != nullptr);
const auto *tensor_data = static_cast<const float *>(tensor->data());
size_t elem_count = tensor->ElementsNum();
MS_CHECK_GT(elem_count, 0, false);
auto shapes = tensor->shape();
if (shapes.size() != DIMENSION_4D) {
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
return false;
}
// suppose the activation format: NHWC
auto channels = shapes[FOURTH_INPUT];
if (channels == 0) {
MS_LOG(ERROR) << "unexpected channels: 0";
return false;
}
std::vector<float> fp32_op_output_ch_mean(channels);
auto one_filter_size = elem_count / channels;
for (int i = 0; i < channels; i++) {
float sum = 0;
for (size_t j = 0; j < one_filter_size; j++) {
auto index = j * channels + i;
if (index >= elem_count) {
MS_LOG(ERROR) << "over flow!";
return false;
}
sum += tensor_data[index];
}
if (one_filter_size == 0) {
MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
return false;
}
sum = sum / one_filter_size;
fp32_op_output_ch_mean[i] = sum;
}
while (!OpOutputChMeanDataHandle(STORE, callParam.node_name, &fp32_op_output_ch_mean)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
auto after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
const CallBackParam &call_param) -> bool {
if (kSupportBiasCorrectionNode.find(call_param.node_type) == kSupportBiasCorrectionNode.end()) {
return true;
}
auto tensor = after_outputs[0];
MS_ASSERT(tensor != nullptr);
// op can be skipped.
if (tensor->data_type() != kNumberTypeFloat32) {
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
return true;
}
std::vector<float> fp32_op_output_ch_mean;
// Calculate output per channel means
auto ret = CalculatePerChannelMeans<float>(static_cast<float *>(tensor->data()), tensor->ElementsNum(),
tensor->shape(), &fp32_op_output_ch_mean);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Calculate Per channel means failed.";
return false;
}
while (!OpOutputChMeanDataHandle(STORE, call_param.node_name, &fp32_op_output_ch_mean)) {
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
}
return true;
};
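The STORE/FETCH pairs above hand activations and per-channel means from the fp32 inference over to the int8 inference that runs concurrently (see the std::async launch of Int8Inference below). The handle implementations themselves are not shown in this diff; a plausible sketch of such a handle under that assumption, mirroring how OpInputDataHandle is used with fp32_op_input_map_ and mutex_op_input_, is:

#include <map>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

// Editorial sketch only; the real OpInputDataHandle is not part of this hunk.
// Op stands in for the class's OperationType.
enum class Op { STORE, FETCH };

bool OpInputDataHandleSketch(Op type, const std::string &op_name, std::vector<float> *data,
                             std::map<std::string, std::vector<float>> *input_map, std::mutex *mtx) {
  std::lock_guard<std::mutex> lg(*mtx);
  if (type == Op::STORE) {
    if (input_map->count(op_name) != 0) {
      return false;  // previous value not consumed yet; caller sleeps and retries
    }
    (*input_map)[op_name] = *data;
    return true;
  }
  auto it = input_map->find(op_name);  // Op::FETCH
  if (it == input_map->end()) {
    return false;  // nothing stored yet; caller sleeps and retries
  }
  *data = std::move(it->second);
  input_map->erase(it);
  return true;
}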
@ -335,18 +291,26 @@ int BiasCorrectionStrategy::Int8Inference() {
return RET_OK;
}
int BiasCorrectionStrategy::DoBiasCorrection(const FuncGraphPtr &func_graph) {
int BiasCorrectionStrategy::CreateQuantModel(const FuncGraphPtr &quant_func_graph) {
// init int8 session
MS_LOG(INFO) << "create quant session";
flags_.commonQuantParam.quant_type = schema::QuantType_QUANT_ALL;
auto int8_sm = CreateSessionByFuncGraph(func_graph, flags_, this->flags_.commonQuantParam.thread_num);
auto int8_sm = CreateSessionByFuncGraph(quant_func_graph, flags_, this->flags_.commonQuantParam.thread_num);
int8_session_ = int8_sm.session;
int8_model_ = int8_sm.model;
if (int8_session_ == nullptr || int8_model_ == nullptr) {
MS_LOG(ERROR) << "create session failed!";
return RET_ERROR;
return RET_NULL_PTR;
}
return RET_OK;
}
int BiasCorrectionStrategy::DoCPUBiasCorrection(const FuncGraphPtr &quant_func_graph) {
auto ret = CreateQuantModel(quant_func_graph);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Create quant model failed:" << ret;
return ret;
}
std::future<int> int8_inference = std::async(std::launch::async, &BiasCorrectionStrategy::Int8Inference, this);
// get input tensor
std::vector<mindspore::tensor::MSTensor *> inputs = fp32_session_->GetInputs();
@ -381,17 +345,17 @@ int BiasCorrectionStrategy::DoBiasCorrection(const FuncGraphPtr &func_graph) {
MS_LOG(ERROR) << "divisor 'calibrate_size' cannot be 0.";
return RET_ERROR;
}
for (auto &key_value : op_bias_diff_map_) {
for (auto &key_value : op_bias_diff_sum_map_) {
std::for_each(key_value.second.begin(), key_value.second.end(),
[this](float &data) { data = data / calibrator_->GetBatchNum(); });
}
auto cnodes = func_graph->GetOrderedCnodes();
auto cnodes = quant_func_graph->GetOrderedCnodes();
for (auto &cnode : cnodes) {
auto op_name = cnode->fullname_with_scope();
if (op_bias_diff_map_.find(op_name) == op_bias_diff_map_.end()) {
if (op_bias_diff_sum_map_.find(op_name) == op_bias_diff_sum_map_.end()) {
continue;
}
status = DoCNodeBiasCorrection(func_graph, cnode);
status = DoCNodeBiasCorrection(quant_func_graph, cnode);
if (status != RET_OK) {
MS_LOG(ERROR) << "do node bias correct failed.";
break;
@ -400,9 +364,9 @@ int BiasCorrectionStrategy::DoBiasCorrection(const FuncGraphPtr &func_graph) {
return status;
}
int BiasCorrectionStrategy::DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
int BiasCorrectionStrategy::DoCNodeBiasCorrection(const FuncGraphPtr &quant_func_graph, const CNodePtr &cnode) {
auto op_name = cnode->fullname_with_scope();
const auto &bias_diff = op_bias_diff_map_[op_name];
const auto &bias_diff = op_bias_diff_sum_map_[op_name];
auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
if (primitive == nullptr) {
MS_LOG(ERROR) << op_name << " primitive is nullptr";
@ -454,7 +418,7 @@ int BiasCorrectionStrategy::DoCNodeBiasCorrection(const FuncGraphPtr &func_graph
} else if (input_quant_params.size() == kHasBiasTensorSize - 1) {
MS_LOG(INFO) << op_name << " add bias input";
// need to add bias input
auto parameter = func_graph->add_parameter();
auto parameter = quant_func_graph->add_parameter();
if (parameter == nullptr) {
MS_LOG(ERROR) << "parameter is nullptr.";
return RET_NULL_PTR;
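Summing up the refactor in this file: the old DoBiasCorrection is split into CreateQuantModel plus DoCPUBiasCorrection, the per-round differences land in op_bias_diff_sum_map_, get divided by the calibration batch count, and DoCNodeBiasCorrection folds the result into (or newly creates) the bias input. A hedged sketch of that final arithmetic, with hypothetical names, is:

#include <vector>

// Editorial sketch of the per-channel correction, not verbatim from the patch:
// correction[c] = (1 / batch_num) * sum over rounds of (mean_fp32[c] - mean_dequant_int8[c])
void ApplyBiasCorrectionSketch(std::vector<float> *bias, const std::vector<float> &bias_diff_sum, float batch_num) {
  for (size_t c = 0; c < bias->size() && c < bias_diff_sum.size(); ++c) {
    (*bias)[c] += bias_diff_sum[c] / batch_num;  // average over calibration batches, then add to the bias
  }
}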


@ -49,18 +49,56 @@ class BiasCorrectionStrategy {
delete int8_model_;
}
}
int DoBiasCorrection(const FuncGraphPtr &func_graph);
int DoCPUBiasCorrection(const FuncGraphPtr &quant_func_graph);
private:
int DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode);
int CreateQuantModel(const FuncGraphPtr &quant_func_graph);
int DoCNodeBiasCorrection(const FuncGraphPtr &quant_func_graph, const CNodePtr &cnode);
int Int8Inference();
bool OpInputDataHandle(OperationType type, const string &op_name, std::vector<float> *data);
bool OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector<float> *data);
KernelCallBack GetBeforeCallBack(bool int8_op);
KernelCallBack GetFloatBeforeCallBack();
KernelCallBack GetInt8BeforeCallBack();
KernelCallBack GetAfterCallBack(bool int8_op);
KernelCallBack GetInt8AfterCallBack();
KernelCallBack GetFloatAfterCallBack();
int CheckFp32TensorVec(const std::string &node_name, const std::vector<mindspore::tensor::MSTensor *> &tensor_vec);
template <typename T>
int CalculatePerChannelMeans(const T *tensor_data, size_t elem_count, std::vector<int> shapes,
std::vector<float> *per_channel_mean) {
MS_CHECK_GT(elem_count, 0, RET_ERROR);
if (shapes.size() != DIMENSION_4D) {
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
return RET_ERROR;
}
// suppose the activation format: NHWC
auto channels = shapes[FOURTH_INPUT];
if (channels == 0) {
MS_LOG(ERROR) << "unexpected channels: 0";
return RET_ERROR;
}
per_channel_mean->resize(channels);
auto bucket_size = elem_count / channels;
for (int i = 0; i < channels; i++) {
float sum = 0;
for (size_t j = 0; j < bucket_size; j++) {
auto index = j * channels + i;
if (index >= elem_count) {
MS_LOG(ERROR) << "over flow!";
return RET_ERROR;
}
sum += tensor_data[index];
}
MS_CHECK_GT(bucket_size, 0, RET_ERROR);
sum = sum / bucket_size;
per_channel_mean->at(i) = sum;
}
return RET_OK;
}
private:
converter::Flags flags_;
@ -75,7 +113,7 @@ class BiasCorrectionStrategy {
std::map<std::string, std::vector<float>> fp32_op_input_map_; // concurrency
std::map<std::string, std::vector<float>> fp32_op_output_ch_mean_map_; // concurrency
std::map<std::string, std::vector<float>> op_bias_diff_map_; // only use by int8 model
std::map<std::string, std::vector<float>> op_bias_diff_sum_map_;  // accumulated per-channel diff between fp32 and dequantized int8 outputs
std::mutex mutex_op_input_;
std::mutex mutex_op_output_;
};
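The new CalculatePerChannelMeans template assumes a 4-D NHWC buffer and averages the N*H*W values belonging to each channel. A hypothetical call from inside the class, just to illustrate the expected layout and result:

// Illustration only (would be called from inside BiasCorrectionStrategy).
// NHWC shape {1, 2, 2, 3}: channel 0 holds {1, 2, 3, 4}, channel 1 holds
// {10, 20, 30, 40}, channel 2 holds {100, 200, 300, 400}.
std::vector<int> shape = {1, 2, 2, 3};
std::vector<float> nhwc = {1, 10, 100, 2, 20, 200, 3, 30, 300, 4, 40, 400};
std::vector<float> means;
auto ret = CalculatePerChannelMeans<float>(nhwc.data(), nhwc.size(), shape, &means);
// expected: ret == RET_OK and means == {2.5, 25, 250}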


@ -607,7 +607,7 @@ int FullQuantQuantizer::DoQuantize(FuncGraphPtr func_graph) {
MS_LOG(INFO) << "do bias correction";
BiasCorrectionStrategy strategy(flags_, calibrator_, fp32_session_, fp32_model_, activation_q_min_,
activation_q_max_);
status = strategy.DoBiasCorrection(func_graph);
status = strategy.DoCPUBiasCorrection(func_graph);
if (status != RET_OK) {
MS_LOG(ERROR) << "bias_correction failed.";
return status;


@ -619,4 +619,65 @@ int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive
abstractTensor->element()->set_type(TypeIdToType(kNumberTypeInt32));
return RET_OK;
}
int DeQuantData(const int8_t *tensor_data, int64_t elements_num, std::vector<lite::LiteQuantParam> quant_params,
std::vector<double> *dequant_data, int preferred_dim) {
if (quant_params.size() != 1) {
MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size() << " only support per-layer now.";
return RET_ERROR;
}
auto scale = quant_params[0].scale;
auto zp = quant_params[0].zeroPoint;
dequant_data->resize(elements_num);
for (int64_t i = 0; i < elements_num; i++) {
dequant_data->at(i) = scale * (tensor_data[i] - zp);
}
return RET_OK;
}
int DeQuantData(mindspore::tensor::MSTensor *tensor, std::vector<double> *dequant_data, int preferred_dim) {
return DeQuantData(static_cast<int8_t *>(tensor->data()), tensor->ElementsNum(), tensor->quant_params(), dequant_data,
preferred_dim);
}
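The new DeQuantData helpers apply plain per-layer affine dequantization, value = scale * (q - zeroPoint), which is exactly what GetInt8AfterCallBack needs before taking channel means. A hypothetical call on an int8 output tensor (output_tensor is an assumed MSTensor pointer, not from this diff):

// Illustration only.
std::vector<double> dequant_data;
if (quant::DeQuantData(output_tensor, &dequant_data) != RET_OK) {
  MS_LOG(ERROR) << "DeQuant data failed.";
}
// dequant_data[i] == scale * (int8_data[i] - zero_point) for the tensor's single per-layer param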
int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input) {
if (bit_num > 0 && bit_num < k8Bit) {
std::vector<int8_t> origin_data(tensor_input->data.size());
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int8_t), tensor_input->data.data(),
tensor_input->data.size() * sizeof(uint8_t));
if (status != EOK) {
MS_LOG(ERROR) << tensor_input->name << " memcpy failed. " << status;
return RET_ERROR;
}
std::vector<uint8_t> pack_data{};
BitPack::BitPacking<int8_t, uint8_t>(bit_num, origin_data, &pack_data);
tensor_input->data.resize(pack_data.size() * sizeof(uint8_t));
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
pack_data.size() * sizeof(uint8_t));
if (status != EOK) {
MS_LOG(ERROR) << "memcpy_s failed. " << status;
return RET_ERROR;
}
} else if (bit_num > k8Bit && bit_num < k16Bit) {
auto shape_size =
std::accumulate(tensor_input->dims.begin(), tensor_input->dims.end(), size_t(1), std::multiplies<size_t>());
std::vector<int16_t> origin_data(shape_size);
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int16_t), tensor_input->data.data(),
tensor_input->data.size() * sizeof(uint8_t));
if (status != EOK) {
MS_LOG(ERROR) << "memcpy failed. " << status;
return RET_ERROR;
}
std::vector<uint16_t> pack_data{};
BitPack::BitPacking<int16_t, uint16_t>(bit_num, origin_data, &pack_data);
tensor_input->data.resize(pack_data.size() * sizeof(uint16_t));
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
pack_data.size() * sizeof(uint16_t));
if (status != EOK) {
MS_LOG(ERROR) << "memcpy_s failed. " << status;
return RET_ERROR;
}
}
return RET_OK;
}
} // namespace mindspore::lite::quant
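DoBitPack only kicks in for bit widths strictly between 0 and 8, or between 8 and 16, relying on BitPack::BitPacking for the actual bit layout. A rough sketch of the space saving it targets (editorial arithmetic, not the library's internals):

#include <cstddef>

// Bytes needed to store value_count quantized values at bit_num bits each.
size_t PackedBytes(size_t value_count, size_t bit_num) {
  return (value_count * bit_num + 7) / 8;  // ceil(count * bits / 8)
}
// e.g. 1024 weights at 7 bits: PackedBytes(1024, 7) = 896 bytes instead of 1024 (int8 storage);
// 1024 weights at 12 bits: 1536 bytes instead of 2048 (int16 storage).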


@ -98,6 +98,13 @@ std::vector<int> ConvertShapeVectorToInt32(const ShapeVector &dims);
int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive);
int DeQuantData(mindspore::tensor::MSTensor *tensor, std::vector<double> *dequant_data, int preferred_dim = 0);
int DeQuantData(const int8_t *tensor_data, int64_t elements_num, std::vector<lite::LiteQuantParam> quant_params,
std::vector<double> *dequant_data, int preferred_dim);
int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input);
template <typename T>
int FixedBitQuantFilter(const AnfNodePtr &parameter, const tensor::TensorPtr &weight, const PrimitivePtr &primitive,
QuantType quant_type, int quant_max, int quant_min, size_t bit_num,