forked from mindspore-Ecosystem/mindspore
!28283 fix full quant support low bit bug & optimize bias correction
Merge pull request !28283 from yeyunpeng2020/quant
This commit is contained in:
commit
195868c646
|
@ -465,7 +465,7 @@ int MatmulBaseInt8CPUKernel::RunArm64Sdot() {
|
|||
batch_input_ptr_ = a_ptr + i * param_->row_ * param_->deep_;
|
||||
auto ret = ParallelLaunch(this->ms_context_, Arm64SdotPreRun, this, op_parameter_->thread_num_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunArm64Sdot error: [" << ret << "]";
|
||||
MS_LOG(ERROR) << "Arm64SdotPreRun error: [" << ret << "]";
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -476,7 +476,7 @@ int MatmulBaseInt8CPUKernel::RunArm64Sdot() {
|
|||
|
||||
ret = ParallelLaunch(this->ms_context_, Arm64SdotRun, this, thread_count_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "RunArm64Sdot error: [" << ret << "]";
|
||||
MS_LOG(ERROR) << "Arm64SdotRun error: [" << ret << "]";
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -146,9 +146,9 @@ static STATUS CompressTensor(schema::TensorT *tensor_input, const std::unique_pt
|
|||
repetition_packed = quant::PackRepetition<int16_t>(bit_num, tensor_input);
|
||||
}
|
||||
}
|
||||
if (bit_num != kBitNum8 && bit_num != kBitNum16 && !repetition_packed &&
|
||||
if (!tensor_input->data.empty() && bit_num != kBitNum8 && bit_num != kBitNum16 && !repetition_packed &&
|
||||
dst_node->quantType != schema::QuantType_QUANT_NONE) {
|
||||
auto status = DoBitPack(bit_num, tensor_input);
|
||||
auto status = quant::DoBitPack(bit_num, tensor_input);
|
||||
if (status != RET_OK) {
|
||||
MS_LOG(ERROR) << "do bit pack failed. " << status;
|
||||
return RET_ERROR;
|
||||
|
|
|
@ -33,7 +33,6 @@
|
|||
namespace mindspore {
|
||||
namespace lite {
|
||||
namespace {
|
||||
enum QuantBitNum { QuantBitNum_INT8 = 8, QuantBitNum_INT16 = 16 };
|
||||
const int kZeroPointGap = 128;
|
||||
} // namespace
|
||||
int SetFuncGraphOutput(const FuncGraphPtr &graph, const std::vector<AnfNodePtr> &outputs) {
|
||||
|
@ -114,47 +113,6 @@ STATUS ReplaceTensorOfNode(schema::MetaGraphT *graphT, uint32_t nodeIdx, uint32_
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int DoBitPack(const int &bit_num, schema::TensorT *tensor_input) {
|
||||
if (bit_num > 0 && bit_num < 8) {
|
||||
std::vector<int8_t> origin_data(tensor_input->data.size());
|
||||
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int8_t), tensor_input->data.data(),
|
||||
tensor_input->data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<uint8_t> pack_data{};
|
||||
BitPack::BitPacking<int8_t, uint8_t>(bit_num, origin_data, &pack_data);
|
||||
tensor_input->data.resize(pack_data.size() * sizeof(uint8_t));
|
||||
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
|
||||
pack_data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy_s failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else if (bit_num > QuantBitNum_INT8 && bit_num < QuantBitNum_INT16) {
|
||||
auto shape_size =
|
||||
std::accumulate(tensor_input->dims.begin(), tensor_input->dims.end(), size_t(1), std::multiplies<size_t>());
|
||||
std::vector<int16_t> origin_data(shape_size);
|
||||
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int16_t), tensor_input->data.data(),
|
||||
tensor_input->data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<uint16_t> pack_data{};
|
||||
BitPack::BitPacking<int16_t, uint16_t>(bit_num, origin_data, &pack_data);
|
||||
tensor_input->data.resize(pack_data.size() * sizeof(uint16_t));
|
||||
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
|
||||
pack_data.size() * sizeof(uint16_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy_s failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
NodeIter InsertNode(schema::MetaGraphT *graphT, uint32_t existNodeIdx, InsertPlace place, size_t inoutIndex,
|
||||
std::unique_ptr<CNodeT> toAddNode, STATUS *errorCode, int *insert_num,
|
||||
const OpDefCopyer &opDefCopyer) {
|
||||
|
|
|
@ -56,8 +56,6 @@ STATUS AddTensor2Node(schema::MetaGraphT *graphT, uint32_t nodeIdx, std::unique_
|
|||
STATUS ReplaceTensorOfNode(schema::MetaGraphT *graphT, uint32_t nodeIdx, uint32_t inTensorIdx,
|
||||
std::unique_ptr<schema::TensorT> tensor);
|
||||
|
||||
int DoBitPack(const int &bit_num, schema::TensorT *tensor_input);
|
||||
|
||||
NodeIter InsertNode(schema::MetaGraphT *graphT, uint32_t existNodeIdx, InsertPlace place, size_t inoutIndex,
|
||||
std::unique_ptr<schema::CNodeT> toAddNode, STATUS *errorCode, int *insert_num,
|
||||
const OpDefCopyer &opDefCopyer = GetSimpleOpCopyer());
|
||||
|
|
|
@ -199,7 +199,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
|
|||
return RET_ERROR;
|
||||
}
|
||||
int bit_num = tensor->quantParams.front()->numBits;
|
||||
if (DoBitPack(bit_num, tensor.get()) != RET_OK) {
|
||||
if (quant::DoBitPack(bit_num, tensor.get()) != RET_OK) {
|
||||
MS_LOG(ERROR) << "bit pack failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -212,7 +212,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
|
|||
quantParam->dstDtype == TypeId::kNumberTypeFloat32 || quantParam->dstDtype == TypeId::kNumberTypeFloat) {
|
||||
status = ComputeDataToInt8(tensor);
|
||||
int bit_num = tensor->quantParams.front()->numBits;
|
||||
if (DoBitPack(bit_num, tensor.get()) != RET_OK) {
|
||||
if (quant::DoBitPack(bit_num, tensor.get()) != RET_OK) {
|
||||
MS_LOG(ERROR) << "bit pack failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
|
|
@ -33,24 +33,9 @@
|
|||
namespace mindspore::lite::quant {
|
||||
namespace {
|
||||
constexpr int kHasBiasTensorSize = 3;
|
||||
const char *kTypeConv2D = schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion);
|
||||
const std::set<std::string> kSupportBiasCorrectionNode = {
|
||||
schema::EnumNamePrimitiveType(schema::PrimitiveType_Conv2DFusion)};
|
||||
} // namespace
|
||||
int BiasCorrectionStrategy::CheckFp32TensorVec(const std::string &node_name,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &tensor_vec) {
|
||||
if (tensor_vec.empty()) {
|
||||
MS_LOG(ERROR) << "node: " << node_name << " input tensors is 0";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto *tensor = tensor_vec[0];
|
||||
CHECK_NULL_RETURN(tensor);
|
||||
if (tensor->data_type() != kNumberTypeFloat32) {
|
||||
MS_LOG(INFO) << "node: " << node_name << " will not quantize"
|
||||
<< " tensor data_type: " << tensor->data_type();
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
bool BiasCorrectionStrategy::OpInputDataHandle(OperationType type, const string &op_name, std::vector<float> *data) {
|
||||
MS_ASSERT(data != nullptr);
|
||||
std::lock_guard<std::mutex> lg(mutex_op_input_);
|
||||
|
@ -103,76 +88,9 @@ bool BiasCorrectionStrategy::OpOutputChMeanDataHandle(OperationType type, const
|
|||
KernelCallBack BiasCorrectionStrategy::GetBeforeCallBack(bool int8_op) {
|
||||
KernelCallBack before_call_back;
|
||||
if (!int8_op) {
|
||||
before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
|
||||
const CallBackParam &callParam) -> bool {
|
||||
if (callParam.node_type == kTypeConv2D) {
|
||||
if (CheckFp32TensorVec(callParam.node_name, before_inputs) != RET_OK) {
|
||||
return true;
|
||||
}
|
||||
auto tensor = before_inputs[0];
|
||||
MS_ASSERT(tensor != nullptr);
|
||||
size_t elem_count = tensor->ElementsNum();
|
||||
MS_CHECK_GT(elem_count, 0, false);
|
||||
std::vector<float> fp32_op_input(elem_count);
|
||||
auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size());
|
||||
if (ret != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy error: " << ret;
|
||||
return false;
|
||||
}
|
||||
while (!OpInputDataHandle(STORE, callParam.node_name, &fp32_op_input)) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
} else {
|
||||
before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
|
||||
const CallBackParam &callParam) -> bool {
|
||||
if (callParam.node_type == kTypeConv2D) {
|
||||
std::vector<float> fp32_op_input;
|
||||
while (!OpInputDataHandle(FETCH, callParam.node_name, &fp32_op_input)) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
|
||||
}
|
||||
auto tensor = before_inputs[0];
|
||||
MS_ASSERT(tensor != nullptr);
|
||||
// op can be skipped.
|
||||
if (tensor->data_type() != kNumberTypeInt8) {
|
||||
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
|
||||
return true;
|
||||
}
|
||||
// do quantization: activation is always per layer quantized
|
||||
std::vector<int8_t> quant_datas;
|
||||
auto quant_params = tensor->quant_params();
|
||||
if (quant_params.size() != 1) {
|
||||
MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size();
|
||||
return false;
|
||||
}
|
||||
schema::QuantParamT quant_param_t;
|
||||
quant_param_t.scale = quant_params[0].scale;
|
||||
quant_param_t.zeroPoint = quant_params[0].zeroPoint;
|
||||
for (auto float_data : fp32_op_input) {
|
||||
auto quant_data = QuantizeData<int8_t>(float_data, &quant_param_t, activation_q_max_, activation_q_min_);
|
||||
quant_datas.push_back(quant_data);
|
||||
}
|
||||
|
||||
if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) {
|
||||
MS_LOG(ERROR) << "unexpected tensor size: " << quant_datas.size()
|
||||
<< " not the same with: " << quant_datas.size() * sizeof(int8_t);
|
||||
return false;
|
||||
}
|
||||
|
||||
auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t));
|
||||
if (ret != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy error: " << ret;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
return GetFloatBeforeCallBack();
|
||||
}
|
||||
return before_call_back;
|
||||
return GetInt8BeforeCallBack();
|
||||
}
|
||||
|
||||
KernelCallBack BiasCorrectionStrategy::GetAfterCallBack(bool int8_op) {
|
||||
|
@ -183,74 +101,134 @@ KernelCallBack BiasCorrectionStrategy::GetAfterCallBack(bool int8_op) {
|
|||
return GetInt8AfterCallBack();
|
||||
}
|
||||
|
||||
KernelCallBack BiasCorrectionStrategy::GetInt8AfterCallBack() {
|
||||
KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
|
||||
const CallBackParam &callParam) -> bool {
|
||||
if (callParam.node_type == kTypeConv2D) {
|
||||
std::vector<float> fp32_op_output_ch_mean;
|
||||
while (!OpOutputChMeanDataHandle(FETCH, callParam.node_name, &fp32_op_output_ch_mean)) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
|
||||
}
|
||||
auto tensor = afterOutputs[0];
|
||||
MS_ASSERT(tensor != nullptr);
|
||||
// op can be skipped.
|
||||
if (tensor->data_type() != kNumberTypeInt8) {
|
||||
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
|
||||
return true;
|
||||
}
|
||||
const int8_t *tensor_data = static_cast<int8_t *>(tensor->data());
|
||||
size_t elem_count = tensor->ElementsNum();
|
||||
MS_CHECK_GT(elem_count, 0, false);
|
||||
auto shapes = tensor->shape();
|
||||
if (shapes.size() != DIMENSION_4D) {
|
||||
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
|
||||
return false;
|
||||
}
|
||||
// suppose the the format is NHWC
|
||||
auto channels = shapes[FOURTH_INPUT];
|
||||
if (channels == 0) {
|
||||
MS_LOG(ERROR) << "unexpected channels: 0";
|
||||
return false;
|
||||
}
|
||||
auto quant_params = tensor->quant_params();
|
||||
if (quant_params.size() != 1) {
|
||||
MS_LOG(ERROR) << "unexpected activatation quant_params size: " << quant_params.size();
|
||||
return false;
|
||||
}
|
||||
auto scale = quant_params[0].scale;
|
||||
auto zp = quant_params[0].zeroPoint;
|
||||
std::vector<float> dequant_op_output_ch_mean(channels);
|
||||
auto one_filter_size = elem_count / channels;
|
||||
for (int i = 0; i < channels; i++) {
|
||||
float sum = 0;
|
||||
for (size_t j = 0; j < one_filter_size; j++) {
|
||||
auto index = j * channels + i;
|
||||
if (index >= elem_count) {
|
||||
MS_LOG(ERROR) << "over flow!";
|
||||
return false;
|
||||
}
|
||||
// deuqant activation
|
||||
auto float_data = scale * (tensor_data[index] - zp);
|
||||
sum += float_data;
|
||||
}
|
||||
if (one_filter_size == 0) {
|
||||
MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
|
||||
return false;
|
||||
}
|
||||
sum = sum / one_filter_size;
|
||||
dequant_op_output_ch_mean[i] = sum;
|
||||
}
|
||||
std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(),
|
||||
dequant_op_output_ch_mean.begin(), std::minus<>());
|
||||
// Builds the callback invoked before each kernel of the fp32 model.
//
// For node types listed in kSupportBiasCorrectionNode whose first input is fp32, the callback copies that input
// into a float vector and publishes it under the node name through OpInputDataHandle(STORE, ...), retrying every
// kMillisecondsBase milliseconds until the store is accepted. All other nodes are passed through untouched.
//
// The lambda returns true when the node was skipped or stored successfully and false on failure
// (NOTE(review): presumably a false return aborts the running inference - confirm against the session code).
KernelCallBack BiasCorrectionStrategy::GetFloatBeforeCallBack() {
  auto before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
                                 const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
                                 const CallBackParam &call_param) -> bool {
    if (kSupportBiasCorrectionNode.find(call_param.node_type) == kSupportBiasCorrectionNode.end()) {
      return true;
    }
    // Validate at runtime instead of relying on MS_ASSERT only: indexing an empty vector or dereferencing a
    // null tensor is undefined behaviour.
    if (before_inputs.empty() || before_inputs[0] == nullptr) {
      MS_LOG(ERROR) << "node: " << call_param.node_name << " has no valid input tensor.";
      return false;
    }
    auto tensor = before_inputs[0];
    // op can be skipped.
    if (tensor->data_type() != kNumberTypeFloat32) {
      MS_LOG(INFO) << "tensor type is " << tensor->data_type();
      return true;
    }
    size_t elem_count = tensor->ElementsNum();
    MS_CHECK_GT(elem_count, 0, false);
    // Snapshot the fp32 activation so the int8 model can later be fed exactly the same data.
    std::vector<float> fp32_op_input(elem_count);
    auto ret = memcpy_s(fp32_op_input.data(), fp32_op_input.size() * sizeof(float), tensor->data(), tensor->Size());
    if (ret != EOK) {
      MS_LOG(ERROR) << "memcpy error: " << ret;
      return false;
    }
    // Poll until the shared slot for this node accepts the data.
    while (!OpInputDataHandle(STORE, call_param.node_name, &fp32_op_input)) {
      std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
    }
    return true;
  };
  return before_call_back;
}
|
||||
|
||||
if (op_bias_diff_map_.find(callParam.node_name) != op_bias_diff_map_.end()) {
|
||||
auto &bias_diff = op_bias_diff_map_[callParam.node_name];
|
||||
std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(),
|
||||
std::plus<>());
|
||||
} else {
|
||||
op_bias_diff_map_[callParam.node_name] = dequant_op_output_ch_mean;
|
||||
}
|
||||
// Builds the callback invoked before each kernel of the int8 (quantized) model.
//
// For node types listed in kSupportBiasCorrectionNode whose first input is int8, the callback fetches the fp32
// input recorded for the same node name (OpInputDataHandle(FETCH, ...), polling every kMillisecondsBase
// milliseconds), quantizes it with the tensor's single per-layer quant param and overwrites the tensor data
// with the result. All other nodes are passed through untouched.
//
// The lambda returns true when the node was skipped or rewritten successfully and false on failure.
KernelCallBack BiasCorrectionStrategy::GetInt8BeforeCallBack() {
  auto before_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
                                 const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
                                 const CallBackParam &call_param) -> bool {
    if (kSupportBiasCorrectionNode.find(call_param.node_type) == kSupportBiasCorrectionNode.end()) {
      return true;
    }
    // Validate at runtime instead of relying on MS_ASSERT only: indexing an empty vector or dereferencing a
    // null tensor is undefined behaviour.
    if (before_inputs.empty() || before_inputs[0] == nullptr) {
      MS_LOG(ERROR) << "node: " << call_param.node_name << " has no valid input tensor.";
      return false;
    }
    auto tensor = before_inputs[0];
    // op can be skipped.
    if (tensor->data_type() != kNumberTypeInt8) {
      MS_LOG(INFO) << "tensor type is " << tensor->data_type();
      return true;
    }
    // Get origin data
    std::vector<float> fp32_op_input;
    while (!OpInputDataHandle(FETCH, call_param.node_name, &fp32_op_input)) {
      std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
    }
    // do quantization: activation is always per layer quantized
    auto quant_params = tensor->quant_params();
    if (quant_params.size() != 1) {
      MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size();
      return false;
    }
    schema::QuantParamT quant_param_t;
    quant_param_t.scale = quant_params[0].scale;
    quant_param_t.zeroPoint = quant_params[0].zeroPoint;
    std::vector<int8_t> quant_datas;
    quant_datas.reserve(fp32_op_input.size());  // final size is known up front, avoid repeated reallocation
    for (auto float_data : fp32_op_input) {
      auto quant_data = QuantizeData<int8_t>(float_data, &quant_param_t, activation_q_max_, activation_q_min_);
      quant_datas.push_back(quant_data);
    }

    // The quantized copy must cover the tensor buffer exactly. Report the real tensor size in the log; the
    // previous message printed the quantized size on both sides of the comparison.
    if (tensor->Size() != quant_datas.size() * sizeof(int8_t)) {
      MS_LOG(ERROR) << "unexpected tensor size: " << tensor->Size()
                    << " not the same with: " << quant_datas.size() * sizeof(int8_t);
      return false;
    }

    auto ret = memcpy_s(tensor->data(), tensor->Size(), quant_datas.data(), quant_datas.size() * sizeof(int8_t));
    if (ret != EOK) {
      MS_LOG(ERROR) << "memcpy error: " << ret;
      return false;
    }
    return true;
  };

  return before_call_back;
}
|
||||
|
||||
KernelCallBack BiasCorrectionStrategy::GetInt8AfterCallBack() {
|
||||
auto after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
|
||||
const CallBackParam &call_param) -> bool {
|
||||
if (kSupportBiasCorrectionNode.find(call_param.node_type) == kSupportBiasCorrectionNode.end()) {
|
||||
return true;
|
||||
}
|
||||
auto tensor = after_outputs[0];
|
||||
MS_ASSERT(tensor != nullptr);
|
||||
// op can be skipped.
|
||||
if (tensor->data_type() != kNumberTypeInt8) {
|
||||
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
|
||||
return true;
|
||||
}
|
||||
std::vector<float> fp32_op_output_ch_mean;
|
||||
while (!OpOutputChMeanDataHandle(FETCH, call_param.node_name, &fp32_op_output_ch_mean)) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
|
||||
}
|
||||
|
||||
// Calculate the difference between original and quantified
|
||||
// DeQuant Data
|
||||
std::vector<double> dequant_data;
|
||||
auto ret = DeQuantData(tensor, &dequant_data);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "DeQuant data failed.";
|
||||
return false;
|
||||
}
|
||||
std::vector<float> dequant_op_output_ch_mean;
|
||||
// Calculate output per channel means
|
||||
ret = CalculatePerChannelMeans<double>(dequant_data.data(), dequant_data.size(), tensor->shape(),
|
||||
&dequant_op_output_ch_mean);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Calculate Per channel means failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Calculate current layer diff
|
||||
std::transform(fp32_op_output_ch_mean.begin(), fp32_op_output_ch_mean.end(), dequant_op_output_ch_mean.begin(),
|
||||
dequant_op_output_ch_mean.begin(), std::minus<>());
|
||||
|
||||
// Accumulate the diff of all rounds
|
||||
if (op_bias_diff_sum_map_.find(call_param.node_name) != op_bias_diff_sum_map_.end()) {
|
||||
auto &bias_diff = op_bias_diff_sum_map_[call_param.node_name];
|
||||
std::transform(bias_diff.begin(), bias_diff.end(), dequant_op_output_ch_mean.begin(), bias_diff.begin(),
|
||||
std::plus<>());
|
||||
} else {
|
||||
op_bias_diff_sum_map_[call_param.node_name] = dequant_op_output_ch_mean;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
@ -258,51 +236,29 @@ KernelCallBack BiasCorrectionStrategy::GetInt8AfterCallBack() {
|
|||
}
|
||||
|
||||
KernelCallBack BiasCorrectionStrategy::GetFloatAfterCallBack() {
|
||||
KernelCallBack after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &afterInputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &afterOutputs,
|
||||
const CallBackParam &callParam) -> bool {
|
||||
if (callParam.node_type == kTypeConv2D) {
|
||||
if (CheckFp32TensorVec(callParam.node_name, afterOutputs) != RET_OK) {
|
||||
return true;
|
||||
}
|
||||
auto tensor = afterOutputs[0];
|
||||
MS_ASSERT(tensor != nullptr);
|
||||
const auto *tensor_data = static_cast<const float *>(tensor->data());
|
||||
size_t elem_count = tensor->ElementsNum();
|
||||
MS_CHECK_GT(elem_count, 0, false);
|
||||
auto shapes = tensor->shape();
|
||||
if (shapes.size() != DIMENSION_4D) {
|
||||
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
|
||||
return false;
|
||||
}
|
||||
// suppose the activation format: NHWC
|
||||
auto channels = shapes[FOURTH_INPUT];
|
||||
if (channels == 0) {
|
||||
MS_LOG(ERROR) << "unexpected channels: 0";
|
||||
return false;
|
||||
}
|
||||
std::vector<float> fp32_op_output_ch_mean(channels);
|
||||
auto one_filter_size = elem_count / channels;
|
||||
for (int i = 0; i < channels; i++) {
|
||||
float sum = 0;
|
||||
for (size_t j = 0; j < one_filter_size; j++) {
|
||||
auto index = j * channels + i;
|
||||
if (index >= elem_count) {
|
||||
MS_LOG(ERROR) << "over flow!";
|
||||
return false;
|
||||
}
|
||||
sum += tensor_data[index];
|
||||
}
|
||||
if (one_filter_size == 0) {
|
||||
MS_LOG(ERROR) << "divisor 'one_filter_size' cannot be 0.";
|
||||
return false;
|
||||
}
|
||||
sum = sum / one_filter_size;
|
||||
fp32_op_output_ch_mean[i] = sum;
|
||||
}
|
||||
while (!OpOutputChMeanDataHandle(STORE, callParam.node_name, &fp32_op_output_ch_mean)) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
|
||||
}
|
||||
auto after_call_back = [this](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
|
||||
const CallBackParam &call_param) -> bool {
|
||||
if (kSupportBiasCorrectionNode.find(call_param.node_type) == kSupportBiasCorrectionNode.end()) {
|
||||
return true;
|
||||
}
|
||||
auto tensor = after_outputs[0];
|
||||
MS_ASSERT(tensor != nullptr);
|
||||
// op can be skipped.
|
||||
if (tensor->data_type() != kNumberTypeFloat32) {
|
||||
MS_LOG(INFO) << "tensor type is " << tensor->data_type();
|
||||
return true;
|
||||
}
|
||||
std::vector<float> fp32_op_output_ch_mean;
|
||||
// Calculate output per channel means
|
||||
auto ret = CalculatePerChannelMeans<float>(static_cast<float *>(tensor->data()), tensor->ElementsNum(),
|
||||
tensor->shape(), &fp32_op_output_ch_mean);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Calculate Per channel means failed.";
|
||||
return false;
|
||||
}
|
||||
while (!OpOutputChMeanDataHandle(STORE, call_param.node_name, &fp32_op_output_ch_mean)) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(kMillisecondsBase));
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
@ -335,18 +291,26 @@ int BiasCorrectionStrategy::Int8Inference() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int BiasCorrectionStrategy::DoBiasCorrection(const FuncGraphPtr &func_graph) {
|
||||
int BiasCorrectionStrategy::CreateQuantModel(const FuncGraphPtr &quant_func_graph) {
|
||||
// init in8 session
|
||||
MS_LOG(INFO) << "create quant session";
|
||||
flags_.commonQuantParam.quant_type = schema::QuantType_QUANT_ALL;
|
||||
auto int8_sm = CreateSessionByFuncGraph(func_graph, flags_, this->flags_.commonQuantParam.thread_num);
|
||||
auto int8_sm = CreateSessionByFuncGraph(quant_func_graph, flags_, this->flags_.commonQuantParam.thread_num);
|
||||
int8_session_ = int8_sm.session;
|
||||
int8_model_ = int8_sm.model;
|
||||
if (int8_session_ == nullptr || int8_model_ == nullptr) {
|
||||
MS_LOG(ERROR) << "create session failed!";
|
||||
return RET_ERROR;
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BiasCorrectionStrategy::DoCPUBiasCorrection(const FuncGraphPtr &quant_func_graph) {
|
||||
auto ret = CreateQuantModel(quant_func_graph);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Create quant model failed:" << ret;
|
||||
return ret;
|
||||
}
|
||||
std::future<int> int8_inference = std::async(std::launch::async, &BiasCorrectionStrategy::Int8Inference, this);
|
||||
// get input tensor
|
||||
std::vector<mindspore::tensor::MSTensor *> inputs = fp32_session_->GetInputs();
|
||||
|
@ -381,17 +345,17 @@ int BiasCorrectionStrategy::DoBiasCorrection(const FuncGraphPtr &func_graph) {
|
|||
MS_LOG(ERROR) << "divisor 'calibrate_size' cannot be 0.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
for (auto &key_value : op_bias_diff_map_) {
|
||||
for (auto &key_value : op_bias_diff_sum_map_) {
|
||||
std::for_each(key_value.second.begin(), key_value.second.end(),
|
||||
[this](float &data) { data = data / calibrator_->GetBatchNum(); });
|
||||
}
|
||||
auto cnodes = func_graph->GetOrderedCnodes();
|
||||
auto cnodes = quant_func_graph->GetOrderedCnodes();
|
||||
for (auto &cnode : cnodes) {
|
||||
auto op_name = cnode->fullname_with_scope();
|
||||
if (op_bias_diff_map_.find(op_name) == op_bias_diff_map_.end()) {
|
||||
if (op_bias_diff_sum_map_.find(op_name) == op_bias_diff_sum_map_.end()) {
|
||||
continue;
|
||||
}
|
||||
status = DoCNodeBiasCorrection(func_graph, cnode);
|
||||
status = DoCNodeBiasCorrection(quant_func_graph, cnode);
|
||||
if (status != RET_OK) {
|
||||
MS_LOG(ERROR) << "do node bias correct failed.";
|
||||
break;
|
||||
|
@ -400,9 +364,9 @@ int BiasCorrectionStrategy::DoBiasCorrection(const FuncGraphPtr &func_graph) {
|
|||
return status;
|
||||
}
|
||||
|
||||
int BiasCorrectionStrategy::DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
|
||||
int BiasCorrectionStrategy::DoCNodeBiasCorrection(const FuncGraphPtr &quant_func_graph, const CNodePtr &cnode) {
|
||||
auto op_name = cnode->fullname_with_scope();
|
||||
const auto &bias_diff = op_bias_diff_map_[op_name];
|
||||
const auto &bias_diff = op_bias_diff_sum_map_[op_name];
|
||||
auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
|
||||
if (primitive == nullptr) {
|
||||
MS_LOG(ERROR) << op_name << " primitive is nullptr";
|
||||
|
@ -454,7 +418,7 @@ int BiasCorrectionStrategy::DoCNodeBiasCorrection(const FuncGraphPtr &func_graph
|
|||
} else if (input_quant_params.size() == kHasBiasTensorSize - 1) {
|
||||
MS_LOG(INFO) << op_name << " add bias input";
|
||||
// need to add bias input
|
||||
auto parameter = func_graph->add_parameter();
|
||||
auto parameter = quant_func_graph->add_parameter();
|
||||
if (parameter == nullptr) {
|
||||
MS_LOG(ERROR) << "parameter is nullptr.";
|
||||
return RET_NULL_PTR;
|
||||
|
|
|
@ -49,18 +49,56 @@ class BiasCorrectionStrategy {
|
|||
delete int8_model_;
|
||||
}
|
||||
}
|
||||
int DoBiasCorrection(const FuncGraphPtr &func_graph);
|
||||
int DoCPUBiasCorrection(const FuncGraphPtr &quant_func_graph);
|
||||
|
||||
private:
|
||||
int DoCNodeBiasCorrection(const FuncGraphPtr &func_graph, const CNodePtr &cnode);
|
||||
int CreateQuantModel(const FuncGraphPtr &quant_func_graph);
|
||||
int DoCNodeBiasCorrection(const FuncGraphPtr &quant_func_graph, const CNodePtr &cnode);
|
||||
int Int8Inference();
|
||||
bool OpInputDataHandle(OperationType type, const string &op_name, std::vector<float> *data);
|
||||
bool OpOutputChMeanDataHandle(OperationType type, const string &op_name, std::vector<float> *data);
|
||||
KernelCallBack GetBeforeCallBack(bool int8_op);
|
||||
KernelCallBack GetFloatBeforeCallBack();
|
||||
KernelCallBack GetInt8BeforeCallBack();
|
||||
KernelCallBack GetAfterCallBack(bool int8_op);
|
||||
KernelCallBack GetInt8AfterCallBack();
|
||||
KernelCallBack GetFloatAfterCallBack();
|
||||
int CheckFp32TensorVec(const std::string &node_name, const std::vector<mindspore::tensor::MSTensor *> &tensor_vec);
|
||||
|
||||
template <typename T>
|
||||
int CalculatePerChannelMeans(const T *tensor_data, size_t elem_count, std::vector<int> shapes,
|
||||
std::vector<float> *per_channel_mean) {
|
||||
// const auto *tensor_data = static_cast<const float *>(tensor->data());
|
||||
// size_t elem_count = tensor->ElementsNum();
|
||||
MS_CHECK_GT(elem_count, 0, false);
|
||||
// auto shapes = tensor->shape();
|
||||
if (shapes.size() != DIMENSION_4D) {
|
||||
MS_LOG(ERROR) << "unexpected shape size: " << shapes.size();
|
||||
return RET_ERROR;
|
||||
}
|
||||
// suppose the activation format: NHWC
|
||||
auto channels = shapes[FOURTH_INPUT];
|
||||
if (channels == 0) {
|
||||
MS_LOG(ERROR) << "unexpected channels: 0";
|
||||
return RET_ERROR;
|
||||
}
|
||||
per_channel_mean->resize(channels);
|
||||
auto bucket_size = elem_count / channels;
|
||||
for (int i = 0; i < channels; i++) {
|
||||
float sum = 0;
|
||||
for (size_t j = 0; j < bucket_size; j++) {
|
||||
auto index = j * channels + i;
|
||||
if (index >= elem_count) {
|
||||
MS_LOG(ERROR) << "over flow!";
|
||||
return RET_ERROR;
|
||||
}
|
||||
sum += tensor_data[index];
|
||||
}
|
||||
MS_CHECK_GT(bucket_size, 0, false);
|
||||
sum = sum / bucket_size;
|
||||
per_channel_mean->at(i) = sum;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
private:
|
||||
converter::Flags flags_;
|
||||
|
@ -75,7 +113,7 @@ class BiasCorrectionStrategy {
|
|||
|
||||
std::map<std::string, std::vector<float>> fp32_op_input_map_; // concurrency
|
||||
std::map<std::string, std::vector<float>> fp32_op_output_ch_mean_map_; // concurrency
|
||||
std::map<std::string, std::vector<float>> op_bias_diff_map_; // only use by int8 model
|
||||
std::map<std::string, std::vector<float>> op_bias_diff_sum_map_; // Record the sum of diffs in tensor
|
||||
std::mutex mutex_op_input_;
|
||||
std::mutex mutex_op_output_;
|
||||
};
|
||||
|
|
|
@ -607,7 +607,7 @@ int FullQuantQuantizer::DoQuantize(FuncGraphPtr func_graph) {
|
|||
MS_LOG(INFO) << "do bias correction";
|
||||
BiasCorrectionStrategy strategy(flags_, calibrator_, fp32_session_, fp32_model_, activation_q_min_,
|
||||
activation_q_max_);
|
||||
status = strategy.DoBiasCorrection(func_graph);
|
||||
status = strategy.DoCPUBiasCorrection(func_graph);
|
||||
if (status != RET_OK) {
|
||||
MS_LOG(ERROR) << "bias_correction failed.";
|
||||
return status;
|
||||
|
|
|
@ -619,4 +619,65 @@ int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive
|
|||
abstractTensor->element()->set_type(TypeIdToType(kNumberTypeInt32));
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int DeQuantData(const int8_t *tensor_data, int64_t elements_num, std::vector<lite::LiteQuantParam> quant_params,
|
||||
std::vector<double> *dequant_data, int preferred_dim) {
|
||||
if (quant_params.size() != 1) {
|
||||
MS_LOG(ERROR) << "unexpected quant_params size: " << quant_params.size() << " only support per-layer now.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto scale = quant_params[0].scale;
|
||||
auto zp = quant_params[0].zeroPoint;
|
||||
dequant_data->resize(elements_num);
|
||||
for (int64_t i = 0; i < elements_num; i++) {
|
||||
dequant_data->at(i) = scale * (tensor_data[i] - zp);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
// Convenience overload: dequantizes the data held by `tensor` into `dequant_data` using the tensor's own
// element count and quant params.
// The tensor buffer is reinterpreted as int8 without checking data_type() here
// (NOTE(review): callers are expected to verify the tensor is int8 first - confirm for new call sites).
// Returns RET_ERROR when the tensor or its data buffer is null, otherwise the result of the pointer overload.
int DeQuantData(mindspore::tensor::MSTensor *tensor, std::vector<double> *dequant_data, int preferred_dim) {
  if (tensor == nullptr) {
    MS_LOG(ERROR) << "tensor is nullptr.";
    return RET_ERROR;
  }
  auto *raw_data = tensor->data();
  if (raw_data == nullptr) {
    MS_LOG(ERROR) << "tensor data is nullptr.";
    return RET_ERROR;
  }
  return DeQuantData(static_cast<int8_t *>(raw_data), tensor->ElementsNum(), tensor->quant_params(), dequant_data,
                     preferred_dim);
}
|
||||
|
||||
int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input) {
|
||||
if (bit_num > 0 && bit_num < k8Bit) {
|
||||
std::vector<int8_t> origin_data(tensor_input->data.size());
|
||||
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int8_t), tensor_input->data.data(),
|
||||
tensor_input->data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << tensor_input->name << " memcpy failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<uint8_t> pack_data{};
|
||||
BitPack::BitPacking<int8_t, uint8_t>(bit_num, origin_data, &pack_data);
|
||||
tensor_input->data.resize(pack_data.size() * sizeof(uint8_t));
|
||||
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
|
||||
pack_data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy_s failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else if (bit_num > k8Bit && bit_num < k16Bit) {
|
||||
auto shape_size =
|
||||
std::accumulate(tensor_input->dims.begin(), tensor_input->dims.end(), size_t(1), std::multiplies<size_t>());
|
||||
std::vector<int16_t> origin_data(shape_size);
|
||||
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int16_t), tensor_input->data.data(),
|
||||
tensor_input->data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<uint16_t> pack_data{};
|
||||
BitPack::BitPacking<int16_t, uint16_t>(bit_num, origin_data, &pack_data);
|
||||
tensor_input->data.resize(pack_data.size() * sizeof(uint16_t));
|
||||
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
|
||||
pack_data.size() * sizeof(uint16_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy_s failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore::lite::quant
|
||||
|
|
|
@ -98,6 +98,13 @@ std::vector<int> ConvertShapeVectorToInt32(const ShapeVector &dims);
|
|||
|
||||
int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive);
|
||||
|
||||
int DeQuantData(mindspore::tensor::MSTensor *tensor, std::vector<double> *dequant_data, int preferred_dim = 0);
|
||||
|
||||
int DeQuantData(const int8_t *tensor_data, int64_t elements_num, std::vector<lite::LiteQuantParam> quant_params,
|
||||
std::vector<double> *dequant_data, int preferred_dim);
|
||||
|
||||
int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input);
|
||||
|
||||
template <typename T>
|
||||
int FixedBitQuantFilter(const AnfNodePtr ¶meter, const tensor::TensorPtr &weight, const PrimitivePtr &primitive,
|
||||
QuantType quant_type, int quant_max, int quant_min, size_t bit_num,
|
||||
|
|
Loading…
Reference in New Issue