optimize mixed bit quantization code

This commit is contained in:
yeyunpeng2020 2022-08-08 11:25:14 +08:00
parent a397693ca6
commit e58ede7d5a
6 changed files with 79 additions and 91 deletions

View File

@ -21,7 +21,9 @@
#include "ir/anf.h"
#include "ir/tensor.h"
#include "tools/converter/quantizer/fse_bit_stream.h"
#include "tools/converter/quantizer/mixed_bit_weight_quantizer.h"
#include "schema/inner/model_generated.h"
#include "src/common/log_adapter.h"
#include "src/common/quant_utils.h"
namespace mindspore::lite::quant {
constexpr int MAX_SYMS = 65534;

View File

@ -14,17 +14,18 @@
* limitations under the License.
*/
#include "tools/converter/quantizer/mixed_bit_weight_quantizer.h"
#include "tools/converter/quantizer/mixed_bit_weight_quantization.h"
#include <cmath>
#include <cfloat>
#include <map>
#include "tools/common/statistic_utils.h"
#include "tools/converter/quantizer/quantize_util.h"
namespace mindspore::lite::quant {
constexpr float kTwentyFour = 24.0f;
void MixedBitWeightQuantizer::GetBiasCorrection(float *weights, int element_num, float scale,
float *origin_dequant_datas) {
void MixedBitWeightQuantization::CalculateBiasCorrection(float *weights, int element_num, float scale,
float *origin_dequant_datas) {
MS_ASSERT(weights != nullptr);
MS_ASSERT(origin_dequant_datas != nullptr);
MS_ASSERT(element_num > 0);
@ -60,7 +61,7 @@ void MixedBitWeightQuantizer::GetBiasCorrection(float *weights, int element_num,
}
// the error is currently measured per channel.
float MixedBitWeightQuantizer::CalculateMeanError(std::vector<float> norms2, std::vector<float> dnorms2) {
float MixedBitWeightQuantization::CalculateMeanError(std::vector<float> norms2, std::vector<float> dnorms2) {
int error_count = 0;
float mse_error = 1e-10f;
const float soft = 1e-7f;
@ -77,8 +78,8 @@ float MixedBitWeightQuantizer::CalculateMeanError(std::vector<float> norms2, std
}
// the `preferred` dim should point to the output channels dimension.
float MixedBitWeightQuantizer::MeasureQuantizationError(float *weights, const int *shape, int dims, int preferred_dim,
float scale) {
float MixedBitWeightQuantization::MeasureQuantizationError(float *weights, const int *shape, int dims,
int preferred_dim, float scale) {
MS_ASSERT(weights != nullptr);
MS_ASSERT(shape != nullptr);
// Init
@ -109,7 +110,7 @@ float MixedBitWeightQuantizer::MeasureQuantizationError(float *weights, const in
MS_ASSERT(bucket_volume != 0);
const float upround_offset = 0.5;
// Bias Correction
GetBiasCorrection(weights, element_num, scale, origin_dequant_datas.data());
CalculateBiasCorrection(weights, element_num, scale, origin_dequant_datas.data());
for (int i = 0; i < element_num; i++) {
int bucket = (i / bucket_volume) % bucket_count;
norms2[bucket] += weights[i] * weights[i];
@ -122,7 +123,7 @@ float MixedBitWeightQuantizer::MeasureQuantizationError(float *weights, const in
return mean_error;
}
LayerParam MixedBitWeightQuantizer::CalculateLayerParams(const float *weights, int element_num) {
LayerParam MixedBitWeightQuantization::CalculateLayerParams(const float *weights, int element_num) {
MS_ASSERT(weights != nullptr);
float temp_norm_tot = 0.0;
for (int i = 0; i < element_num; i++) {
@ -133,7 +134,7 @@ LayerParam MixedBitWeightQuantizer::CalculateLayerParams(const float *weights, i
return ret;
}
MinMax MixedBitWeightQuantizer::GetMinMax(const float *arr, int arrc) {
MinMax MixedBitWeightQuantization::GetMinMax(const float *arr, int arrc) {
MS_ASSERT(arr != nullptr);
MinMax min_max = {INFINITY, -INFINITY};
for (int i = 0; i < arrc; i++)
@ -144,9 +145,9 @@ MinMax MixedBitWeightQuantizer::GetMinMax(const float *arr, int arrc) {
return min_max;
}
BinarySearchResult MixedBitWeightQuantizer::BinarySearchForQuantizationScale(float *weights, int *shape, int dims,
int preferred_dim, int max_iters,
float target_err, float rel_tol) {
BinarySearchResult MixedBitWeightQuantization::BinarySearchForQuantizationScale(float *weights, int *shape, int dims,
int preferred_dim, int max_iters,
float target_err, float rel_tol) {
MS_ASSERT(weights != nullptr);
MS_ASSERT(shape != nullptr);
int element_num = 1;
@ -191,8 +192,8 @@ BinarySearchResult MixedBitWeightQuantizer::BinarySearchForQuantizationScale(flo
}
}
float MixedBitWeightQuantizer::GetDx(float *weights, int *shape, int dims, int preferred_dim,
const std::string &description) {
float MixedBitWeightQuantization::GetDx(float *weights, int *shape, int dims, int preferred_dim,
const std::string &description) {
MS_ASSERT(weights != nullptr);
MS_ASSERT(shape != nullptr);
static std::map<std::string, LayerParam> param_map;
@ -213,10 +214,10 @@ float MixedBitWeightQuantizer::GetDx(float *weights, int *shape, int dims, int p
return (target_relative_err_ + target_search_tolerance_ * std::sqrt(kTwentyFour / element_num)) / params.inv_norm;
}
int MixedBitWeightQuantizer::DoQuantization(float *weights, std::vector<int64_t> shape, int preferred_dim,
std::vector<schema::QuantParamT> *quant_params,
std::vector<int16_t> *quant_datas, const std::string &description,
bool use_auto_tune_alg) {
int MixedBitWeightQuantization::DoQuantization(float *weights, std::vector<int64_t> shape, int preferred_dim,
std::vector<schema::QuantParamT> *quant_params,
std::vector<int16_t> *quant_datas, const std::string &description,
bool use_auto_tune_alg) {
MS_ASSERT(weights != nullptr);
MS_ASSERT(quant_params != nullptr);
MS_ASSERT(quant_datas != nullptr);
@ -252,8 +253,8 @@ int MixedBitWeightQuantizer::DoQuantization(float *weights, std::vector<int64_t>
return RET_OK;
}
int MixedBitWeightQuantizer::QuantizeByScale(const float *weights, int weightsc, float scale,
schema::QuantParamT *quant_params, std::vector<int16_t> *quant_datas) {
int MixedBitWeightQuantization::QuantizeByScale(const float *weights, int weightsc, float scale,
schema::QuantParamT *quant_params, std::vector<int16_t> *quant_datas) {
MS_ASSERT(weights != nullptr);
MS_ASSERT(weightsc <= quant_datas->size());
const float upround_offset = 0.5;
@ -269,4 +270,40 @@ int MixedBitWeightQuantizer::QuantizeByScale(const float *weights, int weightsc,
quant_params->inited = true;
return RET_OK;
}
// Quantizes one weight tensor with the mixed-bit scheme and records the result on the
// owning primitive: runs DoQuantization over the raw float data, replaces the tensor's
// payload with the int16 quantized values, and stores the produced quant params in the
// CNode's quant holder at `index`.
// NOTE(review): this is a diff rendering — leading indentation was stripped by the viewer.
// Params:
//   primitive      - op whose quant holder receives the params (null-checked).
//   parameter_node - node owning the weight; its fullname is used as the description key.
//   weight         - float weight tensor to quantize in place (null-checked).
//   index          - input slot on the quant holder for the produced params.
//   quant_type     - quant type stamped on the holder on success.
//   use_auto_tune_alg - forwarded to DoQuantization's search strategy.
// Returns RET_OK on success; RET_ERROR on null data, update failure, or empty params;
// otherwise propagates DoQuantization's error code.
int MixedBitWeightQuantization::QuantFilter(const PrimitivePtr &primitive, const AnfNodePtr &parameter_node,
const tensor::TensorPtr &weight, int index, schema::QuantType quant_type,
bool use_auto_tune_alg) {
CHECK_NULL_RETURN(primitive);
CHECK_NULL_RETURN(weight);
std::vector<schema::QuantParamT> quant_params;
int elem_count = weight->DataSize();
auto *raw_data = static_cast<float *>(weight->data_c());
if (raw_data == nullptr) {
MS_LOG(ERROR) << "rawDatas is nullptr";
return RET_ERROR;
}
// One int16 slot per weight element; filled by DoQuantization.
std::vector<int16_t> quant_data(elem_count);
// preferred_dim is fixed to 0 here; the node's full name keys the per-layer param cache.
auto ret = DoQuantization(static_cast<float *>(weight->data_c()), weight->shape_c(), 0, &quant_params, &quant_data,
parameter_node->fullname_with_scope(), use_auto_tune_alg);
if (ret != RET_OK) {
return ret;
}
// Swap the tensor's float payload for the quantized int16 buffer.
ret = UpdateTensorDataAndSize(parameter_node, weight, quant_data.data(), quant_data.size() * sizeof(int16_t),
kNumberTypeInt16);
if (ret != RET_OK) {
MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
return RET_ERROR;
}
// DoQuantization succeeding with no params would leave the model inconsistent.
if (quant_params.empty()) {
MS_LOG(ERROR) << "quant_params empty";
return RET_ERROR;
}
auto quant_param_holder = GetCNodeQuantHolder(primitive);
quant_param_holder->set_input_quant_param(index, quant_params);
quant_param_holder->set_quant_type(quant_type);
return ret;
}
} // namespace mindspore::lite::quant

View File

@ -14,8 +14,8 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_MIXED_BIT_WEIGHT_QUANTIZER_H_
#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_MIXED_BIT_WEIGHT_QUANTIZER_H_
#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_MIXED_BIT_WEIGHT_QUANTIZATION_H_
#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_MIXED_BIT_WEIGHT_QUANTIZATION_H_
#include <cstdint>
#include <vector>
#include <cmath>
@ -23,6 +23,7 @@
#include "schema/inner/model_generated.h"
#include "src/common/log_adapter.h"
#include "src/common/quant_utils.h"
#include "ir/tensor.h"
namespace mindspore::lite::quant {
constexpr float kBinarySearchStep = 2.0;
@ -36,20 +37,22 @@ typedef struct {
MinMax mm;
} LayerParam;
class MixedBitWeightQuantizer {
class MixedBitWeightQuantization {
public:
explicit MixedBitWeightQuantizer(float target_relative_err = 0.01, float target_search_tolerance = 0.01,
int max_search_iters = 100)
explicit MixedBitWeightQuantization(float target_relative_err, float target_search_tolerance = 0.01,
int max_search_iters = 100)
: target_relative_err_(target_relative_err),
target_search_tolerance_(target_search_tolerance),
max_search_iters_(max_search_iters) {}
~MixedBitWeightQuantizer() = default;
~MixedBitWeightQuantization() = default;
int DoQuantization(float *weights, std::vector<int64_t> shape, int preferred_dim,
std::vector<schema::QuantParamT> *quant_params, std::vector<int16_t> *quant_datas,
const std::string &description, bool use_auto_tune_alg);
int QuantFilter(const PrimitivePtr &primitive, const AnfNodePtr &parameter_node, const tensor::TensorPtr &weight,
int index, schema::QuantType quant_type, bool use_auto_tune_alg = false);
private:
int DoQuantization(float *weights, std::vector<int64_t> shape, int preferred_dim,
std::vector<schema::QuantParamT> *quant_params, std::vector<int16_t> *quant_datas,
const std::string &description, bool use_auto_tune_alg = false);
float MeasureQuantizationError(float *weights, const int *shape, int dims, int preferred_dim, float scale);
static MinMax GetMinMax(const float *arr, int arrc);
@ -63,7 +66,7 @@ class MixedBitWeightQuantizer {
float GetDx(float *weights, int *shape, int dims, int preferred_dim, const std::string &description);
void GetBiasCorrection(float *weights, int element_num, float scale, float *origin_dequant_datas);
void CalculateBiasCorrection(float *weights, int element_num, float scale, float *origin_dequant_datas);
float CalculateMeanError(std::vector<float> norms2, std::vector<float> dnorms2);
@ -75,4 +78,4 @@ class MixedBitWeightQuantizer {
int max_search_iters_;
};
} // namespace mindspore::lite::quant
#endif // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_MIXED_BIT_WEIGHT_QUANTIZER_H_
#endif // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_MIXED_BIT_WEIGHT_QUANTIZATION_H_

View File

@ -490,56 +490,6 @@ void CalQuantAssitInfo(const schema::PrimitiveT &primitive, const std::vector<in
}
}
// (Removed by this commit; superseded by MixedBitWeightQuantization::QuantFilter.)
// Free-function mixed-bit weight quantizer: validates the requested quant type,
// quantizes the tensor via MixedBitWeightQuantizer::DoQuantization, writes the
// int16 result back into the tensor, and attaches quant params to the primitive.
// NOTE(review): this is a diff rendering — leading indentation was stripped by the viewer.
// Returns RET_NULL_PTR on null primitive/weight, RET_ERROR on unsupported type or
// update failure, otherwise DoQuantization's status.
int MixedBitQuantFilter(const AnfNodePtr &parameter_node, const tensor::TensorPtr &weight,
const PrimitivePtr &primitive, schema::QuantType quant_type, WeightQuantType weight_quant_type,
TypeId quant_data_type, double init_scale, int index, int preferred_dim, bool symmetric,
bool use_auto_tune_alg) {
MS_CHECK_TRUE_RET(primitive != nullptr, RET_NULL_PTR);
MS_CHECK_TRUE_RET(weight != nullptr, RET_NULL_PTR);
auto dims = weight->shape();
// Per-channel needs >=2 dims; silently degrade to per-layer otherwise.
if (weight_quant_type == FIXED_BIT_PER_CHANNEL) {
if (dims.size() <= 1) {
MS_LOG(WARNING) << "dims is " << dims.size() << " can not per_channel";
weight_quant_type = FIXED_BIT_PER_LAYER;
}
}
std::vector<schema::QuantParamT> quant_params;
int elem_count = weight->DataSize();
auto *raw_data = static_cast<float *>(weight->data_c());
if (raw_data == nullptr) {
MS_LOG(ERROR) << "rawDatas is nullptr";
return RET_ERROR;
}
std::vector<int16_t> quant_data(elem_count);
// Despite the per-channel fallback above, this path only supports MIXED_BIT_PER_LAYER.
if (weight_quant_type != MIXED_BIT_PER_LAYER) {
MS_LOG(ERROR) << "Unsupported weight quant type:" << weight_quant_type;
return RET_ERROR;
}
// init_scale seeds the quantizer's target relative error.
MixedBitWeightQuantizer quantizer(init_scale);
auto ret = quantizer.DoQuantization(static_cast<float *>(weight->data_c()), weight->shape_c(), 0, &quant_params,
&quant_data, parameter_node->fullname_with_scope(), use_auto_tune_alg);
if (ret != RET_OK) {
return ret;
}
// Replace the float payload with the quantized buffer, typed as quant_data_type.
auto status = UpdateTensorDataAndSize(parameter_node, weight, quant_data.data(), quant_data.size() * sizeof(int16_t),
quant_data_type);
if (status != RET_OK) {
MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
return RET_ERROR;
}
if (quant_params.empty()) {
MS_LOG(ERROR) << "quant_params empty";
return RET_ERROR;
}
auto quant_param_holder = GetCNodeQuantHolder(primitive);
quant_param_holder->set_input_quant_param(index, quant_params);
quant_param_holder->set_quant_type(quant_type);
return ret;
}
bool CheckNodeInSet(const CNodePtr &cnode, const std::set<PrimitivePtr> &support_primitive_types) {
for (const auto &type : support_primitive_types) {
if (opt::CheckPrimitiveType(cnode, type)) {

View File

@ -46,7 +46,7 @@
#include "abstract/dshape.h"
#include "tools/converter/quantizer/huffman_encode.h"
#include "tools/converter/quantizer/quant_params.h"
#include "tools/converter/quantizer/mixed_bit_weight_quantizer.h"
#include "tools/converter/quantizer/mixed_bit_weight_quantization.h"
#include "src/litert/lite_session.h"
#include "src/common/file_utils.h"
#include "src/common/quant_utils.h"
@ -67,11 +67,6 @@ void CalQuantAssitInfo(const schema::PrimitiveT &primitive, const std::vector<in
bool TensorQuantParamsInited(const schema::TensorT &tensor);
int MixedBitQuantFilter(const AnfNodePtr &parameter_node, const tensor::TensorPtr &weight,
const PrimitivePtr &primitive, schema::QuantType quant_type, WeightQuantType weight_quant_type,
TypeId quant_data_type, double init_scale, int index, int preferred_dim, bool symmetric,
bool use_auto_tune_alg);
int CalChannels(const std::vector<int> &dims, int channel_cnt, bool *channel_at_first);
int GetPreferredDim(const CNodePtr &cnode, int input_index, const std::vector<int> &dims);

View File

@ -25,6 +25,7 @@
#include "tools/converter/quantizer/fse_encoder.h"
#include "tools/converter/quantizer/tensor_compressor.h"
#include "tools/converter/quantizer/cluster_quantization.h"
#include "tools/converter/quantizer/mixed_bit_weight_quantization.h"
namespace mindspore::lite::quant {
static const float kScaleFactor = (0.01 * 0.01 * 0.01 * 24.0);
@ -147,9 +148,9 @@ int WeightQuantizer::DoMixBitQuant(const CNodePtr &cnode, const ParameterPtr &pa
WeightQuantType weight_quant_type, bool symmetric) {
auto primitive = GetValueNode<PrimitivePtr>(cnode->input(0));
CHECK_NULL_RETURN(primitive);
auto status = MixedBitQuantFilter(parameter, tensor_info, primitive, param_->commonQuantParam.quant_type,
WeightQuantType::MIXED_BIT_PER_LAYER, type_id_, mixed_bit_init_scale_, idx - 1,
preferred_dim, symmetric, is_auto_tune_);
auto mixed_bit_quantization = MixedBitWeightQuantization(mixed_bit_init_scale_);
auto status = mixed_bit_quantization.QuantFilter(primitive, parameter, tensor_info, idx - 1,
param_->commonQuantParam.quant_type, is_auto_tune_);
if (status == RET_OK) {
FSEEncoder fse_encoder;
auto quant_param_holder = GetCNodeQuantHolder(primitive);