From 366826c3d1feda587c5e62d1cbf58db3eb56c89d Mon Sep 17 00:00:00 2001
From: yeyunpeng2020
Date: Mon, 18 Jul 2022 16:19:22 +0800
Subject: [PATCH] Move tensor compression into TensorCompressor

---
 .../graph/tensor_quant_pass.cc                |   6 +-
 .../tools/converter/quantizer/bitpacking.h    |  11 +-
 .../tools/converter/quantizer/quant_params.h  |  18 ++
 .../converter/quantizer/quantize_util.cc      |  57 ----
 .../tools/converter/quantizer/quantize_util.h | 234 +---------------
 .../converter/quantizer/tensor_compressor.cc  |  87 ++++++
 .../converter/quantizer/tensor_compressor.h   | 257 ++++++++++++++++++
 .../lite/tools/lite_exporter/anf_exporter.cc  |   8 +-
 8 files changed, 379 insertions(+), 299 deletions(-)
 create mode 100644 mindspore/lite/tools/converter/quantizer/tensor_compressor.cc
 create mode 100644 mindspore/lite/tools/converter/quantizer/tensor_compressor.h

diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/tensor_quant_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/graph/tensor_quant_pass.cc
index 19618686f32..ccb70911e60 100644
--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/tensor_quant_pass.cc
+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/tensor_quant_pass.cc
@@ -21,6 +21,7 @@
 #include
 #include "tools/converter/converter_context.h"
 #include "tools/converter/quantizer/quantize_util.h"
+#include "tools/converter/quantizer/tensor_compressor.h"
 #include "tools/common/tensor_util.h"
 #include "tools/common/graph_util.h"
 #include "tools/common/meta_graph_utils.h"
@@ -192,6 +193,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
       index++;
       continue;
     }
+    auto compressor = quant::TensorCompressor();
     if (tensor->quantParams.size() > 1) {  // perchannel
       status = ComputeQuantTensorPerChannel(tensor.get(), index, *graph);
@@ -200,7 +202,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
         return RET_ERROR;
       }
       int bit_num = tensor->quantParams.front()->numBits;
-      if (quant::DoBitPack(bit_num, tensor.get()) != RET_OK) {
+      if (compressor.DoBitPack(bit_num, tensor.get()) != RET_OK) {
         MS_LOG(ERROR) << "bit pack failed.";
         return RET_ERROR;
       }
@@ -213,7 +215,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
                quantParam->dstDtype == TypeId::kNumberTypeFloat32 || quantParam->dstDtype == TypeId::kNumberTypeFloat) {
       status = ComputeDataToInt8(tensor);
       int bit_num = tensor->quantParams.front()->numBits;
-      if (quant::DoBitPack(bit_num, tensor.get()) != RET_OK) {
+      if (compressor.DoBitPack(bit_num, tensor.get()) != RET_OK) {
         MS_LOG(ERROR) << "bit pack failed.";
         return RET_ERROR;
       }
diff --git a/mindspore/lite/tools/converter/quantizer/bitpacking.h b/mindspore/lite/tools/converter/quantizer/bitpacking.h
index 687c5afe220..2782b42ca15 100644
--- a/mindspore/lite/tools/converter/quantizer/bitpacking.h
+++ b/mindspore/lite/tools/converter/quantizer/bitpacking.h
@@ -16,12 +16,15 @@
 #ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H_
 #define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H_
 
+#include
 #include
 #include
 #include
 #include
 #include
+#include "tools/converter/quantizer/quant_params.h"
+using mindspore::lite::quant::k8Bit;
 
 namespace mindspore::lite {
 class BitPack {
  public:
@@ -36,8 +39,8 @@ class BitPack {
       DoBinary(bit_num, tmp, &bit_data_vec, packed_data_vec);
     }
     size_t remain_bit_data = bit_data_vec.size();
-    if (sizeof(T1) * 8 > remain_bit_data && remain_bit_data > 0) {
-      for (size_t i = 0; i < sizeof(T1) * 8 - remain_bit_data; i++) {
+    if (sizeof(T1) * k8Bit > remain_bit_data && remain_bit_data > 0) {
+      for (size_t i = 0; i < sizeof(T1) * k8Bit - remain_bit_data; i++) {
         bit_data_vec.push(false);
       }
       PackFromOriginToUint(&bit_data_vec, packed_data_vec);
@@ -49,7 +52,7 @@ class BitPack {
   static void PackFromOriginToUint(std::stack<bool> *ans, std::vector<T2> *packed_data_vec) {
     MS_ASSERT(ans != nullptr);
     uint32_t result = 0;
-    for (size_t i = 0; i < sizeof(T2) * 8; i++) {
+    for (size_t i = 0; i < sizeof(T2) * k8Bit; i++) {
       bool bit_tmp = ans->top();
       result = (result << 1) + static_cast<int>(bit_tmp);
       ans->pop();
@@ -64,7 +67,7 @@ class BitPack {
     bool a = n % 2;
     n = n / 2;
     ans->push(a);
-    if (ans->size() == sizeof(T2) * 8) {
+    if (ans->size() == sizeof(T2) * k8Bit) {
       PackFromOriginToUint(ans, packed_data_vec);
     }
diff --git a/mindspore/lite/tools/converter/quantizer/quant_params.h b/mindspore/lite/tools/converter/quantizer/quant_params.h
index d7656802f3f..695afd83c6a 100644
--- a/mindspore/lite/tools/converter/quantizer/quant_params.h
+++ b/mindspore/lite/tools/converter/quantizer/quant_params.h
@@ -22,6 +22,24 @@
 #include
 #include "schema/inner/model_generated.h"
 namespace mindspore::lite::quant {
+enum WeightQuantType {
+  FIXED_BIT_PER_CHANNEL = 0,
+  FIXED_BIT_PER_LAYER = 1,
+  MIXED_BIT_PER_LAYER = 2,
+};
+constexpr size_t k2Bit = 2;
+constexpr size_t k8Bit = 8;
+constexpr size_t k10Bit = 10;
+constexpr size_t k16Bit = 16;
+constexpr size_t k32Bit = 32;
+constexpr size_t kMaxNum1024 = 1024;
+constexpr size_t kMillisecondsBase = 10;
+constexpr float kDelta = 0.1;
+constexpr float kRatio = 10.0;
+constexpr int kCpuBindMode = 1;
+constexpr int kPrimIndex = 0;
+constexpr int kPrimOffset = 1;
+
 enum ActivationQuantizedMethod {
   MAX_MIN = 0,
   KL = 1,
diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.cc b/mindspore/lite/tools/converter/quantizer/quantize_util.cc
index ea9d650cf27..8f504dcb6b7 100644
--- a/mindspore/lite/tools/converter/quantizer/quantize_util.cc
+++ b/mindspore/lite/tools/converter/quantizer/quantize_util.cc
@@ -49,7 +49,6 @@ constexpr int kLstmWeightShapeSize = 3;
 constexpr int kSingleDirBiasTensorSize = 4;
 constexpr int kLstmBiasShapeSize = 2;
 constexpr int kLstmBiasIndex = 3;
-constexpr size_t kBitNumPerByte = 8;
 constexpr size_t kGatherAxisIndex = 3;
 
 int ComputeBiasDataAndQuantParam(const std::vector &bias_scales, const std::vector &input_scales,
@@ -595,21 +594,6 @@ bool CheckNodeInSet(const CNodePtr &cnode, const std::set &support
   return false;
 }
 
-std::string BoolVectorToString(const std::vector<bool> &bool_vec) {
-  size_t size_in_byte = static_cast<size_t>(ceil(bool_vec.size() / kBitNumPerByte));
-  std::string str(size_in_byte, '\0');
-  auto iter = str.begin();
-  size_t shift = kBitNumPerByte;
-  for (bool bit : bool_vec) {
-    *iter |= bit << (shift - 1);
-    if (--shift == 0) {
-      iter++;
-      shift = kBitNumPerByte;
-    }
-  }
-  return str;
-}
-
 int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive) {
   CHECK_NULL_RETURN(bias);
   CHECK_NULL_RETURN(primitive);
@@ -703,47 +687,6 @@ int DeQuantData(const mindspore::MSTensor *tensor, std::vector *dequant_
                     tensor->QuantParams(), dequant_data, preferred_dim);
 }
 
-int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input) {
-  if (bit_num > 0 && bit_num < k8Bit) {
-    std::vector<int8_t> origin_data(tensor_input->data.size());
-    auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int8_t), tensor_input->data.data(),
-                           tensor_input->data.size() * sizeof(uint8_t));
-    if (status != EOK) {
-      MS_LOG(ERROR) << tensor_input->name << " memcpy failed. " << status;
-      return RET_ERROR;
-    }
-    std::vector<uint8_t> pack_data{};
-    BitPack::BitPacking(bit_num, origin_data, &pack_data);
-    tensor_input->data.resize(pack_data.size() * sizeof(uint8_t));
-    status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
-                      pack_data.size() * sizeof(uint8_t));
-    if (status != EOK) {
-      MS_LOG(ERROR) << "memcpy_s failed. " << status;
-      return RET_ERROR;
-    }
-  } else if (bit_num > k8Bit && bit_num < k16Bit) {
-    auto shape_size =
-      std::accumulate(tensor_input->dims.begin(), tensor_input->dims.end(), size_t(1), std::multiplies<size_t>());
-    std::vector<int16_t> origin_data(shape_size);
-    auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int16_t), tensor_input->data.data(),
-                           tensor_input->data.size() * sizeof(uint8_t));
-    if (status != EOK) {
-      MS_LOG(ERROR) << "memcpy failed. " << status;
-      return RET_ERROR;
-    }
-    std::vector<uint16_t> pack_data{};
-    BitPack::BitPacking(bit_num, origin_data, &pack_data);
-    tensor_input->data.resize(pack_data.size() * sizeof(uint16_t));
-    status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
-                      pack_data.size() * sizeof(uint16_t));
-    if (status != EOK) {
-      MS_LOG(ERROR) << "memcpy_s failed. " << status;
-      return RET_ERROR;
-    }
-  }
-  return RET_OK;
-}
-
 int GetElementNumFromShape(const std::vector<int> &dims, int *total_size) {
   CHECK_NULL_RETURN(total_size);
   *total_size = 1;
diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.h b/mindspore/lite/tools/converter/quantizer/quantize_util.h
index f486f60e599..316005efa9f 100644
--- a/mindspore/lite/tools/converter/quantizer/quantize_util.h
+++ b/mindspore/lite/tools/converter/quantizer/quantize_util.h
@@ -45,7 +45,7 @@
 #include "ir/primitive.h"
 #include "abstract/dshape.h"
 #include "tools/converter/quantizer/huffman_encode.h"
-#include "tools/converter/quantizer/bitpacking.h"
+#include "tools/converter/quantizer/quant_params.h"
 #include "tools/converter/quantizer/mixed_bit_weight_quantizer.h"
 #include "src/runtime/lite_session.h"
 #include "tools/converter/graphdef_transform.h"
@@ -56,24 +56,6 @@
 #include "tools/common/string_util.h"
 
 namespace mindspore::lite::quant {
-enum WeightQuantType {
-  FIXED_BIT_PER_CHANNEL = 0,
-  FIXED_BIT_PER_LAYER = 1,
-  MIXED_BIT_PER_LAYER = 2,
-};
-constexpr size_t k2Bit = 2;
-constexpr size_t k8Bit = 8;
-constexpr size_t k10Bit = 10;
-constexpr size_t k16Bit = 16;
-constexpr size_t k32Bit = 32;
-constexpr size_t kMaxNum1024 = 1024;
-constexpr size_t kMillisecondsBase = 10;
-constexpr float kDelta = 0.1;
-constexpr float kRatio = 10.0;
-constexpr int kCpuBindMode = 1;
-constexpr int kPrimIndex = 0;
-constexpr int kPrimOffset = 1;
-
 QuantParamHolderPtr GetCNodeQuantHolder(const PrimitivePtr &primitive);
 QuantParamHolderPtr GetCNodeQuantHolder(const CNodePtr &cnode);
@@ -102,8 +84,6 @@ int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive
 int DeQuantData(const mindspore::MSTensor *tensor, std::vector *dequant_data, int preferred_dim = 0);
 
-int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input);
-
 int GetQuantType(const CNodePtr &cnode);
 
 template <typename T>
@@ -198,221 +178,9 @@ void GetLiteParameter(const AnfNodePtr &node, ParameterPtr *param_node, tensor::
 bool CheckNodeInSet(const CNodePtr &cnode, const std::set<PrimitivePtr> &support_primitive_types);
 
-std::string BoolVectorToString(const std::vector<bool> &bool_vec);
-
 int GetElementNumFromShape(const std::vector<int> &dims, int *total_size);
 
 int GetBucketAllIndex(const std::vector<int> &dims, int preferred_dim, std::vector<std::vector<size_t>> *buckets_data_index);
-
-template <typename T>
-bool IndexingCompress(const std::set<T> &quant_data_set, const std::map<T, size_t> &unique_value_index_map,
-                      size_t unique_value_bit, size_t unique_value_cnt, size_t pack_repetition_size_in_byte,
-                      size_t bit_num, schema::TensorT *tensor) {
-  auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
-  std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
-
-  std::vector<bool> bits(pack_repetition_size_in_byte * k8Bit);
-  size_t index = 0;
-  // write unique_value_cnt: bit_num bit for unsigned
-  for (size_t i = 0; i < bit_num; i++) {
-    bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & (0x1);
-  }
-  // write the unique value set: each value has bit_num bit signed
-  for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
-    for (size_t i = 0; i < bit_num; i++) {
-      bits[index++] = ((*iter + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
-    }
-  }
-  // write the index: each index has unique_value_bit unsigned
-  for (auto quant_value : quant_data) {
-    for (size_t i = 0; i < unique_value_bit; i++) {
-      bits[index++] = (unique_value_index_map.at(quant_value) >> (unique_value_bit - i - 1)) & (0x1);
-    }
-  }
-  if (index > pack_repetition_size_in_byte * k8Bit) {
-    MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
-                  << pack_repetition_size_in_byte * k8Bit;
-    return false;
-  }
-  // update tensor data
-  auto new_data_str = BoolVectorToString(bits);
-  auto ret = memcpy_s(tensor->data.data(), tensor->data.size(), new_data_str.c_str(), new_data_str.size());
-  if (ret != EOK) {
-    MS_LOG(ERROR) << "memcpy error";
-    return false;
-  }
-  tensor->data.resize(new_data_str.size());
-
-  tensor->weightQuantCompressType = schema::WeightQuantCompressType_INDEXING;
-  MS_LOG(DEBUG) << "set WeightQuantCompressType_INDEXING";
-  return true;
-}
-
-template <typename T>
-bool SparsityCompress(const std::set<T> &quant_data_set, const std::map<T, size_t> &unique_value_index_map,
-                      size_t unique_value_bit, size_t unique_value_cnt, size_t pack_sparsity_size_in_byte,
-                      size_t nz_cnt, size_t coor_best_bit, size_t bit_num, schema::TensorT *tensor) {
-  auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
-  std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
-  auto &quant_params = tensor->quantParams;
-  auto elem_cnt = quant_data.size();
-  auto channel_cnt = quant_params.size();
-  MS_CHECK_TRUE_MSG(channel_cnt != 0, false, "div zero.");
-  auto elem_perchannel = elem_cnt / channel_cnt;
-
-  std::vector<bool> bits(pack_sparsity_size_in_byte * k8Bit);
-  int index = 0;
-  // coor_best_bit
-  for (size_t i = 0; i < k8Bit; i++) {
-    bits[index++] = (coor_best_bit >> (k8Bit - i - 1)) & 0x1;
-  }
-  // nz_cnt
-  for (size_t i = 0; i < k32Bit; i++) {
-    bits[index++] = (nz_cnt >> (k32Bit - i - 1)) & 0x1;
-  }
-  // unique_value cnt
-  for (size_t i = 0; i < bit_num; i++) {
-    bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & 0x1;
-  }
-  // unique_values
-  for (auto unique_value : quant_data_set) {
-    for (size_t i = 0; i < bit_num; i++) {
-      bits[index++] = ((unique_value + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
-    }
-  }
-  // nz values indexing && get coor
-  std::vector<size_t> coors(nz_cnt);
-  size_t coors_index = 0;
-  size_t prev_index = -1;
-  for (size_t di = 0; di < elem_cnt; di++) {
-    auto cur_channel = di / elem_perchannel;
-    auto zp = quant_params[cur_channel]->zeroPoint;
-    auto nz_value = quant_data[di];
-    if (nz_value != zp || (di - prev_index) >= static_cast<size_t>((1 << coor_best_bit))) {
-      MS_ASSERT(coors_index < nz_cnt);
-      coors[coors_index++] = di - prev_index - 1;
-      prev_index = di;
-      for (size_t i = 0; i < unique_value_bit; i++) {
-        bits[index++] = (unique_value_index_map.at(nz_value) >> (unique_value_bit - i - 1)) & (0x1);
-      }
-    }
-  }
-  // write coor
-  for (auto coor : coors) {
-    for (size_t i = 0; i < coor_best_bit; i++) {
-      bits[index++] = (coor >> (coor_best_bit - i - 1)) & 0x1;
-    }
-  }
-  if ((unsigned int)index > pack_sparsity_size_in_byte * k8Bit) {
-    MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
-                  << pack_sparsity_size_in_byte * k8Bit;
-    return false;
-  }
-  auto new_data_str = BoolVectorToString(bits);
-  auto ret = memcpy_s(tensor->data.data(), tensor->data.size(), new_data_str.c_str(), new_data_str.size());
-  if (ret != EOK) {
-    MS_LOG(ERROR) << "memcpy error";
-    return false;
-  }
-  tensor->data.resize(new_data_str.size());
-
-  tensor->weightQuantCompressType = schema::WeightQuantCompressType_SPARSE;
-  MS_LOG(INFO) << "set WeightQuantCompressType_SPARSITY";
-  return true;
-}
-
-template <typename T>
-size_t CalCoorBestBit(const std::vector<T> &quant_data, size_t elem_cnt,
-                      const std::vector<std::unique_ptr<schema::QuantParamT>> &quant_params, int unique_value_bit,
-                      size_t *coor_best_bit) {
-  MS_ASSERT(!quant_params.empty());
-  size_t best_nn_cnt = 0;
-  size_t min_len_in_bit = std::numeric_limits<size_t>::max();
-  for (size_t bit = k2Bit; bit <= k10Bit; bit++) {
-    // search
-    int nn_cnt = 0;
-    int prev_index = -1;
-    auto channel_cnt = quant_params.size();
-    MS_ASSERT(channel_cnt > 0);
-    auto elem_perchannel = elem_cnt / channel_cnt;
-    for (size_t i = 0; i < elem_cnt; i++) {
-      auto cur_channel = i / elem_perchannel;
-      auto zp = quant_params[cur_channel]->zeroPoint;
-      if (quant_data[i] != zp || (static_cast<int>(i) - prev_index) >= ((1 << bit))) {
-        nn_cnt++;
-        prev_index = i;
-      }
-    }
-
-    size_t len_in_bit = nn_cnt * bit + nn_cnt * unique_value_bit;
-    if (len_in_bit < min_len_in_bit) {
-      min_len_in_bit = len_in_bit;
-      *coor_best_bit = bit;
-      best_nn_cnt = nn_cnt;
-    }
-  }
-  return best_nn_cnt;
-}
-
-template <typename T>
-bool PackRepetition(size_t bit_num, schema::TensorT *tensor) {
-  if (tensor->weightQuantCompressType != schema::WeightQuantCompressType_NONE) {
-    MS_LOG(INFO) << tensor->name << " is shared weight.";
-    return true;
-  }
-  auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
-  std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
-  auto elem_cnt = quant_data.size();
-  auto dims = tensor->dims;
-  size_t elem_cnt_by_dims = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<>());
-  if (elem_cnt != elem_cnt_by_dims) {
-    MS_LOG(ERROR) << tensor->name << " elem_cnt: " << elem_cnt << " not equal elem_cnt_by_dims: " << elem_cnt_by_dims;
-    return false;
-  }
-
-  auto &quant_params = tensor->quantParams;
-
-  std::set<T> quant_data_set;
-  for (auto quant_value : quant_data) {
-    quant_data_set.insert(quant_value);
-  }
-  std::map<T, size_t> unique_value_index_map;
-  auto index = 0;
-  for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
-    unique_value_index_map[*iter] = index++;
-  }
-
-  auto unique_value_cnt = quant_data_set.size();
-  size_t unique_value_bit = ceil(log2(unique_value_cnt));
-  auto pack_repetition_size_in_bit = bit_num + bit_num * unique_value_cnt + unique_value_bit * elem_cnt;
-  size_t pack_repetition_size_in_byte = ceil(1.0 * pack_repetition_size_in_bit / k8Bit);
-  size_t origin_size_in_byte = ceil(1.0 * bit_num * elem_cnt / k8Bit);
-
-  size_t coor_best_bit = 0;
-  auto nz_cnt = CalCoorBestBit(quant_data, elem_cnt, quant_params, unique_value_bit, &coor_best_bit);
-  // 1. coor_best_bit 2. nz_cnt 3. quant_data_set size 4. unique_values 5. unique_value indexing 6. nz values coord
-  const auto pack_sparsity_size_in_bit =
-    1 * k8Bit + 4 * k8Bit + bit_num + bit_num * unique_value_cnt + unique_value_bit * nz_cnt + nz_cnt * coor_best_bit;
-  size_t pack_sparsity_size_in_byte = ceil(1.0 * pack_sparsity_size_in_bit / k8Bit);
-  MS_LOG(DEBUG) << "coor_best_bit: " << coor_best_bit << " ori: " << origin_size_in_byte
-                << " indexing: " << pack_repetition_size_in_byte << " sparse: " << pack_sparsity_size_in_byte;
-  auto min_byte_need = std::min({origin_size_in_byte, pack_repetition_size_in_byte, pack_sparsity_size_in_byte});
-  if (min_byte_need == origin_size_in_byte) {
-    return false;
-  } else if (min_byte_need == pack_repetition_size_in_byte) {
-    MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_repetition_size_in_byte;
-    return IndexingCompress(quant_data_set, unique_value_index_map, unique_value_bit, unique_value_cnt,
-                            pack_repetition_size_in_byte, bit_num, tensor);
-  } else if (min_byte_need == pack_sparsity_size_in_byte) {
-    MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_sparsity_size_in_byte;
-    return SparsityCompress(quant_data_set, unique_value_index_map, unique_value_bit, unique_value_cnt,
-                            pack_sparsity_size_in_byte, nz_cnt, coor_best_bit, bit_num, tensor);
-  } else {
-    MS_LOG(DEBUG) << "unexpected: " << min_byte_need << " not in {" << origin_size_in_byte << " "
-                  << pack_repetition_size_in_byte << " " << pack_sparsity_size_in_byte << "}";
-  }
-  return false;
-}
 }  // namespace mindspore::lite::quant
 #endif  // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_QUANTIZE_UTIL_H_
diff --git a/mindspore/lite/tools/converter/quantizer/tensor_compressor.cc b/mindspore/lite/tools/converter/quantizer/tensor_compressor.cc
new file mode 100644
index 00000000000..94be42d80aa
--- /dev/null
+++ b/mindspore/lite/tools/converter/quantizer/tensor_compressor.cc
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tools/converter/quantizer/tensor_compressor.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace mindspore::lite::quant {
+namespace {
+constexpr size_t kBitNumPerByte = 8;
+}
+std::string TensorCompressor::BoolVectorToString(const std::vector<bool> &bool_vec) {
+  size_t size_in_byte = static_cast<size_t>(ceil(bool_vec.size() / kBitNumPerByte));
+  std::string str(size_in_byte, '\0');
+  auto iter = str.begin();
+  size_t shift = kBitNumPerByte;
+  for (bool bit : bool_vec) {
+    *iter |= bit << (shift - 1);
+    if (--shift == 0) {
+      iter++;
+      shift = kBitNumPerByte;
+    }
+  }
+  return str;
+}
+
+int TensorCompressor::DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input) {
+  if (bit_num > 0 && bit_num < k8Bit) {
+    std::vector<int8_t> origin_data(tensor_input->data.size());
+    auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int8_t), tensor_input->data.data(),
+                           tensor_input->data.size() * sizeof(uint8_t));
+    if (status != EOK) {
+      MS_LOG(ERROR) << tensor_input->name << " memcpy failed. " << status;
+      return RET_ERROR;
+    }
+    std::vector<uint8_t> pack_data{};
+    BitPack::BitPacking(bit_num, origin_data, &pack_data);
+    tensor_input->data.resize(pack_data.size() * sizeof(uint8_t));
+    status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
+                      pack_data.size() * sizeof(uint8_t));
+    if (status != EOK) {
+      MS_LOG(ERROR) << "memcpy_s failed. " << status;
+      return RET_ERROR;
+    }
+  } else if (bit_num > k8Bit && bit_num < k16Bit) {
+    auto shape_size =
+      std::accumulate(tensor_input->dims.begin(), tensor_input->dims.end(), size_t(1), std::multiplies<size_t>());
+    std::vector<int16_t> origin_data(shape_size);
+    auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int16_t), tensor_input->data.data(),
+                           tensor_input->data.size() * sizeof(uint8_t));
+    if (status != EOK) {
+      MS_LOG(ERROR) << "memcpy failed. " << status;
+      return RET_ERROR;
+    }
+    std::vector<uint16_t> pack_data{};
+    BitPack::BitPacking(bit_num, origin_data, &pack_data);
+    tensor_input->data.resize(pack_data.size() * sizeof(uint16_t));
+    status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
+                      pack_data.size() * sizeof(uint16_t));
+    if (status != EOK) {
+      MS_LOG(ERROR) << "memcpy_s failed. " << status;
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+}  // namespace mindspore::lite::quant
diff --git a/mindspore/lite/tools/converter/quantizer/tensor_compressor.h b/mindspore/lite/tools/converter/quantizer/tensor_compressor.h
new file mode 100644
index 00000000000..68ad61f6603
--- /dev/null
+++ b/mindspore/lite/tools/converter/quantizer/tensor_compressor.h
@@ -0,0 +1,257 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_TENSOR_COMPRESSOR_H_
+#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_TENSOR_COMPRESSOR_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "src/common/log_adapter.h"
+#include "schema/inner/model_generated.h"
+#include "tools/converter/quantizer/bitpacking.h"
+#include "include/errorcode.h"
+#include "tools/converter/quantizer/quant_params.h"
+
+namespace mindspore::lite::quant {
+class TensorCompressor {
+ public:
+  template <typename T>
+  bool PackRepetition(size_t bit_num, schema::TensorT *tensor) {
+    if (tensor->weightQuantCompressType != schema::WeightQuantCompressType_NONE) {
+      MS_LOG(INFO) << tensor->name << " is shared weight.";
+      return true;
+    }
+    auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
+    std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
+    auto elem_cnt = quant_data.size();
+    auto dims = tensor->dims;
+    size_t elem_cnt_by_dims = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<>());
+    if (elem_cnt != elem_cnt_by_dims) {
+      MS_LOG(ERROR) << tensor->name << " elem_cnt: " << elem_cnt << " not equal elem_cnt_by_dims: " << elem_cnt_by_dims;
+      return false;
+    }
+
+    auto &quant_params = tensor->quantParams;
+
+    std::set<T> quant_data_set;
+    for (auto quant_value : quant_data) {
+      quant_data_set.insert(quant_value);
+    }
+    std::map<T, size_t> unique_value_index_map;
+    auto index = 0;
+    for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
+      unique_value_index_map[*iter] = index++;
+    }
+
+    auto unique_value_cnt = quant_data_set.size();
+    size_t unique_value_bit = ceil(log2(unique_value_cnt));
+    auto pack_repetition_size_in_bit = bit_num + bit_num * unique_value_cnt + unique_value_bit * elem_cnt;
+    size_t pack_repetition_size_in_byte = ceil(1.0 * pack_repetition_size_in_bit / k8Bit);
+    size_t origin_size_in_byte = ceil(1.0 * bit_num * elem_cnt / k8Bit);
+
+    size_t coor_best_bit = 0;
+    auto nz_cnt = CalCoorBestBit(quant_data, elem_cnt, quant_params, unique_value_bit, &coor_best_bit);
+    // 1. coor_best_bit 2. nz_cnt 3. quant_data_set size 4. unique_values 5. unique_value indexing 6. nz values coord
+    const auto pack_sparsity_size_in_bit =
+      1 * k8Bit + 4 * k8Bit + bit_num + bit_num * unique_value_cnt + unique_value_bit * nz_cnt + nz_cnt * coor_best_bit;
+    size_t pack_sparsity_size_in_byte = ceil(1.0 * pack_sparsity_size_in_bit / k8Bit);
+    MS_LOG(DEBUG) << "coor_best_bit: " << coor_best_bit << " ori: " << origin_size_in_byte
+                  << " indexing: " << pack_repetition_size_in_byte << " sparse: " << pack_sparsity_size_in_byte;
+    auto min_byte_need = std::min({origin_size_in_byte, pack_repetition_size_in_byte, pack_sparsity_size_in_byte});
+    if (min_byte_need == origin_size_in_byte) {
+      return false;
+    } else if (min_byte_need == pack_repetition_size_in_byte) {
+      MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_repetition_size_in_byte;
+      return IndexingCompress(quant_data_set, unique_value_index_map, unique_value_bit, unique_value_cnt,
+                              pack_repetition_size_in_byte, bit_num, tensor);
+    } else if (min_byte_need == pack_sparsity_size_in_byte) {
+      MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_sparsity_size_in_byte;
+      return SparsityCompress(quant_data_set, unique_value_index_map, unique_value_bit, unique_value_cnt,
+                              pack_sparsity_size_in_byte, nz_cnt, coor_best_bit, bit_num, tensor);
+    } else {
+      MS_LOG(DEBUG) << "unexpected: " << min_byte_need << " not in {" << origin_size_in_byte << " "
+                    << pack_repetition_size_in_byte << " " << pack_sparsity_size_in_byte << "}";
+    }
+    return false;
+  }
+
+  int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input);
+
+ private:
+  template <typename T>
+  bool IndexingCompress(const std::set<T> &quant_data_set, const std::map<T, size_t> &unique_value_index_map,
+                        size_t unique_value_bit, size_t unique_value_cnt, size_t pack_repetition_size_in_byte,
+                        size_t bit_num, schema::TensorT *tensor) {
+    auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
+    std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
+
+    std::vector<bool> bits(pack_repetition_size_in_byte * k8Bit);
+    size_t index = 0;
+    // write unique_value_cnt: bit_num bit for unsigned
+    for (size_t i = 0; i < bit_num; i++) {
+      bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & (0x1);
+    }
+    // write the unique value set: each value has bit_num bit signed
+    for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
+      for (size_t i = 0; i < bit_num; i++) {
+        bits[index++] = ((*iter + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
+      }
+    }
+    // write the index: each index has unique_value_bit unsigned
+    for (auto quant_value : quant_data) {
+      for (size_t i = 0; i < unique_value_bit; i++) {
+        bits[index++] = (unique_value_index_map.at(quant_value) >> (unique_value_bit - i - 1)) & (0x1);
+      }
+    }
+    if (index > pack_repetition_size_in_byte * k8Bit) {
+      MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
+                    << pack_repetition_size_in_byte * k8Bit;
+      return false;
+    }
+    // update tensor data
+    auto new_data_str = BoolVectorToString(bits);
+    auto ret = memcpy_s(tensor->data.data(), tensor->data.size(), new_data_str.c_str(), new_data_str.size());
+    if (ret != EOK) {
+      MS_LOG(ERROR) << "memcpy error";
+      return false;
+    }
+    tensor->data.resize(new_data_str.size());
+
+    tensor->weightQuantCompressType = schema::WeightQuantCompressType_INDEXING;
+    MS_LOG(DEBUG) << "set WeightQuantCompressType_INDEXING";
+    return true;
+  }
+
+  template <typename T>
+  bool SparsityCompress(const std::set<T> &quant_data_set, const std::map<T, size_t> &unique_value_index_map,
+                        size_t unique_value_bit, size_t unique_value_cnt, size_t pack_sparsity_size_in_byte,
+                        size_t nz_cnt, size_t coor_best_bit, size_t bit_num, schema::TensorT *tensor) {
+    auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
+    std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
+    auto &quant_params = tensor->quantParams;
+    auto elem_cnt = quant_data.size();
+    auto channel_cnt = quant_params.size();
+    if (channel_cnt == 0) {
+      MS_LOG(ERROR) << "quant_params is empty.";
+      return false;
+    }
+    auto elem_perchannel = elem_cnt / channel_cnt;
+
+    std::vector<bool> bits(pack_sparsity_size_in_byte * k8Bit);
+    int index = 0;
+    // coor_best_bit
+    for (size_t i = 0; i < k8Bit; i++) {
+      bits[index++] = (coor_best_bit >> (k8Bit - i - 1)) & 0x1;
+    }
+    // nz_cnt
+    for (size_t i = 0; i < k32Bit; i++) {
+      bits[index++] = (nz_cnt >> (k32Bit - i - 1)) & 0x1;
+    }
+    // unique_value cnt
+    for (size_t i = 0; i < bit_num; i++) {
+      bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & 0x1;
+    }
+    // unique_values
+    for (auto unique_value : quant_data_set) {
+      for (size_t i = 0; i < bit_num; i++) {
+        bits[index++] = ((unique_value + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
+      }
+    }
+    // nz values indexing && get coor
+    std::vector<size_t> coors(nz_cnt);
+    size_t coors_index = 0;
+    size_t prev_index = -1;
+    for (size_t di = 0; di < elem_cnt; di++) {
+      auto cur_channel = di / elem_perchannel;
+      auto zp = quant_params[cur_channel]->zeroPoint;
+      auto nz_value = quant_data[di];
+      if (nz_value != zp || (di - prev_index) >= static_cast<size_t>((1 << coor_best_bit))) {
+        MS_ASSERT(coors_index < nz_cnt);
+        coors[coors_index++] = di - prev_index - 1;
+        prev_index = di;
+        for (size_t i = 0; i < unique_value_bit; i++) {
+          bits[index++] = (unique_value_index_map.at(nz_value) >> (unique_value_bit - i - 1)) & (0x1);
+        }
+      }
+    }
+    // write coor
+    for (auto coor : coors) {
+      for (size_t i = 0; i < coor_best_bit; i++) {
+        bits[index++] = (coor >> (coor_best_bit - i - 1)) & 0x1;
+      }
+    }
+    if ((unsigned int)index > pack_sparsity_size_in_byte * k8Bit) {
+      MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
+                    << pack_sparsity_size_in_byte * k8Bit;
+      return false;
+    }
+    auto new_data_str = BoolVectorToString(bits);
+    auto ret = memcpy_s(tensor->data.data(), tensor->data.size(), new_data_str.c_str(), new_data_str.size());
+    if (ret != EOK) {
+      MS_LOG(ERROR) << "memcpy error";
+      return false;
+    }
+    tensor->data.resize(new_data_str.size());
+
+    tensor->weightQuantCompressType = schema::WeightQuantCompressType_SPARSE;
+    MS_LOG(INFO) << "set WeightQuantCompressType_SPARSITY";
+    return true;
+  }
+
+  template <typename T>
+  size_t CalCoorBestBit(const std::vector<T> &quant_data, size_t elem_cnt,
+                        const std::vector<std::unique_ptr<schema::QuantParamT>> &quant_params, int unique_value_bit,
+                        size_t *coor_best_bit) {
+    MS_ASSERT(!quant_params.empty());
+    size_t best_nn_cnt = 0;
+    size_t min_len_in_bit = std::numeric_limits<size_t>::max();
+    for (size_t bit = k2Bit; bit <= k10Bit; bit++) {
+      // search
+      int nn_cnt = 0;
+      int prev_index = -1;
+      auto channel_cnt = quant_params.size();
+      MS_ASSERT(channel_cnt > 0);
+      auto elem_perchannel = elem_cnt / channel_cnt;
+      for (size_t i = 0; i < elem_cnt; i++) {
+        auto cur_channel = i / elem_perchannel;
+        auto zp = quant_params[cur_channel]->zeroPoint;
+        if (quant_data[i] != zp || (static_cast<int>(i) - prev_index) >= ((1 << bit))) {
+          nn_cnt++;
+          prev_index = i;
+        }
+      }
+
+      size_t len_in_bit = nn_cnt * bit + nn_cnt * unique_value_bit;
+      if (len_in_bit < min_len_in_bit) {
+        min_len_in_bit = len_in_bit;
+        *coor_best_bit = bit;
+        best_nn_cnt = nn_cnt;
+      }
+    }
+    return best_nn_cnt;
+  }
+
+  std::string BoolVectorToString(const std::vector<bool> &bool_vec);
+};
+}  // namespace mindspore::lite::quant
+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_TENSOR_COMPRESSOR_H_
diff --git a/mindspore/lite/tools/lite_exporter/anf_exporter.cc b/mindspore/lite/tools/lite_exporter/anf_exporter.cc
index 3af113665c2..271df20c186 100644
--- a/mindspore/lite/tools/lite_exporter/anf_exporter.cc
+++ b/mindspore/lite/tools/lite_exporter/anf_exporter.cc
@@ -47,6 +47,7 @@
 #include "tools/converter/converter_context.h"
 #include "tools/converter/quantizer/quantize_util.h"
 #include "tools/converter/quantizer/fse_encoder.h"
+#include "tools/converter/quantizer/tensor_compressor.h"
 #include "nnacl/op_base.h"
 
 using mindspore::ops::PrimitiveC;
@@ -131,14 +132,15 @@ static STATUS CompressTensor(schema::TensorT *tensor_input, const std::unique_pt
   if (dst_node->quantType != schema::QuantType_QUANT_WEIGHT) {
     return RET_OK;
   }
+  auto compressor = quant::TensorCompressor();
   if (bit_num == kBitNumMix) {
     tensor_input->quantParams.clear();
   } else if (bit_num == kBitNum8) {
-    (void)quant::PackRepetition<int8_t>(bit_num, tensor_input);
+    (void)compressor.PackRepetition<int8_t>(bit_num, tensor_input);
   } else if (bit_num == kBitNum16) {
-    (void)quant::PackRepetition<int16_t>(bit_num, tensor_input);
+    (void)compressor.PackRepetition<int16_t>(bit_num, tensor_input);
   } else {
-    auto status = quant::DoBitPack(bit_num, tensor_input);
+    auto status = compressor.DoBitPack(bit_num, tensor_input);
     if (status != RET_OK) {
       MS_LOG(ERROR) << "do bit pack failed. " << status;
       return RET_ERROR;
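
Usage sketch: a minimal caller driving the relocated API, mirroring the call sites in anf_exporter.cc and tensor_quant_pass.cc above. The CompressWeight() wrapper and its bit-width checks are hypothetical; only TensorCompressor::PackRepetition and TensorCompressor::DoBitPack come from this patch.

  // Sketch only, assuming a schema::TensorT weight already filled with quantized data.
  #include "tools/converter/quantizer/tensor_compressor.h"

  namespace mindspore::lite::quant {
  int CompressWeight(schema::TensorT *weight, size_t bit_num) {
    auto compressor = TensorCompressor();
    if (bit_num == k8Bit) {
      // 8-bit weights are stored as int8_t, so PackRepetition is instantiated with int8_t.
      (void)compressor.PackRepetition<int8_t>(bit_num, weight);
      return RET_OK;
    }
    if (bit_num == k16Bit) {
      (void)compressor.PackRepetition<int16_t>(bit_num, weight);
      return RET_OK;
    }
    // Any other width falls back to plain bit packing.
    return compressor.DoBitPack(bit_num, weight);
  }
  }  // namespace mindspore::lite::quant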