!38311 mv tensor compress to compressor
Merge pull request !38311 from yeyunpeng2020/master_compression
This commit is contained in:
commit
c9937a553f
|
@ -21,6 +21,7 @@
|
|||
#include <algorithm>
|
||||
#include "tools/converter/converter_context.h"
|
||||
#include "tools/converter/quantizer/quantize_util.h"
|
||||
#include "tools/converter/quantizer/tensor_compressor.h"
|
||||
#include "tools/common/tensor_util.h"
|
||||
#include "tools/common/graph_util.h"
|
||||
#include "tools/common/meta_graph_utils.h"
|
||||
|
@ -192,6 +193,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
|
|||
index++;
|
||||
continue;
|
||||
}
|
||||
auto compressor = quant::TensorCompressor();
|
||||
|
||||
if (tensor->quantParams.size() > 1) { // perchannel
|
||||
status = ComputeQuantTensorPerChannel(tensor.get(), index, *graph);
|
||||
|
@ -200,7 +202,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
|
|||
return RET_ERROR;
|
||||
}
|
||||
int bit_num = tensor->quantParams.front()->numBits;
|
||||
if (quant::DoBitPack(bit_num, tensor.get()) != RET_OK) {
|
||||
if (compressor.DoBitPack(bit_num, tensor.get()) != RET_OK) {
|
||||
MS_LOG(ERROR) << "bit pack failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -213,7 +215,7 @@ STATUS TensorQuantPass::Run(schema::MetaGraphT *graph) {
|
|||
quantParam->dstDtype == TypeId::kNumberTypeFloat32 || quantParam->dstDtype == TypeId::kNumberTypeFloat) {
|
||||
status = ComputeDataToInt8(tensor);
|
||||
int bit_num = tensor->quantParams.front()->numBits;
|
||||
if (quant::DoBitPack(bit_num, tensor.get()) != RET_OK) {
|
||||
if (compressor.DoBitPack(bit_num, tensor.get()) != RET_OK) {
|
||||
MS_LOG(ERROR) << "bit pack failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
|
|
@ -16,12 +16,15 @@
|
|||
|
||||
#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H_
|
||||
#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_BITPACKING_H_
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <stack>
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
#include "tools/converter/quantizer/quant_params.h"
|
||||
|
||||
using mindspore::lite::quant::k8Bit;
|
||||
namespace mindspore::lite {
|
||||
class BitPack {
|
||||
public:
|
||||
|
@ -36,8 +39,8 @@ class BitPack {
|
|||
DoBinary<T2>(bit_num, tmp, &bit_data_vec, packed_data_vec);
|
||||
}
|
||||
size_t remain_bit_data = bit_data_vec.size();
|
||||
if (sizeof(T1) * 8 > remain_bit_data && remain_bit_data > 0) {
|
||||
for (size_t i = 0; i < sizeof(T1) * 8 - remain_bit_data; i++) {
|
||||
if (sizeof(T1) * k8Bit > remain_bit_data && remain_bit_data > 0) {
|
||||
for (size_t i = 0; i < sizeof(T1) * k8Bit - remain_bit_data; i++) {
|
||||
bit_data_vec.push(false);
|
||||
}
|
||||
PackFromOriginToUint<T2>(&bit_data_vec, packed_data_vec);
|
||||
|
@ -49,7 +52,7 @@ class BitPack {
|
|||
static void PackFromOriginToUint(std::stack<bool> *ans, std::vector<T2> *packed_data_vec) {
|
||||
MS_ASSERT(ans != nullptr);
|
||||
uint32_t result = 0;
|
||||
for (size_t i = 0; i < sizeof(T2) * 8; i++) {
|
||||
for (size_t i = 0; i < sizeof(T2) * k8Bit; i++) {
|
||||
bool bit_tmp = ans->top();
|
||||
result = (result << 1) + static_cast<size_t>(bit_tmp);
|
||||
ans->pop();
|
||||
|
@ -64,7 +67,7 @@ class BitPack {
|
|||
bool a = n % 2;
|
||||
n = n / 2;
|
||||
ans->push(a);
|
||||
if (ans->size() == sizeof(T2) * 8) {
|
||||
if (ans->size() == sizeof(T2) * k8Bit) {
|
||||
PackFromOriginToUint(ans, packed_data_vec);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,24 @@
|
|||
#include <set>
|
||||
#include "schema/inner/model_generated.h"
|
||||
namespace mindspore::lite::quant {
|
||||
enum WeightQuantType {
|
||||
FIXED_BIT_PER_CHANNEL = 0,
|
||||
FIXED_BIT_PER_LAYER = 1,
|
||||
MIXED_BIT_PER_LAYER = 2,
|
||||
};
|
||||
constexpr size_t k2Bit = 2;
|
||||
constexpr size_t k8Bit = 8;
|
||||
constexpr size_t k10Bit = 10;
|
||||
constexpr size_t k16Bit = 16;
|
||||
constexpr size_t k32Bit = 32;
|
||||
constexpr size_t kMaxNum1024 = 1024;
|
||||
constexpr size_t kMillisecondsBase = 10;
|
||||
constexpr float kDelta = 0.1;
|
||||
constexpr float kRatio = 10.0;
|
||||
constexpr int kCpuBindMode = 1;
|
||||
constexpr int kPrimIndex = 0;
|
||||
constexpr int kPrimOffset = 1;
|
||||
|
||||
enum ActivationQuantizedMethod {
|
||||
MAX_MIN = 0,
|
||||
KL = 1,
|
||||
|
|
|
@ -49,7 +49,6 @@ constexpr int kLstmWeightShapeSize = 3;
|
|||
constexpr int kSingleDirBiasTensorSize = 4;
|
||||
constexpr int kLstmBiasShapeSize = 2;
|
||||
constexpr int kLstmBiasIndex = 3;
|
||||
constexpr size_t kBitNumPerByte = 8;
|
||||
constexpr size_t kGatherAxisIndex = 3;
|
||||
|
||||
int ComputeBiasDataAndQuantParam(const std::vector<double> &bias_scales, const std::vector<double> &input_scales,
|
||||
|
@ -595,21 +594,6 @@ bool CheckNodeInSet(const CNodePtr &cnode, const std::set<PrimitivePtr> &support
|
|||
return false;
|
||||
}
|
||||
|
||||
std::string BoolVectorToString(const std::vector<bool> &bool_vec) {
|
||||
size_t size_in_byte = static_cast<size_t>(ceil(bool_vec.size() / kBitNumPerByte));
|
||||
std::string str(size_in_byte, '\0');
|
||||
auto iter = str.begin();
|
||||
size_t shift = kBitNumPerByte;
|
||||
for (bool bit : bool_vec) {
|
||||
*iter |= bit << (shift - 1);
|
||||
if (--shift == 0) {
|
||||
iter++;
|
||||
shift = kBitNumPerByte;
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive) {
|
||||
CHECK_NULL_RETURN(bias);
|
||||
CHECK_NULL_RETURN(primitive);
|
||||
|
@ -703,47 +687,6 @@ int DeQuantData(const mindspore::MSTensor *tensor, std::vector<double> *dequant_
|
|||
tensor->QuantParams(), dequant_data, preferred_dim);
|
||||
}
|
||||
|
||||
int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input) {
|
||||
if (bit_num > 0 && bit_num < k8Bit) {
|
||||
std::vector<int8_t> origin_data(tensor_input->data.size());
|
||||
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int8_t), tensor_input->data.data(),
|
||||
tensor_input->data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << tensor_input->name << " memcpy failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<uint8_t> pack_data{};
|
||||
BitPack::BitPacking<int8_t, uint8_t>(bit_num, origin_data, &pack_data);
|
||||
tensor_input->data.resize(pack_data.size() * sizeof(uint8_t));
|
||||
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
|
||||
pack_data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy_s failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else if (bit_num > k8Bit && bit_num < k16Bit) {
|
||||
auto shape_size =
|
||||
std::accumulate(tensor_input->dims.begin(), tensor_input->dims.end(), size_t(1), std::multiplies<size_t>());
|
||||
std::vector<int16_t> origin_data(shape_size);
|
||||
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int16_t), tensor_input->data.data(),
|
||||
tensor_input->data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<uint16_t> pack_data{};
|
||||
BitPack::BitPacking<int16_t, uint16_t>(bit_num, origin_data, &pack_data);
|
||||
tensor_input->data.resize(pack_data.size() * sizeof(uint16_t));
|
||||
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
|
||||
pack_data.size() * sizeof(uint16_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy_s failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int GetElementNumFromShape(const std::vector<int> &dims, int *total_size) {
|
||||
CHECK_NULL_RETURN(total_size);
|
||||
*total_size = 1;
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
#include "ir/primitive.h"
|
||||
#include "abstract/dshape.h"
|
||||
#include "tools/converter/quantizer/huffman_encode.h"
|
||||
#include "tools/converter/quantizer/bitpacking.h"
|
||||
#include "tools/converter/quantizer/quant_params.h"
|
||||
#include "tools/converter/quantizer/mixed_bit_weight_quantizer.h"
|
||||
#include "src/runtime/lite_session.h"
|
||||
#include "tools/converter/graphdef_transform.h"
|
||||
|
@ -56,24 +56,6 @@
|
|||
#include "tools/common/string_util.h"
|
||||
|
||||
namespace mindspore::lite::quant {
|
||||
enum WeightQuantType {
|
||||
FIXED_BIT_PER_CHANNEL = 0,
|
||||
FIXED_BIT_PER_LAYER = 1,
|
||||
MIXED_BIT_PER_LAYER = 2,
|
||||
};
|
||||
constexpr size_t k2Bit = 2;
|
||||
constexpr size_t k8Bit = 8;
|
||||
constexpr size_t k10Bit = 10;
|
||||
constexpr size_t k16Bit = 16;
|
||||
constexpr size_t k32Bit = 32;
|
||||
constexpr size_t kMaxNum1024 = 1024;
|
||||
constexpr size_t kMillisecondsBase = 10;
|
||||
constexpr float kDelta = 0.1;
|
||||
constexpr float kRatio = 10.0;
|
||||
constexpr int kCpuBindMode = 1;
|
||||
constexpr int kPrimIndex = 0;
|
||||
constexpr int kPrimOffset = 1;
|
||||
|
||||
QuantParamHolderPtr GetCNodeQuantHolder(const PrimitivePtr &primitive);
|
||||
|
||||
QuantParamHolderPtr GetCNodeQuantHolder(const CNodePtr &cnode);
|
||||
|
@ -102,8 +84,6 @@ int DoParameterBiasQuant(const ParameterPtr &bias, const PrimitivePtr &primitive
|
|||
|
||||
int DeQuantData(const mindspore::MSTensor *tensor, std::vector<double> *dequant_data, int preferred_dim = 0);
|
||||
|
||||
int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input);
|
||||
|
||||
int GetQuantType(const CNodePtr &cnode);
|
||||
|
||||
template <typename T>
|
||||
|
@ -198,221 +178,9 @@ void GetLiteParameter(const AnfNodePtr &node, ParameterPtr *param_node, tensor::
|
|||
|
||||
bool CheckNodeInSet(const CNodePtr &cnode, const std::set<PrimitivePtr> &support_primitive_types);
|
||||
|
||||
std::string BoolVectorToString(const std::vector<bool> &bool_vec);
|
||||
|
||||
int GetElementNumFromShape(const std::vector<int> &dims, int *total_size);
|
||||
|
||||
int GetBucketAllIndex(const std::vector<int> &dims, int preferred_dim,
|
||||
std::vector<std::vector<int>> *buckets_data_index);
|
||||
|
||||
template <typename T>
|
||||
bool IndexingCompress(const std::set<T> &quant_data_set, const std::map<T, size_t> &unique_value_index_map,
|
||||
size_t unique_value_bit, size_t unique_value_cnt, size_t pack_repetition_size_in_byte,
|
||||
size_t bit_num, schema::TensorT *tensor) {
|
||||
auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
|
||||
std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
|
||||
|
||||
std::vector<bool> bits(pack_repetition_size_in_byte * k8Bit);
|
||||
size_t index = 0;
|
||||
// write unique_value_cnt: bit_num bit for unsigned
|
||||
for (size_t i = 0; i < bit_num; i++) {
|
||||
bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & (0x1);
|
||||
}
|
||||
// write the unique value set: each value has bit_num bit signed
|
||||
for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
|
||||
for (size_t i = 0; i < bit_num; i++) {
|
||||
bits[index++] = ((*iter + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
|
||||
}
|
||||
}
|
||||
// write the index: each index has unique_value_bit unsigned
|
||||
for (auto quant_value : quant_data) {
|
||||
for (size_t i = 0; i < unique_value_bit; i++) {
|
||||
bits[index++] = (unique_value_index_map.at(quant_value) >> (unique_value_bit - i - 1)) & (0x1);
|
||||
}
|
||||
}
|
||||
if (index > pack_repetition_size_in_byte * k8Bit) {
|
||||
MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
|
||||
<< pack_repetition_size_in_byte * k8Bit;
|
||||
return false;
|
||||
}
|
||||
// update tensor data
|
||||
auto new_data_str = BoolVectorToString(bits);
|
||||
auto ret = memcpy_s(tensor->data.data(), tensor->data.size(), new_data_str.c_str(), new_data_str.size());
|
||||
if (ret != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy error";
|
||||
return false;
|
||||
}
|
||||
tensor->data.resize(new_data_str.size());
|
||||
|
||||
tensor->weightQuantCompressType = schema::WeightQuantCompressType_INDEXING;
|
||||
MS_LOG(DEBUG) << "set WeightQuantCompressType_INDEXING";
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool SparsityCompress(const std::set<T> &quant_data_set, const std::map<T, size_t> &unique_value_index_map,
|
||||
size_t unique_value_bit, size_t unique_value_cnt, size_t pack_sparsity_size_in_byte,
|
||||
size_t nz_cnt, size_t coor_best_bit, size_t bit_num, schema::TensorT *tensor) {
|
||||
auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
|
||||
std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
|
||||
auto &quant_params = tensor->quantParams;
|
||||
auto elem_cnt = quant_data.size();
|
||||
auto channel_cnt = quant_params.size();
|
||||
MS_CHECK_TRUE_MSG(channel_cnt != 0, false, "div zero.");
|
||||
auto elem_perchannel = elem_cnt / channel_cnt;
|
||||
|
||||
std::vector<bool> bits(pack_sparsity_size_in_byte * k8Bit);
|
||||
int index = 0;
|
||||
// coor_best_bit
|
||||
for (size_t i = 0; i < k8Bit; i++) {
|
||||
bits[index++] = (coor_best_bit >> (k8Bit - i - 1)) & 0x1;
|
||||
}
|
||||
// nz_cnt
|
||||
for (size_t i = 0; i < k32Bit; i++) {
|
||||
bits[index++] = (nz_cnt >> (k32Bit - i - 1)) & 0x1;
|
||||
}
|
||||
// unique_value cnt
|
||||
for (size_t i = 0; i < bit_num; i++) {
|
||||
bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & 0x1;
|
||||
}
|
||||
// unique_values
|
||||
for (auto unique_value : quant_data_set) {
|
||||
for (size_t i = 0; i < bit_num; i++) {
|
||||
bits[index++] = ((unique_value + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
|
||||
}
|
||||
}
|
||||
// nz values indexing && get coor
|
||||
std::vector<size_t> coors(nz_cnt);
|
||||
size_t coors_index = 0;
|
||||
size_t prev_index = -1;
|
||||
for (size_t di = 0; di < elem_cnt; di++) {
|
||||
auto cur_channel = di / elem_perchannel;
|
||||
auto zp = quant_params[cur_channel]->zeroPoint;
|
||||
auto nz_value = quant_data[di];
|
||||
if (nz_value != zp || (di - prev_index) >= static_cast<size_t>((1 << coor_best_bit))) {
|
||||
MS_ASSERT(coors_index < nz_cnt);
|
||||
coors[coors_index++] = di - prev_index - 1;
|
||||
prev_index = di;
|
||||
for (size_t i = 0; i < unique_value_bit; i++) {
|
||||
bits[index++] = (unique_value_index_map.at(nz_value) >> (unique_value_bit - i - 1)) & (0x1);
|
||||
}
|
||||
}
|
||||
}
|
||||
// write coor
|
||||
for (auto coor : coors) {
|
||||
for (size_t i = 0; i < coor_best_bit; i++) {
|
||||
bits[index++] = (coor >> (coor_best_bit - i - 1)) & 0x1;
|
||||
}
|
||||
}
|
||||
if ((unsigned int)index > pack_sparsity_size_in_byte * k8Bit) {
|
||||
MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
|
||||
<< pack_sparsity_size_in_byte * k8Bit;
|
||||
return false;
|
||||
}
|
||||
auto new_data_str = BoolVectorToString(bits);
|
||||
auto ret = memcpy_s(tensor->data.data(), tensor->data.size(), new_data_str.c_str(), new_data_str.size());
|
||||
if (ret != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy error";
|
||||
return false;
|
||||
}
|
||||
tensor->data.resize(new_data_str.size());
|
||||
|
||||
tensor->weightQuantCompressType = schema::WeightQuantCompressType_SPARSE;
|
||||
MS_LOG(INFO) << "set WeightQuantCompressType_SPARSITY";
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t CalCoorBestBit(const std::vector<T> &quant_data, size_t elem_cnt,
|
||||
const std::vector<std::unique_ptr<schema::QuantParamT>> &quant_params, int unique_value_bit,
|
||||
size_t *coor_best_bit) {
|
||||
MS_ASSERT(!quant_params.empty());
|
||||
size_t best_nn_cnt = 0;
|
||||
size_t min_len_in_bit = std::numeric_limits<size_t>::max();
|
||||
for (size_t bit = k2Bit; bit <= k10Bit; bit++) {
|
||||
// search
|
||||
int nn_cnt = 0;
|
||||
int prev_index = -1;
|
||||
auto channel_cnt = quant_params.size();
|
||||
MS_ASSERT(channel_cnt > 0);
|
||||
auto elem_perchannel = elem_cnt / channel_cnt;
|
||||
for (size_t i = 0; i < elem_cnt; i++) {
|
||||
auto cur_channel = i / elem_perchannel;
|
||||
auto zp = quant_params[cur_channel]->zeroPoint;
|
||||
if (quant_data[i] != zp || (static_cast<int>(i) - prev_index) >= ((1 << bit))) {
|
||||
nn_cnt++;
|
||||
prev_index = i;
|
||||
}
|
||||
}
|
||||
|
||||
size_t len_in_bit = nn_cnt * bit + nn_cnt * unique_value_bit;
|
||||
if (len_in_bit < min_len_in_bit) {
|
||||
min_len_in_bit = len_in_bit;
|
||||
*coor_best_bit = bit;
|
||||
best_nn_cnt = nn_cnt;
|
||||
}
|
||||
}
|
||||
return best_nn_cnt;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool PackRepetition(size_t bit_num, schema::TensorT *tensor) {
|
||||
if (tensor->weightQuantCompressType != schema::WeightQuantCompressType_NONE) {
|
||||
MS_LOG(INFO) << tensor->name << " is shared weight.";
|
||||
return true;
|
||||
}
|
||||
auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
|
||||
std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
|
||||
auto elem_cnt = quant_data.size();
|
||||
auto dims = tensor->dims;
|
||||
size_t elem_cnt_by_dims = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<>());
|
||||
if (elem_cnt != elem_cnt_by_dims) {
|
||||
MS_LOG(ERROR) << tensor->name << " elem_cnt: " << elem_cnt << " not equal elem_cnt_by_dims: " << elem_cnt_by_dims;
|
||||
return false;
|
||||
}
|
||||
|
||||
auto &quant_params = tensor->quantParams;
|
||||
|
||||
std::set<T> quant_data_set;
|
||||
for (auto quant_value : quant_data) {
|
||||
quant_data_set.insert(quant_value);
|
||||
}
|
||||
std::map<T, size_t> unique_value_index_map;
|
||||
auto index = 0;
|
||||
for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
|
||||
unique_value_index_map[*iter] = index++;
|
||||
}
|
||||
|
||||
auto unique_value_cnt = quant_data_set.size();
|
||||
size_t unique_value_bit = ceil(log2(unique_value_cnt));
|
||||
auto pack_repetition_size_in_bit = bit_num + bit_num * unique_value_cnt + unique_value_bit * elem_cnt;
|
||||
size_t pack_repetition_size_in_byte = ceil(1.0 * pack_repetition_size_in_bit / k8Bit);
|
||||
size_t origin_size_in_byte = ceil(1.0 * bit_num * elem_cnt / k8Bit);
|
||||
|
||||
size_t coor_best_bit = 0;
|
||||
auto nz_cnt = CalCoorBestBit<T>(quant_data, elem_cnt, quant_params, unique_value_bit, &coor_best_bit);
|
||||
// 1. coor_best_bit 2. nz_cnt 3. quant_data_set size 4. unique_values 5. unique_value indexing 6. nz values coord
|
||||
const auto pack_sparsity_size_in_bit =
|
||||
1 * k8Bit + 4 * k8Bit + bit_num + bit_num * unique_value_cnt + unique_value_bit * nz_cnt + nz_cnt * coor_best_bit;
|
||||
size_t pack_sparsity_size_in_byte = ceil(1.0 * pack_sparsity_size_in_bit / k8Bit);
|
||||
MS_LOG(DEBUG) << "coor_best_bit: " << coor_best_bit << " ori: " << origin_size_in_byte
|
||||
<< " indexing: " << pack_repetition_size_in_byte << " sparse: " << pack_sparsity_size_in_byte;
|
||||
auto min_byte_need = std::min({origin_size_in_byte, pack_repetition_size_in_byte, pack_sparsity_size_in_byte});
|
||||
if (min_byte_need == origin_size_in_byte) {
|
||||
return false;
|
||||
} else if (min_byte_need == pack_repetition_size_in_byte) {
|
||||
MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_repetition_size_in_byte;
|
||||
return IndexingCompress<T>(quant_data_set, unique_value_index_map, unique_value_bit, unique_value_cnt,
|
||||
pack_repetition_size_in_byte, bit_num, tensor);
|
||||
} else if (min_byte_need == pack_sparsity_size_in_byte) {
|
||||
MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_sparsity_size_in_byte;
|
||||
return SparsityCompress<T>(quant_data_set, unique_value_index_map, unique_value_bit, unique_value_cnt,
|
||||
pack_sparsity_size_in_byte, nz_cnt, coor_best_bit, bit_num, tensor);
|
||||
} else {
|
||||
MS_LOG(DEBUG) << "unexpected: " << min_byte_need << " not in {" << origin_size_in_byte << " "
|
||||
<< pack_repetition_size_in_byte << " " << pack_sparsity_size_in_byte << "}";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
} // namespace mindspore::lite::quant
|
||||
#endif // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_QUANTIZE_UTIL_H_
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tools/converter/quantizer/tensor_compressor.h"
|
||||
#include <memory>
|
||||
#include <numeric>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
|
||||
namespace mindspore::lite::quant {
|
||||
namespace {
|
||||
constexpr size_t kBitNumPerByte = 8;
|
||||
}
|
||||
std::string TensorCompressor::BoolVectorToString(const std::vector<bool> &bool_vec) {
|
||||
size_t size_in_byte = static_cast<size_t>(ceil(bool_vec.size() / kBitNumPerByte));
|
||||
std::string str(size_in_byte, '\0');
|
||||
auto iter = str.begin();
|
||||
size_t shift = kBitNumPerByte;
|
||||
for (bool bit : bool_vec) {
|
||||
*iter |= bit << (shift - 1);
|
||||
if (--shift == 0) {
|
||||
iter++;
|
||||
shift = kBitNumPerByte;
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
int TensorCompressor::DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input) {
|
||||
if (bit_num > 0 && bit_num < k8Bit) {
|
||||
std::vector<int8_t> origin_data(tensor_input->data.size());
|
||||
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int8_t), tensor_input->data.data(),
|
||||
tensor_input->data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << tensor_input->name << " memcpy failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<uint8_t> pack_data{};
|
||||
BitPack::BitPacking<int8_t, uint8_t>(bit_num, origin_data, &pack_data);
|
||||
tensor_input->data.resize(pack_data.size() * sizeof(uint8_t));
|
||||
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
|
||||
pack_data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy_s failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else if (bit_num > k8Bit && bit_num < k16Bit) {
|
||||
auto shape_size =
|
||||
std::accumulate(tensor_input->dims.begin(), tensor_input->dims.end(), size_t(1), std::multiplies<size_t>());
|
||||
std::vector<int16_t> origin_data(shape_size);
|
||||
auto status = memcpy_s(origin_data.data(), origin_data.size() * sizeof(int16_t), tensor_input->data.data(),
|
||||
tensor_input->data.size() * sizeof(uint8_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<uint16_t> pack_data{};
|
||||
BitPack::BitPacking<int16_t, uint16_t>(bit_num, origin_data, &pack_data);
|
||||
tensor_input->data.resize(pack_data.size() * sizeof(uint16_t));
|
||||
status = memcpy_s(tensor_input->data.data(), tensor_input->data.size() * sizeof(uint8_t), pack_data.data(),
|
||||
pack_data.size() * sizeof(uint16_t));
|
||||
if (status != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy_s failed. " << status;
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore::lite::quant
|
|
@ -0,0 +1,257 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_TENSOR_COMPRESSOR_H_
|
||||
#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_TENSOR_COMPRESSOR_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
#include <limits>
|
||||
#include <functional>
|
||||
#include <algorithm>
|
||||
#include "src/common/log_adapter.h"
|
||||
#include "schema/inner/model_generated.h"
|
||||
#include "tools/converter/quantizer/bitpacking.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "tools/converter/quantizer/quant_params.h"
|
||||
|
||||
namespace mindspore::lite::quant {
|
||||
class TensorCompressor {
|
||||
public:
|
||||
template <typename T>
|
||||
bool PackRepetition(size_t bit_num, schema::TensorT *tensor) {
|
||||
if (tensor->weightQuantCompressType != schema::WeightQuantCompressType_NONE) {
|
||||
MS_LOG(INFO) << tensor->name << " is shared weight.";
|
||||
return true;
|
||||
}
|
||||
auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
|
||||
std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
|
||||
auto elem_cnt = quant_data.size();
|
||||
auto dims = tensor->dims;
|
||||
size_t elem_cnt_by_dims = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<>());
|
||||
if (elem_cnt != elem_cnt_by_dims) {
|
||||
MS_LOG(ERROR) << tensor->name << " elem_cnt: " << elem_cnt << " not equal elem_cnt_by_dims: " << elem_cnt_by_dims;
|
||||
return false;
|
||||
}
|
||||
|
||||
auto &quant_params = tensor->quantParams;
|
||||
|
||||
std::set<T> quant_data_set;
|
||||
for (auto quant_value : quant_data) {
|
||||
quant_data_set.insert(quant_value);
|
||||
}
|
||||
std::map<T, size_t> unique_value_index_map;
|
||||
auto index = 0;
|
||||
for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
|
||||
unique_value_index_map[*iter] = index++;
|
||||
}
|
||||
|
||||
auto unique_value_cnt = quant_data_set.size();
|
||||
size_t unique_value_bit = ceil(log2(unique_value_cnt));
|
||||
auto pack_repetition_size_in_bit = bit_num + bit_num * unique_value_cnt + unique_value_bit * elem_cnt;
|
||||
size_t pack_repetition_size_in_byte = ceil(1.0 * pack_repetition_size_in_bit / k8Bit);
|
||||
size_t origin_size_in_byte = ceil(1.0 * bit_num * elem_cnt / k8Bit);
|
||||
|
||||
size_t coor_best_bit = 0;
|
||||
auto nz_cnt = CalCoorBestBit<T>(quant_data, elem_cnt, quant_params, unique_value_bit, &coor_best_bit);
|
||||
// 1. coor_best_bit 2. nz_cnt 3. quant_data_set size 4. unique_values 5. unique_value indexing 6. nz values coord
|
||||
const auto pack_sparsity_size_in_bit =
|
||||
1 * k8Bit + 4 * k8Bit + bit_num + bit_num * unique_value_cnt + unique_value_bit * nz_cnt + nz_cnt * coor_best_bit;
|
||||
size_t pack_sparsity_size_in_byte = ceil(1.0 * pack_sparsity_size_in_bit / k8Bit);
|
||||
MS_LOG(DEBUG) << "coor_best_bit: " << coor_best_bit << " ori: " << origin_size_in_byte
|
||||
<< " indexing: " << pack_repetition_size_in_byte << " sparse: " << pack_sparsity_size_in_byte;
|
||||
auto min_byte_need = std::min({origin_size_in_byte, pack_repetition_size_in_byte, pack_sparsity_size_in_byte});
|
||||
if (min_byte_need == origin_size_in_byte) {
|
||||
return false;
|
||||
} else if (min_byte_need == pack_repetition_size_in_byte) {
|
||||
MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_repetition_size_in_byte;
|
||||
return IndexingCompress<T>(quant_data_set, unique_value_index_map, unique_value_bit, unique_value_cnt,
|
||||
pack_repetition_size_in_byte, bit_num, tensor);
|
||||
} else if (min_byte_need == pack_sparsity_size_in_byte) {
|
||||
MS_LOG(DEBUG) << "from " << origin_size_in_byte << " to " << pack_sparsity_size_in_byte;
|
||||
return SparsityCompress<T>(quant_data_set, unique_value_index_map, unique_value_bit, unique_value_cnt,
|
||||
pack_sparsity_size_in_byte, nz_cnt, coor_best_bit, bit_num, tensor);
|
||||
} else {
|
||||
MS_LOG(DEBUG) << "unexpected: " << min_byte_need << " not in {" << origin_size_in_byte << " "
|
||||
<< pack_repetition_size_in_byte << " " << pack_sparsity_size_in_byte << "}";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int DoBitPack(const size_t &bit_num, schema::TensorT *tensor_input);
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
bool IndexingCompress(const std::set<T> &quant_data_set, const std::map<T, size_t> &unique_value_index_map,
|
||||
size_t unique_value_bit, size_t unique_value_cnt, size_t pack_repetition_size_in_byte,
|
||||
size_t bit_num, schema::TensorT *tensor) {
|
||||
auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
|
||||
std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
|
||||
|
||||
std::vector<bool> bits(pack_repetition_size_in_byte * k8Bit);
|
||||
size_t index = 0;
|
||||
// write unique_value_cnt: bit_num bit for unsigned
|
||||
for (size_t i = 0; i < bit_num; i++) {
|
||||
bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & (0x1);
|
||||
}
|
||||
// write the unique value set: each value has bit_num bit signed
|
||||
for (auto iter = quant_data_set.cbegin(); iter != quant_data_set.cend(); ++iter) {
|
||||
for (size_t i = 0; i < bit_num; i++) {
|
||||
bits[index++] = ((*iter + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
|
||||
}
|
||||
}
|
||||
// write the index: each index has unique_value_bit unsigned
|
||||
for (auto quant_value : quant_data) {
|
||||
for (size_t i = 0; i < unique_value_bit; i++) {
|
||||
bits[index++] = (unique_value_index_map.at(quant_value) >> (unique_value_bit - i - 1)) & (0x1);
|
||||
}
|
||||
}
|
||||
if (index > pack_repetition_size_in_byte * k8Bit) {
|
||||
MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
|
||||
<< pack_repetition_size_in_byte * k8Bit;
|
||||
return false;
|
||||
}
|
||||
// update tensor data
|
||||
auto new_data_str = BoolVectorToString(bits);
|
||||
auto ret = memcpy_s(tensor->data.data(), tensor->data.size(), new_data_str.c_str(), new_data_str.size());
|
||||
if (ret != EOK) {
|
||||
MS_LOG(ERROR) << "memcpy error";
|
||||
return false;
|
||||
}
|
||||
tensor->data.resize(new_data_str.size());
|
||||
|
||||
tensor->weightQuantCompressType = schema::WeightQuantCompressType_INDEXING;
|
||||
MS_LOG(DEBUG) << "set WeightQuantCompressType_INDEXING";
|
||||
return true;
|
||||
}
|
||||
|
||||
// Packs a quantized weight tensor into the SPARSE compression format, in place.
//
// Bitstream layout written into `bits` (MSB-first within each field):
//   [ 8 bits ] coor_best_bit            - coordinate gap width chosen by CalCoorBestBit
//   [32 bits ] nz_cnt                   - number of explicitly stored (non-zero-point) elements
//   [bit_num ] unique_value_cnt         - size of the unique-value table
//   [bit_num per entry] unique values   - each stored biased by +2^(bit_num-1) to make it unsigned
//   [unique_value_bit per nz] indices   - table index of each stored element
//   [coor_best_bit per nz] coordinates  - gap (minus one) to the previous stored element
//
// Returns false if quant params are missing, the buffer would overflow, or the
// final copy back into the tensor fails; true on success (and tags the tensor
// with WeightQuantCompressType_SPARSE).
template <typename T>
bool SparsityCompress(const std::set<T> &quant_data_set, const std::map<T, size_t> &unique_value_index_map,
                      size_t unique_value_bit, size_t unique_value_cnt, size_t pack_sparsity_size_in_byte,
                      size_t nz_cnt, size_t coor_best_bit, size_t bit_num, schema::TensorT *tensor) {
  // Reinterpret the raw tensor bytes as quantized values of type T.
  auto quant_data_array = reinterpret_cast<T *>(tensor->data.data());
  std::vector<T> quant_data(quant_data_array, quant_data_array + tensor->data.size() / sizeof(T));
  auto &quant_params = tensor->quantParams;
  auto elem_cnt = quant_data.size();
  auto channel_cnt = quant_params.size();
  if (channel_cnt == 0) {
    MS_LOG(ERROR) << "quant_params is empty.";
    return false;
  }
  // Per-channel quantization: elements are laid out channel-major, so the
  // channel of element i is i / elem_perchannel.
  auto elem_perchannel = elem_cnt / channel_cnt;

  std::vector<bool> bits(pack_sparsity_size_in_byte * k8Bit);
  int index = 0;
  // Header field 1: coor_best_bit in 8 bits, MSB first.
  for (size_t i = 0; i < k8Bit; i++) {
    bits[index++] = (coor_best_bit >> (k8Bit - i - 1)) & 0x1;
  }
  // Header field 2: nz_cnt in 32 bits.
  for (size_t i = 0; i < k32Bit; i++) {
    bits[index++] = (nz_cnt >> (k32Bit - i - 1)) & 0x1;
  }
  // Header field 3: unique-value count in bit_num bits.
  for (size_t i = 0; i < bit_num; i++) {
    bits[index++] = (unique_value_cnt >> (bit_num - i - 1)) & 0x1;
  }
  // Unique-value table: each signed value is biased by +2^(bit_num-1) so it
  // serializes as an unsigned bit_num-bit field.
  for (auto unique_value : quant_data_set) {
    for (size_t i = 0; i < bit_num; i++) {
      bits[index++] = ((unique_value + (1 << (bit_num - 1))) >> (bit_num - i - 1)) & (0x1);
    }
  }
  // Walk the data once: emit the table index of every element that must be
  // stored explicitly, and record its coordinate gap for the trailer below.
  std::vector<size_t> coors(nz_cnt);
  size_t coors_index = 0;
  // Deliberately -1 as size_t (wraps to SIZE_MAX): unsigned wraparound makes
  // (di - prev_index - 1) equal di for the first stored element.
  size_t prev_index = -1;
  for (size_t di = 0; di < elem_cnt; di++) {
    auto cur_channel = di / elem_perchannel;
    auto zp = quant_params[cur_channel]->zeroPoint;
    auto nz_value = quant_data[di];
    // Store the element if it differs from its channel's zero point, or if the
    // gap since the last stored element no longer fits in coor_best_bit bits
    // (a forced placeholder keeps gaps representable).
    if (nz_value != zp || (di - prev_index) >= static_cast<size_t>((1 << coor_best_bit))) {
      MS_ASSERT(coors_index < nz_cnt);
      coors[coors_index++] = di - prev_index - 1;
      prev_index = di;
      for (size_t i = 0; i < unique_value_bit; i++) {
        bits[index++] = (unique_value_index_map.at(nz_value) >> (unique_value_bit - i - 1)) & (0x1);
      }
    }
  }
  // Trailer: the coordinate gaps, each in coor_best_bit bits.
  for (auto coor : coors) {
    for (size_t i = 0; i < coor_best_bit; i++) {
      bits[index++] = (coor >> (coor_best_bit - i - 1)) & 0x1;
    }
  }
  // Sanity check: the caller-provided byte budget must not have been exceeded.
  if ((unsigned int)index > pack_sparsity_size_in_byte * k8Bit) {
    MS_LOG(ERROR) << "unexpected index: " << index << " should not be greater than "
                  << pack_sparsity_size_in_byte * k8Bit;
    return false;
  }
  // Serialize the bit vector and shrink the tensor to the packed size.
  auto new_data_str = BoolVectorToString(bits);
  auto ret = memcpy_s(tensor->data.data(), tensor->data.size(), new_data_str.c_str(), new_data_str.size());
  if (ret != EOK) {
    MS_LOG(ERROR) << "memcpy error";
    return false;
  }
  tensor->data.resize(new_data_str.size());

  tensor->weightQuantCompressType = schema::WeightQuantCompressType_SPARSE;
  MS_LOG(INFO) << "set WeightQuantCompressType_SPARSITY";
  return true;
}
|
||||
|
||||
template <typename T>
|
||||
size_t CalCoorBestBit(const std::vector<T> &quant_data, size_t elem_cnt,
|
||||
const std::vector<std::unique_ptr<schema::QuantParamT>> &quant_params, int unique_value_bit,
|
||||
size_t *coor_best_bit) {
|
||||
MS_ASSERT(!quant_params.empty());
|
||||
size_t best_nn_cnt = 0;
|
||||
size_t min_len_in_bit = std::numeric_limits<size_t>::max();
|
||||
for (size_t bit = k2Bit; bit <= k10Bit; bit++) {
|
||||
// search
|
||||
int nn_cnt = 0;
|
||||
int prev_index = -1;
|
||||
auto channel_cnt = quant_params.size();
|
||||
MS_ASSERT(channel_cnt > 0);
|
||||
auto elem_perchannel = elem_cnt / channel_cnt;
|
||||
for (size_t i = 0; i < elem_cnt; i++) {
|
||||
auto cur_channel = i / elem_perchannel;
|
||||
auto zp = quant_params[cur_channel]->zeroPoint;
|
||||
if (quant_data[i] != zp || (static_cast<int>(i) - prev_index) >= ((1 << bit))) {
|
||||
nn_cnt++;
|
||||
prev_index = i;
|
||||
}
|
||||
}
|
||||
|
||||
size_t len_in_bit = nn_cnt * bit + nn_cnt * unique_value_bit;
|
||||
if (len_in_bit < min_len_in_bit) {
|
||||
min_len_in_bit = len_in_bit;
|
||||
*coor_best_bit = bit;
|
||||
best_nn_cnt = nn_cnt;
|
||||
}
|
||||
}
|
||||
return best_nn_cnt;
|
||||
}
|
||||
|
||||
std::string BoolVectorToString(const std::vector<bool> &bool_vec);
|
||||
};
|
||||
} // namespace mindspore::lite::quant
|
||||
#endif // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_TENSOR_COMPRESSOR_H_
|
|
@ -47,6 +47,7 @@
|
|||
#include "tools/converter/converter_context.h"
|
||||
#include "tools/converter/quantizer/quantize_util.h"
|
||||
#include "tools/converter/quantizer/fse_encoder.h"
|
||||
#include "tools/converter/quantizer/tensor_compressor.h"
|
||||
#include "nnacl/op_base.h"
|
||||
|
||||
using mindspore::ops::PrimitiveC;
|
||||
|
@ -131,14 +132,15 @@ static STATUS CompressTensor(schema::TensorT *tensor_input, const std::unique_pt
|
|||
if (dst_node->quantType != schema::QuantType_QUANT_WEIGHT) {
|
||||
return RET_OK;
|
||||
}
|
||||
auto compressor = quant::TensorCompressor();
|
||||
if (bit_num == kBitNumMix) {
|
||||
tensor_input->quantParams.clear();
|
||||
} else if (bit_num == kBitNum8) {
|
||||
(void)quant::PackRepetition<int8_t>(bit_num, tensor_input);
|
||||
(void)compressor.PackRepetition<int8_t>(bit_num, tensor_input);
|
||||
} else if (bit_num == kBitNum16) {
|
||||
(void)quant::PackRepetition<int16_t>(bit_num, tensor_input);
|
||||
(void)compressor.PackRepetition<int16_t>(bit_num, tensor_input);
|
||||
} else {
|
||||
auto status = quant::DoBitPack(bit_num, tensor_input);
|
||||
auto status = compressor.DoBitPack(bit_num, tensor_input);
|
||||
if (status != RET_OK) {
|
||||
MS_LOG(ERROR) << "do bit pack failed. " << status;
|
||||
return RET_ERROR;
|
||||
|
|
Loading…
Reference in New Issue