From 6fd806b621fac75ee6fca237f346d6a6c9e3839f Mon Sep 17 00:00:00 2001
From: yeyunpeng2020
Date: Thu, 20 Jan 2022 09:38:25 +0800
Subject: [PATCH] dynamic quant: support symmetric only, support debug,
 support skipping nodes, and fix non-const input handling

---
 .../cpu/nnacl/int8/matmul_int8.c              | 14 ++--
 .../cpu/nnacl/int8/matmul_int8.h              |  6 +-
 .../kernel/arm/int8/matmul_dynamic_int8.cc    | 70 ++++++++++++-------
 .../runtime/kernel/arm/int8/matmul_int8.cc    |  4 ++
 .../converter/quantizer/dynamic_quantizer.cc  |  3 +-
 .../quantizer/insert_quant_node_manager.cc    | 58 ++++++++++-----
 .../quantizer/insert_quant_node_manager.h     |  9 ++-
 .../converter/quantizer/weight_quantizer.cc   |  5 ++
 8 files changed, 114 insertions(+), 55 deletions(-)

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c
index fb80c4cd018..e0a7fd15a2c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c
@@ -331,29 +331,31 @@ void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int c
 }
 #endif
 
-void DynamicMatmulInt8Opt(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col, int deep16,
-                          float input_scale, int input_zp, const float *filter_scale, size_t stride) {
+void DynamicMatmulInt8AIWI(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col,
+                           int deep16, float input_scale, const float *filter_scale, size_t stride,
+                           bool filter_per_channel) {
   /* *
   * row4x16-major * row16x4-major => (int8)row-major
-  * support activation per-layer asymmetric && weight per-channel symmetric
+  * support activation per-layer symmetric && weight per-layer/per-channel symmetric
   * */
   for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int r4div = r / C4NUM, r4mod = r % C4NUM;
      int c4div = c / C4NUM, c4mod = c % C4NUM;
-      size_t ci = r * stride + c;
+      int filter_quant_index = filter_per_channel ? c : 0;
+      double multi_scale = input_scale * filter_scale[filter_quant_index];
      double value = 0;
      for (int d = 0; d < deep16; d++) {
        int d16div = d / C16NUM, d16mod = d % C16NUM;
        size_t ai = r4div * deep16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod;
        size_t bi = c4div * deep16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod;
        int32_t value_1 = a[ai] * b[bi];
-        int32_t value_3 = input_zp * b[bi];
-        value += input_scale * filter_scale[c] * (value_1 - value_3);
+        value += multi_scale * value_1;
      }
      if (bias != NULL) {
        value += bias[c];
      }
+      size_t ci = r * stride + c;
      dst[ci] = value;
    }
  }
 }
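With both activations and weights quantized symmetrically (zero points fixed at 0), all zero-point correction terms drop out of the kernel above, and each int32 accumulation is simply rescaled by input_scale * filter_scale[c] (or one per-layer scale). A minimal sketch restating that math in plain row-major form, without the row4x16/row16x4 tiling; the names here are illustrative, not part of the patch:

#include <cstdint>

// Plain row-major reference of DynamicMatmulInt8AIWI's math (illustrative only).
// a: row x deep int8 activations, b: deep x col int8 weights, dst: row x col floats.
void DynamicMatmulInt8Ref(const int8_t *a, const int8_t *b, const float *bias, float *dst,
                          int row, int col, int deep, float input_scale,
                          const float *filter_scale, bool filter_per_channel) {
  for (int r = 0; r < row; r++) {
    for (int c = 0; c < col; c++) {
      int32_t acc = 0;
      for (int d = 0; d < deep; d++) {
        // Both sides symmetric (zero point 0), so no zero-point correction terms.
        acc += static_cast<int32_t>(a[r * deep + d]) * static_cast<int32_t>(b[d * col + c]);
      }
      float scale = input_scale * filter_scale[filter_per_channel ? c : 0];
      float value = scale * static_cast<float>(acc);
      dst[r * col + c] = (bias != nullptr) ? value + bias[c] : value;
    }
  }
}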
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h
index 47502347b22..97b8870f4a7 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h
@@ -46,8 +46,10 @@ void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int c
                    const int32_t *left_shift, const int32_t *right_shift, size_t stride, size_t filter_peroc,
                    const int32_t *filter_zp);
 
-void DynamicMatmulInt8Opt(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col, int deep16,
-                          float input_scale, int input_zp, const float *filter_scale, size_t stride);
+void DynamicMatmulInt8AIWI(const int8_t *a, const int8_t *b, const float *bias, float *dst, int row, int col,
+                           int deep16, float input_scale, const float *filter_scale, size_t stride,
+                           bool filter_per_channel);
+
 /* 8x4 4x8 -> 8x8 */
 /* optimize conv */
 void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);

diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_dynamic_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_dynamic_int8.cc
index aabe35a1c32..1025a647f10 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_dynamic_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_dynamic_int8.cc
@@ -50,9 +50,9 @@ int MatmulDynamicInt8CPUKernel::RunImpl(int task_id) {
   if (cur_oc <= 0) {
     return RET_OK;
   }
-  DynamicMatmulInt8Opt(pack_a_ptr_, batch_b_ptr_ + cur_stride * param_->deep_align_, fp32_bias_ptr_,
-                       batch_c_ptr_ + cur_stride, param_->row_, cur_oc, param_->deep_align_, quant_param_->input_scale_,
-                       quant_param_->input_zp_, quant_param_->filter_scale_, param_->col_);
+  DynamicMatmulInt8AIWI(pack_a_ptr_, batch_b_ptr_ + cur_stride * param_->deep_align_, fp32_bias_ptr_,
+                        batch_c_ptr_ + cur_stride, param_->row_, cur_oc, param_->deep_align_,
+                        quant_param_->input_scale_, quant_param_->filter_scale_, param_->col_, filter_per_channel_);
   return RET_OK;
 }
 
@@ -77,33 +77,47 @@ void MatmulDynamicInt8CPUKernel::FreeQuantParam() {
 }
 
 int MatmulDynamicInt8CPUKernel::MallocQuantParam() {
-  auto weight_tensor = in_tensors_.at(kWeightIndex);
-  auto weight_quant_params = weight_tensor->quant_params();
-  auto w_shape = weight_tensor->shape();
-  MS_CHECK_TRUE_MSG(weight_tensor->shape().size() >= DIMENSION_2D, lite::RET_ERROR, "weight dims should >=2");
-  int col = param_->b_transpose_ ? w_shape[w_shape.size() - kSize2] : w_shape[w_shape.size() - kSize1];
-  filter_per_channel_ = (weight_quant_params.size() > 1);
-  channel_num_ = filter_per_channel_ ? col : 1;
-
   quant_param_ = reinterpret_cast<MatmulQuantParameter *>(malloc(sizeof(MatmulQuantParameter)));
   if (quant_param_ == nullptr) {
     MS_LOG(ERROR) << "Malloc MatmulDynamicQuantParameter for Matmul int8 op failed!";
     return RET_ERROR;
   }
   memset(quant_param_, 0, sizeof(MatmulQuantParameter));
+  return RET_OK;
+}
+
+int MatmulDynamicInt8CPUKernel::InitFilterQuantParam() {
+  if (quant_param_->filter_scale_ != nullptr) {
+    free(quant_param_->filter_scale_);
+    quant_param_->filter_scale_ = nullptr;
+  }
+  if (quant_param_->filter_zp_ != nullptr) {
+    free(quant_param_->filter_zp_);
+    quant_param_->filter_zp_ = nullptr;
+  }
+
+  auto weight_tensor = in_tensors_.at(kWeightIndex);
+  auto weight_quant_params = weight_tensor->quant_params();
+  auto w_shape = weight_tensor->shape();
+  if (w_shape.size() < DIMENSION_2D) {
+    MS_LOG(ERROR) << weight_tensor->tensor_name() << " dims < 2.";
+    return RET_ERROR;
+  }
+  int col = param_->b_transpose_ ? w_shape[w_shape.size() - kSize2] : w_shape[w_shape.size() - kSize1];
+  filter_per_channel_ = (weight_quant_params.size() > 1);
+  channel_num_ = filter_per_channel_ ? col : 1;
+  if (static_cast<int>(weight_quant_params.size()) != channel_num_) {
+    MS_LOG(ERROR) << weight_tensor->tensor_name() << " quant params size:" << weight_quant_params.size()
+                  << " != channel_num_:" << channel_num_;
+    return RET_ERROR;
+  }
   quant_param_->filter_scale_ = reinterpret_cast<float *>(malloc(channel_num_ * sizeof(float)));
   CHECK_NULL_RETURN(quant_param_->filter_scale_);
   memset(quant_param_->filter_scale_, 0, channel_num_ * sizeof(float));
   quant_param_->filter_zp_ = reinterpret_cast<int32_t *>(malloc(channel_num_ * sizeof(int32_t)));
   CHECK_NULL_RETURN(quant_param_->filter_zp_);
   memset(quant_param_->filter_zp_, 0, channel_num_ * sizeof(int32_t));
-  return RET_OK;
-}
 
-int MatmulDynamicInt8CPUKernel::InitFilterQuantParam() {
-  auto weight_tensor = in_tensors_.at(kWeightIndex);
-  auto weight_quant_params = weight_tensor->quant_params();
-  MS_CHECK_TRUE_RET(static_cast<int>(weight_quant_params.size()) == channel_num_, RET_ERROR);
   for (int i = 0; i < channel_num_; i++) {
     quant_param_->filter_scale_[i] = static_cast<float>(weight_quant_params[i].scale);
     quant_param_->filter_zp_[i] = weight_quant_params[i].zeroPoint;
@@ -212,7 +226,7 @@ int MatmulDynamicInt8CPUKernel::TransferB() {
       b_pack_func_(current_weight, current_b_pack, param_->deep_, param_->col_);
     }
   }
-  return RET_ERROR;
+  return RET_OK;
 }
 
 int MatmulDynamicInt8CPUKernel::InitTmpBuffer() {
@@ -252,25 +266,23 @@ int MatmulDynamicInt8CPUKernel::Prepare() {
   CHECK_LESS_RETURN(in_tensors_.size(), kMinInputSize);
   CHECK_LESS_RETURN(out_tensors_.size(), kOutputSize);
   InitParameter();
-
   auto ret = MallocQuantParam();
   if (ret != RET_OK) {
     FreeQuantParam();
     return ret;
   }
-
-  ret = InitFilterQuantParam();
-  if (ret != RET_OK) {
-    FreeQuantParam();
-    return ret;
+  if (param_->b_const_) {
+    ret = InitFilterQuantParam();
+    if (ret != RET_OK) {
+      FreeQuantParam();
+      return ret;
+    }
   }
-
   ret = CopyBias();
   if (ret != RET_OK) {
     FreeQuantParam();
     return ret;
   }
-
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -313,6 +325,12 @@ int MatmulDynamicInt8CPUKernel::Run() {
     return ret;
   }
   if (!param_->b_const_) {
+    ret = InitFilterQuantParam();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Init filter quant param failed.";
+      FreeQuantParam();
+      return ret;
+    }
     ret = TransferB();
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "TransferB failed.";
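The reworked InitFilterQuantParam above re-reads the weight tensor's quantization parameters on every call, which is what lets Run() refresh them for a non-const B. The number of scales it expects follows directly from the weight shape; a minimal, self-contained sketch of that rule, with illustrative names:

#include <cstdio>
#include <vector>

// How InitFilterQuantParam sizes its quant-param arrays: per-channel quantization
// needs one scale per output column of B; per-layer quantization needs exactly one.
int ChannelNum(const std::vector<int> &w_shape, bool b_transpose, bool per_channel) {
  size_t dims = w_shape.size();  // the caller has already checked dims >= 2
  int col = b_transpose ? w_shape[dims - 2] : w_shape[dims - 1];
  return per_channel ? col : 1;
}

int main() {
  std::printf("%d\n", ChannelNum({128, 256}, false, true));   // 256 scales
  std::printf("%d\n", ChannelNum({128, 256}, false, false));  // 1 scale
  return 0;
}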
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
index e71c4f8a880..80092146e15 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
@@ -80,6 +80,10 @@ kernel::InnerKernel *MatmulInt8CPUKernelCreator(const std::vector<lite::Tensor *
                                          static_cast<const lite::InnerContext *>(ctx));
   } else if (parameter->quant_type_ == schema::QuantType_QUANT_DYNAMIC) {
+    if (inputs.front()->IsConst()) {
+      MS_LOG(ERROR) << "kernel: " << parameter->name_ << " does not support a const A input.";
+      return nullptr;
+    }
     kernel = new (std::nothrow)
       MatmulDynamicInt8CPUKernel(parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
   } else {

diff --git a/mindspore/lite/tools/converter/quantizer/dynamic_quantizer.cc b/mindspore/lite/tools/converter/quantizer/dynamic_quantizer.cc
index 4dc8dff1e40..863abb9a9db 100644
--- a/mindspore/lite/tools/converter/quantizer/dynamic_quantizer.cc
+++ b/mindspore/lite/tools/converter/quantizer/dynamic_quantizer.cc
@@ -23,7 +23,6 @@ int DynamicQuantizer::DoQuantize(FuncGraphPtr func_graph) {
   // Dynamic quant does not support filters.
   flags_.commonQuantParam.min_quant_weight_channel = 0;
   flags_.commonQuantParam.min_quant_weight_size = 0;
-  flags_.commonQuantParam.skip_quant_node.clear();
   auto quantizer = WeightQuantizer(flags_);
   const std::set<PrimitivePtr> support_weight_quant_nodes = {prim::kPrimMatMulFusion, prim::kPrimGather};
   const std::set<PrimitivePtr> symmetric_nodes = {prim::kPrimMatMulFusion};
@@ -36,7 +35,7 @@ int DynamicQuantizer::DoQuantize(FuncGraphPtr func_graph) {
   const std::set<PrimitivePtr> support_dynamic_quant_ops = {
     prim::kPrimMatMulFusion,
   };
-  ret = manager.InsertDynamicQuantNode(func_graph, support_dynamic_quant_ops);
+  ret = manager.InsertDynamicQuantNode(func_graph, support_dynamic_quant_ops, flags_.commonQuantParam.skip_quant_node);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Insert dynamic quant failed.";
     return ret;
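dynamic_quantizer.cc now forwards skip_quant_node instead of clearing it, so user-listed nodes are excluded from both weight quantization and dynamic-quant insertion. A minimal model of the name-based lookup both passes perform (names and the example node names are illustrative):

#include <iostream>
#include <set>
#include <string>

// Nodes whose full name appears in skip_quant_node bypass quantization and stay float32.
bool ShouldSkipQuant(const std::set<std::string> &skip_quant_node, const std::string &op_name) {
  return skip_quant_node.find(op_name) != skip_quant_node.end();
}

int main() {
  const std::set<std::string> skip = {"Default/MatMulFusion-op7"};
  std::cout << std::boolalpha << ShouldSkipQuant(skip, "Default/MatMulFusion-op7") << "\n";  // true
  std::cout << ShouldSkipQuant(skip, "Default/Gather-op3") << "\n";                          // false
  return 0;
}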
diff --git a/mindspore/lite/tools/converter/quantizer/insert_quant_node_manager.cc b/mindspore/lite/tools/converter/quantizer/insert_quant_node_manager.cc
index 126ff62aad2..7310a589a81 100644
--- a/mindspore/lite/tools/converter/quantizer/insert_quant_node_manager.cc
+++ b/mindspore/lite/tools/converter/quantizer/insert_quant_node_manager.cc
@@ -18,18 +18,22 @@
 #include <memory>
 #include <set>
 #include <vector>
+#include <string>
 #include "ops/quant_dtype_cast.h"
-#include "ops/dynamic_quant.h"
 #include "tools/optimizer/common/gllo_utils.h"
 #include "tools/optimizer/common/format_utils.h"
 #include "tools/common/node_util.h"
 
 namespace mindspore::lite::quant {
-ValueNodePtr InsertQuantNodeManager::NewQuantCastValueNode(int src_type, int dst_type_,
+namespace {
+constexpr size_t kMinSize3 = 3;
+constexpr size_t kPrimitiveCOffset = 1;
+}  // namespace
+ValueNodePtr InsertQuantNodeManager::NewQuantCastValueNode(int src_type, int dst_type,
                                                            const std::vector<schema::QuantParamT> &quant_params) {
   auto prim_c = std::make_shared<ops::QuantDTypeCast>();
   MS_CHECK_TRUE_MSG(prim_c != nullptr, nullptr, "prim_c is nullptr.");
-  prim_c->Init(src_type, dst_type_);
+  prim_c->Init(src_type, dst_type);
   auto quant_params_holder = std::make_shared<QuantParamHolder>(quant_params.size(), quant_params.size());
   MS_CHECK_TRUE_MSG(quant_params_holder != nullptr, nullptr, "quant_params_holder is nullptr.");
   quant_params_holder->set_quant_type(schema::QuantType_QUANT_ALL);
@@ -158,17 +162,14 @@ int InsertQuantNodeManager::InsertQuantDtypeCastNode(const FuncGraphPtr &graph)
   return RET_OK;
 }
 
-int InsertQuantNodeManager::NewDynamicQuantNode(const FuncGraphPtr &graph, const CNodePtr &cnode) {
+int InsertQuantNodeManager::InsertDynamicQuantWithIndex(const FuncGraphPtr &graph, const CNodePtr &cnode,
+                                                        size_t index) {
   auto primitive_c = std::make_shared<ops::DynamicQuant>();
   primitive_c->set_dst_type(dst_type_);
   primitive_c->set_symmetric(symmetric_);
-  auto op_name = cnode->fullname_with_scope();
-  if (cnode->size() <= kInputSize1) {
-    MS_LOG(ERROR) << op_name << " cnode size <= 2.";
-    return RET_ERROR;
-  }
-  auto dynamic_quant_cnode = graph->NewCNode(primitive_c, {cnode->input(1)});
-  dynamic_quant_cnode->set_fullname_with_scope(cnode->fullname_with_scope() + "_dynamic_cast_node");
+  auto dynamic_quant_cnode = graph->NewCNode(primitive_c, {cnode->input(index)});
+  auto name = cnode->fullname_with_scope() + "_dynamic_cast_node_" + std::to_string(index);
+  dynamic_quant_cnode->set_fullname_with_scope(name);
   CHECK_NULL_RETURN(cnode->abstract());
   auto abstract = cnode->abstract()->Clone();
   if (abstract == nullptr) {
@@ -182,7 +183,24 @@
     return ret;
   }
   MarkDynamicQuantize(dynamic_quant_cnode);
-  cnode->set_input(1, dynamic_quant_cnode);
+  cnode->set_input(index, dynamic_quant_cnode);
+  return RET_OK;
+}
+
+int InsertQuantNodeManager::NewDynamicQuantNode(const FuncGraphPtr &graph, const CNodePtr &cnode) {
+  auto op_name = cnode->fullname_with_scope();
+  if (cnode->size() < kMinSize3) {
+    MS_LOG(ERROR) << op_name << " cnode size:" << cnode->size() << " < 3.";
+    return RET_ERROR;
+  }
+  auto input = cnode->input(kInputIndex + kPrimitiveCOffset);
+  if (input->isa<CNode>() || IsGraphInput(input)) {
+    auto ret = InsertDynamicQuantWithIndex(graph, cnode, kInputIndex + kPrimitiveCOffset);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << op_name << " insert dynamic quant for input failed.";
+      return ret;
+    }
+  }
+  auto weight = cnode->input(kWeightIndex + kPrimitiveCOffset);
+  if (weight->isa<CNode>() || IsGraphInput(weight)) {
+    auto ret = InsertDynamicQuantWithIndex(graph, cnode, kWeightIndex + kPrimitiveCOffset);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << op_name << " insert dynamic quant for weight failed.";
+      return ret;
+    }
+  }
   return RET_OK;
 }
 
@@ -199,10 +217,16 @@ int InsertQuantNodeManager::MarkDynamicQuantize(const CNodePtr &cnode) {
 }
 
 int InsertQuantNodeManager::InsertDynamicQuantNode(const FuncGraphPtr &graph,
-                                                   const std::set<PrimitivePtr> &support_dynamic_quant_ops) {
+                                                   const std::set<PrimitivePtr> &support_dynamic_quant_ops,
+                                                   const std::set<std::string> &skip_quant_node) {
   MS_ASSERT(graph != nullptr);
   auto cnodes = graph->GetOrderedCnodes();
   for (auto &cnode : cnodes) {
+    auto op_name = cnode->fullname_with_scope();
+    if (skip_quant_node.find(op_name) != skip_quant_node.end()) {
+      MS_LOG(INFO) << op_name << " is in skip_quant_node, skip dynamic quant.";
+      continue;
+    }
     auto ret = CheckDataType(cnode, kNumberTypeFloat32);
     if (ret == RET_NO_CHANGE) {
       continue;
     }
     auto is_support_node = CheckNodeInSet(cnode, support_dynamic_quant_ops);
     if (!is_support_node) {
       auto type = NodePrimitiveType(cnode);
-      MS_LOG(INFO) << "node:" << cnode->fullname_with_scope() << " type:" << type << " will not quantify.";
+      MS_LOG(INFO) << "node:" << op_name << " type:" << type << " will not be quantized.";
       continue;
     }
     ret = NewDynamicQuantNode(graph, cnode);
     if (ret != RET_OK) {
-      MS_LOG(ERROR) << "node:" << cnode->fullname_with_scope() << " new dynamic quant node failed.";
+      MS_LOG(ERROR) << "node:" << op_name << " new dynamic quant node failed.";
       return ret;
     }
     ret = MarkDynamicQuantize(cnode);
     if (ret != RET_OK) {
-      MS_LOG(ERROR) << "node:" << cnode->fullname_with_scope() << " new mark dynamic quant node failed.";
+      MS_LOG(ERROR) << "node:" << op_name << " mark dynamic quant node failed.";
       return ret;
     }
     ret = UpdateDataType(cnode, kNumberTypeFloat32);
     if (ret != RET_OK) {
-      MS_LOG(ERROR) << "node:" << cnode->fullname_with_scope() << " update datatype failed.";
+      MS_LOG(ERROR) << "node:" << op_name << " update datatype failed.";
       return ret;
     }
   }
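Taken together, InsertDynamicQuantNode and NewDynamicQuantNode rewrite the graph so that every runtime-produced float input of a supported MatMul gets its own DynamicQuant node, while const inputs are left to the offline weight quantizer. A toy model of that edge rewrite, using stand-in types instead of MindSpore's real IR classes:

#include <memory>
#include <string>
#include <vector>

// Toy stand-in for an IR node (real code works on CNodePtr/AnfNodePtr).
struct Node {
  std::string name;
  bool is_const;
  std::vector<std::shared_ptr<Node>> inputs;
};

// Splice a DynamicQuant node in front of every input that is produced at runtime;
// constant inputs were already quantized offline by the weight quantizer.
void InsertDynamicQuant(const std::shared_ptr<Node> &matmul) {
  for (size_t i = 0; i < matmul->inputs.size(); ++i) {
    auto &in = matmul->inputs[i];
    if (in->is_const) {
      continue;
    }
    auto dq = std::make_shared<Node>(
      Node{matmul->name + "_dynamic_cast_node_" + std::to_string(i), false, {in}});
    in = dq;  // the MatMul edge now consumes the int8 output of DynamicQuant
  }
}

int main() {
  auto act = std::make_shared<Node>(Node{"graph_input", false, {}});
  auto weight = std::make_shared<Node>(Node{"weight", true, {}});
  auto matmul = std::make_shared<Node>(Node{"MatMulFusion-op7", false, {act, weight}});
  InsertDynamicQuant(matmul);  // only the activation edge gets a DynamicQuant node
  return 0;
}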
diff --git a/mindspore/lite/tools/converter/quantizer/insert_quant_node_manager.h b/mindspore/lite/tools/converter/quantizer/insert_quant_node_manager.h
index c6727d77f9b..ee03039a94f 100644
--- a/mindspore/lite/tools/converter/quantizer/insert_quant_node_manager.h
+++ b/mindspore/lite/tools/converter/quantizer/insert_quant_node_manager.h
@@ -18,11 +18,13 @@
 #define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_INSERT_QUANT_NODE_MANAGER_H
 
 #include <set>
 #include <vector>
+#include <string>
 #include "include/errorcode.h"
 #include "ir/anf.h"
 #include "ir/dtype/type_id.h"
 #include "ir/func_graph.h"
 #include "tools/converter/quantizer/quantize_util.h"
+#include "ops/dynamic_quant.h"
 
 namespace mindspore::lite::quant {
 class InsertQuantNodeManager {
@@ -33,7 +35,8 @@ class InsertQuantNodeManager {
 
   int InsertQuantDtypeCastNode(const FuncGraphPtr &graph);
 
-  int InsertDynamicQuantNode(const FuncGraphPtr &graph, const std::set<PrimitivePtr> &support_dynamic_quant_ops);
+  int InsertDynamicQuantNode(const FuncGraphPtr &graph, const std::set<PrimitivePtr> &support_dynamic_quant_ops,
+                             const std::set<std::string> &skip_quant_node);
 
  private:
   ValueNodePtr NewQuantCastValueNode(int src_type, int dst_type, const std::vector<schema::QuantParamT> &quant_params);
@@ -46,9 +49,11 @@ class InsertQuantNodeManager {
 
   int MarkDynamicQuantize(const CNodePtr &cnode);
 
+  int InsertDynamicQuantWithIndex(const FuncGraphPtr &graph, const CNodePtr &cnode, size_t index);
+
  private:
   TypeId dst_type_ = kNumberTypeInt8;
-  bool symmetric_ = false;
+  bool symmetric_ = true;
 };
 }  // namespace mindspore::lite::quant
 #endif  // MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_INSERT_QUANT_NODE_MANAGER_H

diff --git a/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc b/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc
index 614ab0d88d6..b311d26a86b 100644
--- a/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc
+++ b/mindspore/lite/tools/converter/quantizer/weight_quantizer.cc
@@ -41,6 +41,11 @@ int WeightQuantizer::WeightQuant(const FuncGraphPtr &func_graph,
       MS_LOG(DEBUG) << cnode->fullname_with_scope() << " : primitive is nullptr";
       continue;
     }
+    auto op_name = cnode->fullname_with_scope();
+    if (flags_.commonQuantParam.skip_quant_node.find(op_name) != flags_.commonQuantParam.skip_quant_node.end()) {
+      MS_LOG(INFO) << op_name << " is in skip_quant_node, skip weight quant.";
+      continue;
+    }
     if (!CheckNodeInSet(cnode, support_weight_quant_types)) {
       MS_LOG(INFO) << cnode->fullname_with_scope() << " of type: " << primitive->name() << " does not need weight quant.";
       continue;
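The header change flips symmetric_ to true by default, matching the kernel above, which assumes zero points of 0. For reference, a simplified sketch of symmetric per-tensor int8 quantization of the kind the inserted DynamicQuant op performs at runtime; this is an assumed simplification, not the runtime's actual code:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Symmetric per-tensor quantization: scale from the absolute maximum, zero point pinned at 0.
float ComputeSymmetricScale(const std::vector<float> &x) {
  float abs_max = 0.0f;
  for (float v : x) {
    abs_max = std::max(abs_max, std::fabs(v));
  }
  return abs_max > 0.0f ? abs_max / 127.0f : 1.0f;  // map [-abs_max, abs_max] onto [-127, 127]
}

std::vector<int8_t> QuantizeSymmetric(const std::vector<float> &x, float scale) {
  std::vector<int8_t> q(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    int v = static_cast<int>(std::round(x[i] / scale));
    q[i] = static_cast<int8_t>(std::min(127, std::max(-127, v)));
  }
  return q;
}

int main() {
  const std::vector<float> act = {0.5f, -1.25f, 2.0f};
  const float scale = ComputeSymmetricScale(act);  // 2.0f / 127
  const auto q = QuantizeSymmetric(act, scale);    // roughly {32, -79, 127}
  (void)q;
  return 0;
}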