pack optimize

wangpingan 2023-02-17 17:17:48 +08:00 committed by wangpingan2
parent 985c48e543
commit ac7b243a5f
17 changed files with 686 additions and 3 deletions

View File

@@ -142,6 +142,7 @@ set(LITE_SRC
${CMAKE_CURRENT_SOURCE_DIR}/litert/sub_graph_kernel.cc
${CMAKE_CURRENT_SOURCE_DIR}/litert/scheduler.cc
${CMAKE_CURRENT_SOURCE_DIR}/litert/lite_session.cc
${CMAKE_CURRENT_SOURCE_DIR}/litert/runtime_packed_node_pass.cc
${CMAKE_CURRENT_SOURCE_DIR}/litert/model_manager.cc
${CMAKE_CURRENT_SOURCE_DIR}/errorcode.cc
${CMAKE_CURRENT_SOURCE_DIR}/litert/cpu_info.cc

View File

@@ -125,7 +125,7 @@ void MatmulDynamicBaseInt8CPUKernel::FreeTmpBuffer() {
free(pack_a_ptr_);
pack_a_ptr_ = nullptr;
}
if (pack_b_ptr_ != nullptr) {
if (pack_b_ptr_ != nullptr && !weight_is_packed_) {
free(pack_b_ptr_);
pack_b_ptr_ = nullptr;
}
@@ -133,7 +133,7 @@ void MatmulDynamicBaseInt8CPUKernel::FreeTmpBuffer() {
free(input_sums_);
input_sums_ = nullptr;
}
if (weight_sums_ != nullptr) {
if (weight_sums_ != nullptr && !weight_is_packed_) {
free(weight_sums_);
weight_sums_ = nullptr;
}
@@ -162,6 +162,12 @@ int MatmulDynamicBaseInt8CPUKernel::InitInputQuantParam() {
}
int MatmulDynamicBaseInt8CPUKernel::TransferB() {
if (weight_is_packed_) {
CHECK_NULL_RETURN(weight_sums_tensor_);
pack_b_ptr_ = static_cast<int8_t *>(in_tensors_.at(kWeightIndex)->data());
weight_sums_ = static_cast<int *>(weight_sums_tensor_->data());
return RET_OK;
}
auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->data());
CHECK_NULL_RETURN(weight_data);
for (int i = 0; i < b_batch_; i++) {
@@ -177,6 +183,7 @@ int MatmulDynamicBaseInt8CPUKernel::TransferB() {
CalcWeightSums(current_weight, param_->deep_, param_->col_, current_sums, RowMajor);
}
}
return RET_OK;
}
@@ -205,6 +212,10 @@ int MatmulDynamicBaseInt8CPUKernel::InitMatrixABuffer() {
}
int MatmulDynamicBaseInt8CPUKernel::InitMatrixBBuffer() {
if (weight_is_packed_) {
return RET_OK;
}
if (pack_b_ptr_ != nullptr) {
free(pack_b_ptr_);
pack_b_ptr_ = nullptr;

View File

@@ -42,6 +42,12 @@ class MatmulDynamicBaseInt8CPUKernel : public LiteKernel {
static int InitBroadcastParams(const std::vector<int> &a_shape_const, const std::vector<int> &b_shape_const,
MatMulParameter *params, std::vector<int> *a_offsets, std::vector<int> *b_offsets);
const int8_t *GetPackBPtr() const { return pack_b_ptr_; }
const int *GetWeightSums() const { return weight_sums_; }
int GetBBatch() const { return b_batch_; }
void SetWeightIsPacked(bool weight_is_packed) { this->weight_is_packed_ = weight_is_packed; }
void SetWeightSumsTensor(lite::Tensor *weight_sums_tensor) { this->weight_sums_tensor_ = weight_sums_tensor; }
private:
void ResizeMatrixBParameter();
int CopyBias();
@@ -90,6 +96,8 @@ class MatmulDynamicBaseInt8CPUKernel : public LiteKernel {
int thread_stride_ = 0;
bool enable_fp16_ = false;
PackFunc b_pack_func_ = nullptr;
bool weight_is_packed_ = false;
lite::Tensor *weight_sums_tensor_ = nullptr;
};
} // namespace mindspore::kernel
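
A note on the hand-off these setters enable: the runtime pass (below) calls them before Prepare(). A minimal sketch of that call order, with matmul_kernel and sums_tensor as placeholder names, not part of the diff:

// Hand an offline-packed weight to the kernel before Prepare():
matmul_kernel->SetWeightIsPacked(true);           // TransferB() will alias, not repack
matmul_kernel->SetWeightSumsTensor(sums_tensor);  // int32 sums computed at convert time
// TransferB() then points pack_b_ptr_ at the weight tensor's data and
// weight_sums_ at sums_tensor's data; the new !weight_is_packed_ guards in
// FreeTmpBuffer() keep the kernel from freeing memory it does not own.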

View File

@@ -62,6 +62,7 @@
#include "kernel/ascend/plugin/ascend_kernel_plugin.h"
#endif
#include "thread/parallel_thread_pool_manager.h"
#include "src/litert/runtime_packed_node_pass.h"
using AbstractBaseModel = mindspore::infer::AbstractBaseModel;
@@ -585,6 +586,8 @@ int LiteSession::CompileGraph(Model *model) {
InitGraphInputTensors(model);
InitGraphOutputTensors(model);
PackedNodePass::GetInstance().Run(model, tensors_);
// scheduler kernels
Scheduler scheduler(context_.get(), ms_context_, model, &tensors_, &inputs_, &outputs_, is_train_session_,
&is_infershape_, &is_control_flow_, &infer_along_running_, execution_plan_, delegate_,
@@ -698,6 +701,11 @@ int LiteSession::PrepareKernels(const Model *model) {
return RET_ERROR;
}
for (auto &node : subgraph_kernel->nodes()) {
ret = PackKernelExec(node, tensors_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Pack KernelExec failed.";
return ret;
}
ret = node->Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "node: " << node->name() << " prepare failed.";

View File

@@ -0,0 +1,261 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/litert/runtime_packed_node_pass.h"
#include "nnacl/op_base.h"
#include "src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.h"
using RecoveryWeightFunc = void (*)(void *, void *, int, int, bool);
namespace mindspore {
namespace {
constexpr size_t kFlatbuffersBuilderInitSize = 1024;
constexpr auto kActivationType = "activation_type";
constexpr auto kTransposeA = "transpose_a";
constexpr auto kTransposeB = "transpose_b";
constexpr auto kArm64SimdDot = "ARM64SIMD_DOT";
} // namespace
namespace lite {
PackedNodePass::~PackedNodePass() {
for (auto &pack_info : node_pack_info_map_) {
delete pack_info.second;
}
node_pack_info_map_.clear();
}
void PackedNodePass::Run(Model *model, const std::vector<Tensor *> &tensors) {
for (auto &node : model->graph_.all_nodes_) {
MS_ASSERT(node != nullptr);
if (node->node_type_ != schema::PrimitiveType_Custom) {
continue;
}
auto *primitive = reinterpret_cast<const schema::Primitive *>(node->primitive_);
if (primitive == nullptr) {
MS_LOG(ERROR) << "Op " << node->name_ << " should exist in model!";
return;
}
auto custom = primitive->value_as_Custom();
if (custom == nullptr || custom->type() == nullptr) {
MS_LOG(ERROR) << "Custom node is nullptr";
return;
}
auto custom_type = custom->type()->str();
if (custom_type != "MatmulFusionPacked") {
continue;
}
flatbuffers::FlatBufferBuilder fbb(kFlatbuffersBuilderInitSize);
auto custom_attr = custom->attr();
std::map<std::string, std::string> attr_map;
for (size_t i = 0; i < custom_attr->size(); ++i) {
auto attr = custom_attr->Get(i);
auto attr_key = attr->name()->str();
auto data_bytes = attr->data();
int data_size = static_cast<int>(data_bytes->size());
std::string attr_value;
for (int j = 0; j < data_size; j++) {
attr_value.push_back(static_cast<char>(data_bytes->Get(j)));
}
attr_map[attr_key] = attr_value;
}
if (attr_map.find(kActivationType) == attr_map.end() || attr_map.find(kTransposeA) == attr_map.end() ||
attr_map.find(kTransposeB) == attr_map.end()) {
MS_LOG(ERROR) << "Custom attr error.";
return;
}
auto val_offset = schema::CreateMatMulFusion(
fbb, std::atoi(attr_map[kTransposeA].c_str()), std::atoi(attr_map[kTransposeB].c_str()),
static_cast<schema::ActivationType>(std::atoi(attr_map[kActivationType].c_str())));
auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_MatMulFusion, val_offset.o);
fbb.Finish(prim_offset);
void *prim = malloc(fbb.GetSize());
if (prim == nullptr) {
MS_LOG(ERROR) << "malloc primitive failed.";
return;
}
memcpy(prim, fbb.GetBufferPointer(), fbb.GetSize());
auto custom_primitive = flatbuffers::GetRoot<schema::Primitive>(prim);
fbb.Clear();
PackInfo *pack_info = new (std::nothrow) PackInfo();
if (pack_info == nullptr) {
free(prim);
MS_LOG(ERROR) << "new PackInfo failed.";
return;
}
node->primitive_ = custom_primitive;
pack_info->is_packed_ = true;
pack_info->weight_sums_index_ = node->input_indices_.back();
pack_info->b_batch_ = std::atoi(attr_map["b_batch"].c_str());
pack_info->col_ = std::atoi(attr_map["col"].c_str());
pack_info->deep_ = std::atoi(attr_map["deep"].c_str());
pack_info->col_align_ = std::atoi(attr_map["col_align"].c_str());
pack_info->deep_align_ = std::atoi(attr_map["deep_align"].c_str());
pack_info->b_transpose_ = std::atoi(attr_map[kTransposeB].c_str());
pack_info->cpu_option_ = attr_map["cpu_option"];
AddNodePackInfo(node->name_, pack_info);
node->input_indices_.pop_back();
node->node_type_ = schema::PrimitiveType_MatMulFusion;
}
if (!(reinterpret_cast<lite::LiteModel *>(model)->keep_model_buf())) {
CopyWeightBiasSumsTensor(tensors);
}
}
void PackedNodePass::CopyWeightBiasSumsTensor(const std::vector<Tensor *> &tensors) {
for (auto &pack_info : node_pack_info_map_) {
auto index = static_cast<size_t>(pack_info.second->weight_sums_index_);
if (index >= tensors.size()) {
return;
}
auto tensor = tensors[index];
if (!tensor->IsConst() && tensor->data() != nullptr) {
return;
}
if (!tensor->IsConst() || tensor->own_data()) {
continue;
}
if (tensor->data_type() == kObjectTypeTensorType) {
MS_ASSERT(tensor->data() == nullptr);
} else {
auto copy_tensor = Tensor::CopyTensor(*tensor, true);
if (copy_tensor == nullptr) {
MS_LOG(ERROR) << "Copy tensor failed";
return;
}
tensor->FreeData();
tensor->set_data(copy_tensor->data());
tensor->set_own_data(true);
copy_tensor->set_data(nullptr);
delete copy_tensor;
}
}
}
void MatmulDynamicSdotInt8Cpu(void *src, void *dst, int row, int col, bool transpose) {
auto src_int8 = static_cast<int8_t *>(src);
auto dst_int8 = static_cast<int8_t *>(dst);
if (!transpose) {
// RowMajor2Col4x16MajorInt8
int row_4 = UP_ROUND(row, C4NUM);
int stride = C16NUM * C4NUM;
for (int r = 0; r < row_4; ++r) {
for (int c = 0; c < col; ++c) {
int stride_idx = c / C16NUM * (row_4 / C4NUM) + r / C4NUM;
if (r < row) {
int src_idx = r * col + c;
src_int8[src_idx] = dst_int8[stride * stride_idx + c % C16NUM * C4NUM + r % C4NUM];
}
}
}
} else {
int temp = row;
row = col;
col = temp;
// RowMajor2Row4x16MajorInt8
int col4 = UP_ROUND(col, C4NUM);
for (int r = 0; r < row; r++) {
int rd16 = r / C16NUM;
int rm16 = r % C16NUM;
for (int c = 0; c < col; c++) {
int cd4 = c / C4NUM;
int cm4 = c % C4NUM;
int dst_index = rd16 * col4 * C16NUM + cd4 * C16NUM * C4NUM + rm16 * C4NUM + cm4;
int src_index = r * col + c;
src_int8[src_index] = dst_int8[dst_index];
}
}
}
}
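// Worked example of the index math above (illustrative comment, not part of
// the commit): with C4NUM = 4 and C16NUM = 16, the non-transposed layout
// stores 4x16 tiles of C16NUM * C4NUM = 64 bytes. For row = 6, col = 20 and
// element (r, c) = (5, 17):
//   row_4      = UP_ROUND(6, 4) = 8, stride = 64
//   stride_idx = (17 / 16) * (8 / 4) + 5 / 4 = 3
//   dst offset = 64 * 3 + (17 % 16) * 4 + (5 % 4) = 197
// so the loop restores src_int8[5 * 20 + 17] from dst_int8[197].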
RecoveryWeightFunc GetRecoveryWeightFunc(const int quant_type, const TypeId data_type, const int node_type,
const std::string &cpu_option) {
if (cpu_option == kArm64SimdDot && node_type == schema::PrimitiveType_MatMulFusion &&
quant_type == schema::QuantType_QUANT_DYNAMIC && data_type == kNumberTypeInt8) {
return MatmulDynamicSdotInt8Cpu;
}
return nullptr;
}
int PackedMatmulKernelExec(kernel::KernelExec *kernel_exec, const std::vector<Tensor *> &tensors) {
auto pack_info = PackedNodePass::GetInstance().GetNodePackInfo(kernel_exec->name());
if (pack_info == nullptr) {
return RET_OK;
}
MS_CHECK_TRUE_MSG(kernel_exec->in_tensors().size() >= kInputSize1, lite::RET_ERROR,
"kernel doesn't have weight tensor.");
auto dst_tensor = kernel_exec->in_tensors()[SECOND_INPUT];
auto kernel = kernel_exec->kernel();
MS_CHECK_TRUE_MSG(kernel != nullptr, lite::RET_NULL_PTR, "kernel is nullptr.");
auto param = reinterpret_cast<MatMulParameter *>(kernel_exec->op_parameter());
if (dst_tensor->data_type() != kNumberTypeInt8 || kernel->quant_type() != schema::QuantType_QUANT_DYNAMIC) {
return RecoveryPackedWeight(dst_tensor, static_cast<int>(kernel->quant_type()), dst_tensor->data_type(),
schema::PrimitiveType_MatMulFusion, pack_info);
}
if (param->matmul_type_ != kMatmulDynamicSdotInt8Cpu && pack_info->cpu_option_ == kArm64SimdDot) {
return RecoveryPackedWeight(dst_tensor, static_cast<int>(kernel->quant_type()), dst_tensor->data_type(),
schema::PrimitiveType_MatMulFusion, pack_info);
}
auto matmul_kernel = static_cast<kernel::MatmulDynamicBaseInt8CPUKernel *>(kernel);
matmul_kernel->SetWeightIsPacked(true);
auto index = static_cast<size_t>(pack_info->weight_sums_index_);
if (index < tensors.size()) {
matmul_kernel->SetWeightSumsTensor(tensors.at(index));
}
return lite::RET_OK;
}
int RecoveryPackedWeight(Tensor *weight, const int quant_type, const TypeId data_type, const int node_type,
PackInfo *pack_info) {
auto recovery_func = GetRecoveryWeightFunc(quant_type, data_type, node_type, pack_info->cpu_option_);
if (recovery_func == nullptr) {
MS_LOG(ERROR) << "unsupported recovery func.";
return RET_NULL_PTR;
}
void *unpack_data = malloc(weight->Size());
if (unpack_data == nullptr) {
MS_LOG(ERROR) << "malloc unpack_data failed.";
return RET_NULL_PTR;
}
void *pack_b_ptr = weight->data();
for (int i = 0; i < pack_info->b_batch_; i++) {
void *current_weight;
void *current_b_pack;
if (weight->data_type() == kNumberTypeInt8) {
current_weight = static_cast<void *>(static_cast<int8_t *>(unpack_data) + i * pack_info->deep_ * pack_info->col_);
current_b_pack =
static_cast<void *>(static_cast<int8_t *>(pack_b_ptr) + i * pack_info->col_align_ * pack_info->deep_align_);
} else {
free(unpack_data);
MS_LOG(ERROR) << "unsupported data type.";
return RET_ERROR;
}
recovery_func(current_weight, current_b_pack, pack_info->deep_, pack_info->col_, pack_info->b_transpose_);
}
weight->FreeData();
weight->set_data(unpack_data);
return RET_OK;
}
int PackKernelExec(kernel::KernelExec *kernel_exec, const std::vector<Tensor *> &tensors) {
if (kernel_exec->type() == schema::PrimitiveType_MatMulFusion) {
return PackedMatmulKernelExec(kernel_exec, tensors);
}
return RET_OK;
}
} // namespace lite
} // namespace mindspore
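
The custom attributes consumed above travel as raw bytes of a decimal string: the converter serializes each integer with std::to_string, and this pass rebuilds the string byte by byte before calling atoi. A self-contained round-trip sketch (EncodeAttr/DecodeAttr are hypothetical helpers, not the schema API):

#include <cstdint>
#include <cstdlib>
#include <string>
#include <vector>

// Encode an integer attribute the way the converter's AddCustomAttr does.
std::vector<uint8_t> EncodeAttr(int value) {
  std::string s = std::to_string(value);
  return std::vector<uint8_t>(s.begin(), s.end());
}

// Decode it the way PackedNodePass::Run does before calling std::atoi.
int DecodeAttr(const std::vector<uint8_t> &data) {
  std::string s;
  for (uint8_t b : data) {
    s.push_back(static_cast<char>(b));
  }
  return std::atoi(s.c_str());
}
// EncodeAttr(128) yields {'1', '2', '8'}; DecodeAttr round-trips it to 128.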

View File

@@ -0,0 +1,80 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_LITERT_RUNTIME_PACKED_NODE_PASS_
#define MINDSPORE_LITE_SRC_LITERT_RUNTIME_PACKED_NODE_PASS_
#include <string>
#include <map>
#include <vector>
#include "src/litert/lite_model.h"
#include "src/tensor.h"
#include "src/litert/kernel_exec.h"
namespace mindspore {
namespace lite {
struct PackInfo {
bool is_packed_{false};
int weight_sums_index_;
int b_batch_;
int deep_;
int col_;
int deep_align_;
int col_align_;
bool b_transpose_;
std::string cpu_option_;
};
class PackedNodePass {
public:
static PackedNodePass &GetInstance() {
static PackedNodePass instance;
return instance;
}
PackInfo *GetNodePackInfo(const std::string &node_name) {
if (this->node_pack_info_map_.find(node_name) == this->node_pack_info_map_.end()) {
return nullptr;
}
return this->node_pack_info_map_[node_name];
}
void Run(Model *model, const std::vector<Tensor *> &tensors);
void CopyWeightBiasSumsTensor(const std::vector<Tensor *> &tensors);
protected:
void AddNodePackInfo(const std::string &node_name, PackInfo *pack_info) {
if (this->node_pack_info_map_.find(node_name) != this->node_pack_info_map_.end()) {
MS_LOG(WARNING) << "Key conflict when add weight sums index.";
}
this->node_pack_info_map_[node_name] = pack_info;
}
private:
PackedNodePass() = default;
~PackedNodePass();
private:
std::map<std::string, PackInfo *> node_pack_info_map_;
};
int PackKernelExec(kernel::KernelExec *kernel_exec, const std::vector<Tensor *> &tensors);
// packed weight data -> unpack
int RecoveryPackedWeight(Tensor *weight, const int quant_type, const TypeId data_type, const int node_type,
PackInfo *packInfo);
} // namespace lite
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_LITERT_RUNTIME_PACKED_NODE_PASS_

View File

@@ -48,6 +48,8 @@ include_directories(${TOP_DIR}/mindspore/ccsrc/plugin/device/cpu/kernel)
file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/ops/*.cc
${CMAKE_CURRENT_SOURCE_DIR}/converter.cc
${CMAKE_CURRENT_SOURCE_DIR}/offline_packing_optimizer.cc
${CMAKE_CURRENT_SOURCE_DIR}/converter_packed_node.cc
${CMAKE_CURRENT_SOURCE_DIR}/converter_funcgraph.cc
${CMAKE_CURRENT_SOURCE_DIR}/converter_metagraph.cc
${CMAKE_CURRENT_SOURCE_DIR}/anf_transform.cc
@@ -167,6 +169,7 @@ set(LITE_SRC ${API_SRC}
${SRC_DIR}/litert/sub_graph_split.cc
${KERNEL_ONLINE_FUSION_SRC}
${SRC_DIR}/litert/lite_session.cc
${SRC_DIR}/litert/runtime_packed_node_pass.cc
${SRC_DIR}/litert/executor.cc
${SRC_DIR}/litert/lite_model.cc
${SRC_DIR}/litert/model_manager.cc

View File

@@ -35,6 +35,7 @@ constexpr auto kDataPreprocessParam = "data_preprocess_param";
constexpr auto kRegistry = "registry";
constexpr auto kAclOptionParam = "acl_option_cfg_param";
constexpr auto kMicroParam = "micro_param";
constexpr auto kCpuOptionParam = "cpu_option_cfg_param";
} // namespace
using ShapeVector = std::vector<int64_t>;
const int kBatchDim = 0;
@@ -286,6 +287,12 @@ int ConfigFileParser::ParseConfigParam(std::map<std::string, std::map<std::strin
MS_LOG(ERROR) << "ParseWeightQuantString failed.";
return ret;
}
ret = ParseCpuOptionCfgString(*maps);
(void)maps->erase(kCpuOptionParam);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ParseCpuOptionCfgString failed.";
return ret;
}
return RET_OK;
}
@ -425,5 +432,15 @@ int ConfigFileParser::ParseWeightQuantString(const std::map<std::string, std::ma
}
return RET_OK;
}
int ConfigFileParser::ParseCpuOptionCfgString(const std::map<std::string, std::map<std::string, std::string>> &maps) {
if (maps.find(kCpuOptionParam) != maps.end()) {
const auto &map = maps.at(kCpuOptionParam);
std::map<std::string, std::string &> parse_map{{"architecture", cpu_option_cfg_string_.architecture},
{"instruction", cpu_option_cfg_string_.instruction}};
return SetMapData(map, parse_map, kCpuOptionParam);
}
return RET_OK;
}
} // namespace lite
} // namespace mindspore

View File

@@ -98,6 +98,11 @@ struct MicroParamString {
std::string enable_micro;
};
struct CpuOptionCfgString {
std::string architecture;
std::string instruction;
};
class ConfigFileParser {
public:
int ParseConfigFile(const std::string &config_file_path);
@@ -112,6 +117,7 @@ class ConfigFileParser {
RegistryInfoString GetRegistryInfoString() const { return this->registry_info_string_; }
AclOptionCfgString GetAclOptionCfgString() { return this->acl_option_cfg_string_; }
MicroParamString GetMicroParamString() { return this->micro_param_string_; }
CpuOptionCfgString GetCpuOptionCfgString() { return this->cpu_option_cfg_string_; }
private:
int ParseDataPreProcessString(const std::map<std::string, std::map<std::string, std::string>> &maps);
@@ -124,6 +130,7 @@
int SetMapData(const std::map<std::string, std::string> &input_map,
const std::map<std::string, std::string &> &parse_map, const std::string &section);
int ParseMicroParamString(const std::map<std::string, std::map<std::string, std::string>> &maps);
int ParseCpuOptionCfgString(const std::map<std::string, std::map<std::string, std::string>> &maps);
private:
DataPreProcessString data_pre_process_string_;
@@ -134,6 +141,7 @@
RegistryInfoString registry_info_string_;
AclOptionCfgString acl_option_cfg_string_;
MicroParamString micro_param_string_;
CpuOptionCfgString cpu_option_cfg_string_;
};
} // namespace lite

View File

@@ -0,0 +1,41 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tools/converter/config_parser/cpu_option_param_parser.h"
namespace mindspore {
namespace lite {
STATUS CpuOptionParamParser::ParseCpuOptionCfg(const CpuOptionCfgString &cpu_option_string,
CpuOptionCfg *cpu_option_cfg) {
if (cpu_option_string.architecture.empty() || cpu_option_string.instruction.empty()) {
return RET_OK;
}
if (cpu_option_string.architecture != "ARM64") {
MS_LOG(ERROR) << "cpu instruction only supported ARM64. But get " << cpu_option_string.architecture;
return RET_INPUT_PARAM_INVALID;
}
if (cpu_option_string.instruction != "SIMD_DOT") {
MS_LOG(ERROR) << "cpu instruction only supported SIMD_DOT. But get " << cpu_option_string.instruction;
return RET_INPUT_PARAM_INVALID;
}
cpu_option_cfg->instruction = cpu_option_string.instruction;
cpu_option_cfg->architecture = cpu_option_string.architecture;
return RET_OK;
}
} // namespace lite
} // namespace mindspore
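
For reference, the converter config section this parser consumes would look like the snippet below; the section and key names come from ConfigFileParser above, and ARM64/SIMD_DOT are the only values these checks accept:

[cpu_option_cfg_param]
architecture=ARM64
instruction=SIMD_DOT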

View File

@@ -0,0 +1,32 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_CONFIG_PARSER_CPU_OPTION_PARAM_PARSER_H_
#define MINDSPORE_LITE_TOOLS_CONVERTER_CONFIG_PARSER_CPU_OPTION_PARAM_PARSER_H_
#include <string>
#include "tools/converter/cxx_api/converter_para.h"
#include "tools/converter/config_parser/config_file_parser.h"
#include "include/errorcode.h"
namespace mindspore {
namespace lite {
class CpuOptionParamParser {
public:
STATUS ParseCpuOptionCfg(const CpuOptionCfgString &cpu_option_string, CpuOptionCfg *cpu_option_cfg);
};
} // namespace lite
} // namespace mindspore
#endif // MINDSPORE_LITE_TOOLS_CONVERTER_CONFIG_PARSER_CPU_OPTION_PARAM_PARSER_H_

View File

@@ -53,6 +53,8 @@
#include "src/common/file_utils.h"
#include "ops/dynamic_shape.h"
#include "tools/common/parse_config_utils.h"
#include "tools/converter/converter_packed_node.h"
#include "tools/converter/config_parser/cpu_option_param_parser.h"
namespace mindspore {
extern "C" {
@@ -348,6 +350,13 @@ int ConverterImpl::InitConfigParam(const std::shared_ptr<ConverterPara> &param)
MS_LOG(ERROR) << "Parse micro param failed.";
return ret;
}
lite::CpuOptionParamParser cpu_param_parser;
ret = cpu_param_parser.ParseCpuOptionCfg(config_parser.GetCpuOptionCfgString(), &param->cpuOptionCfgParam);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Parse cpu option param failed.";
return ret;
}
return RET_OK;
}
@@ -817,6 +826,16 @@ int ConverterImpl::SaveGraph(FuncGraphPtr graph, const std::shared_ptr<Converter
MS_LOG(ERROR) << "Convert to meta graph failed";
return RET_ERROR;
}
if (!param->cpuOptionCfgParam.architecture.empty()) {
std::string cpu_option = param->cpuOptionCfgParam.architecture + param->cpuOptionCfgParam.instruction;
status = ConverterPackedNode(meta_graph, cpu_option);
if (status != RET_OK) {
MS_LOG(ERROR) << "save pack info failed.";
return status;
}
}
meta_graph->version = Version();
if (param->pre_infer) {

View File

@@ -55,6 +55,7 @@
#include "tools/converter/parser/unify_format.h"
#include "tools/optimizer/graph/specify_graph_input_format.h"
#include "tools/converter/anf_transform.h"
#include "tools/converter/offline_packing_optimizer.h"
namespace mindspore {
namespace lite {
@@ -311,6 +312,14 @@ STATUS ConverterFuncGraph::Optimize(const std::shared_ptr<ConverterPara> &param,
func_graph->set_attr(kIsOptimized, MakeValue(true));
}
if (!param->cpuOptionCfgParam.architecture.empty()) {
// Do offline pack.
if (OfflinePackingOptimizer().Optimize(func_graph, "ANDROID_ARM_CPU") != RET_OK) {
MS_LOG(ERROR) << "Do offline packing failed.";
return RET_ERROR;
}
}
return RET_OK;
}

View File

@@ -0,0 +1,150 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include <memory>
#include <utility>
#include "tools/converter/converter_packed_node.h"
#include "tools/converter/offline_packing_optimizer.h"
#include "src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.h"
#include "mindspore/core/ops/op_name.h"
namespace mindspore {
namespace {
constexpr auto kMatmulCustomType = "MatmulFusionPacked";
}
namespace lite {
void AddCustomAttr(std::vector<std::unique_ptr<mindspore::schema::AttributeT>> *attrs, const std::string &&key,
const std::string &&value) {
auto attr = std::make_unique<schema::AttributeT>();
attr->name = key;
std::vector<uint8_t> attr_data(value.begin(), value.end());
attr->data = attr_data;
attrs->emplace_back(std::move(attr));
}
int ReplaceMatMulFusionToCustom(schema::MetaGraphT *meta_graph, const std::unique_ptr<schema::CNodeT> &cnode,
const std::unique_ptr<mindspore::schema::TensorT> &b_input,
const std::string &cpu_option) {
auto lite_kernel = PackDataWrapper::GetInstance().GetPackedKernel(cnode->name);
if (lite_kernel == nullptr) {
MS_LOG(ERROR) << "Get Packed Kernel error.";
return RET_ERROR;
}
auto param = lite_kernel->op_parameter();
if (param == nullptr) {
MS_LOG(ERROR) << "param is nullptr.";
return RET_ERROR;
}
auto matmul_param = reinterpret_cast<MatMulParameter *>(param);
if (matmul_param->matmul_type_ == kMatmulDynamicSdotInt8Cpu) {
cnode->primitive->value.type = schema::PrimitiveType_Custom;
auto primitive = new (std::nothrow) schema::CustomT;
if (primitive == nullptr) {
MS_LOG(ERROR) << "new CustomT error.";
return RET_NULL_PTR;
}
primitive->type = kMatmulCustomType;
// activation_type
AddCustomAttr(&(primitive->attr), ops::kActivationType, std::to_string(matmul_param->act_type_));
// transpose_a
AddCustomAttr(&(primitive->attr), ops::kTransposeA, std::to_string(matmul_param->a_transpose_));
// transpose_b
AddCustomAttr(&(primitive->attr), ops::kTransposeB, std::to_string(matmul_param->b_transpose_));
// replace packed data
auto matmul_kernel = reinterpret_cast<const mindspore::kernel::MatmulDynamicBaseInt8CPUKernel *>(lite_kernel);
auto b_batch = matmul_kernel->GetBBatch();
auto pack_b_size = b_batch * matmul_param->col_align_ * matmul_param->deep_align_ * sizeof(int8_t);
b_input->data.resize(pack_b_size);
if (memcpy_s(b_input->data.data(), b_input->data.size(), matmul_kernel->GetPackBPtr(), pack_b_size) != EOK) {
delete primitive;
MS_LOG(ERROR) << "new CustomT error.";
return RET_ERROR;
}
// add weight_sums to inputs
auto weight_sum_size = b_batch * matmul_param->col_align_ * sizeof(int);
auto weight_sums_tensor = std::make_unique<schema::TensorT>();
weight_sums_tensor->nodeType = lite::NodeType_ValueNode;
weight_sums_tensor->format = schema::Format_NHWC;
weight_sums_tensor->dataType = TypeId::kNumberTypeInt32;
weight_sums_tensor->dims = {};
weight_sums_tensor->dims.emplace_back(weight_sum_size / sizeof(int));
weight_sums_tensor->data.resize(weight_sum_size);
weight_sums_tensor->name = cnode->name + "_weight_sums";
if (memcpy_s(weight_sums_tensor->data.data(), weight_sums_tensor->data.size(), matmul_kernel->GetWeightSums(),
weight_sum_size) != EOK) {
delete primitive;
MS_LOG(ERROR) << "new CustomT error.";
return RET_ERROR;
}
cnode->inputIndex.emplace_back(meta_graph->allTensors.size());
meta_graph->allTensors.emplace_back(std::move(weight_sums_tensor));
// add scalar to attr
AddCustomAttr(&(primitive->attr), "b_batch", std::to_string(b_batch));
AddCustomAttr(&(primitive->attr), "deep", std::to_string(matmul_param->deep_));
AddCustomAttr(&(primitive->attr), "col", std::to_string(matmul_param->col_));
AddCustomAttr(&(primitive->attr), "col_align", std::to_string(matmul_param->col_align_));
AddCustomAttr(&(primitive->attr), "deep_align", std::to_string(matmul_param->deep_align_));
// add cpu option
std::string cpu_option_str = cpu_option;
AddCustomAttr(&(primitive->attr), "cpu_option", std::move(cpu_option_str));
cnode->primitive->value.value = primitive;
}
return RET_OK;
}
int ConverterPackedNode(schema::MetaGraphT *meta_graph, const std::string &cpu_option) {
for (auto &dst_node : meta_graph->nodes) {
if (dst_node->primitive == nullptr || dst_node->primitive->value.type != schema::PrimitiveType_MatMulFusion) {
continue;
}
MS_CHECK_TRUE_MSG(dst_node->inputIndex.size() >= kInputSize1, RET_ERROR, "inputs size is wrong.");
auto a_index = dst_node->inputIndex[FIRST_INPUT];
MS_CHECK_TRUE_MSG(meta_graph->allTensors.size() > a_index, RET_ERROR, "allTensors size is wrong.");
auto &a_input = meta_graph->allTensors.at(a_index);
CHECK_NULL_RETURN(a_input);
auto b_index = dst_node->inputIndex[SECOND_INPUT];
MS_CHECK_TRUE_MSG(meta_graph->allTensors.size() > b_index, RET_ERROR, "allTensors size is wrong.");
auto &b_input = meta_graph->allTensors.at(b_index);
CHECK_NULL_RETURN(b_input);
if (a_input->dataType != b_input->dataType) {
MS_LOG(ERROR) << "inputs dataType is not same." << a_input->dataType << " " << b_input->dataType;
return RET_ERROR;
}
if (b_input->data.empty()) {
continue;
}
auto ret = ReplaceMatMulFusionToCustom(meta_graph, dst_node, b_input, cpu_option);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ReplaceMatmulToCustom error.";
return ret;
}
}
return RET_OK;
}
} // namespace lite
} // namespace mindspore
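
Putting the pieces together, the node that ReplaceMatMulFusionToCustom emits and PackedNodePass::Run later consumes looks roughly like this (an illustrative summary of the code above, not a tool dump):

// primitive : Custom, type = "MatmulFusionPacked"
// attrs     (decimal-string bytes): activation_type, transpose_a, transpose_b,
//           b_batch, deep, col, col_align, deep_align, cpu_option
//           (cpu_option = architecture + instruction, e.g. "ARM64SIMD_DOT")
// inputs    : [a, b (data replaced by the packed int8 buffer), ...,
//              <node>_weight_sums (int32, dims = {b_batch * col_align})]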

View File

@@ -0,0 +1,29 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_CONVERT_PACKED_NODE_H
#define MINDSPORE_LITE_TOOLS_CONVERTER_CONVERT_PACKED_NODE_H
#include <string>
#include "schema/inner/model_generated.h"
namespace mindspore {
namespace lite {
int ConverterPackedNode(schema::MetaGraphT *meta_graph, const std::string &cpu_option);
} // namespace lite
} // namespace mindspore
#endif // MINDSPORE_LITE_TOOLS_CONVERTER_CONVERT_PACKED_NODE_H

View File

@@ -35,6 +35,11 @@ struct ParallelSplitConfig {
std::vector<std::string> parallel_devices_;
};
struct CpuOptionCfg {
std::string architecture;
std::string instruction;
};
struct ConverterPara {
converter::FmkType fmk_type;
std::string model_file;
@@ -73,6 +78,7 @@ struct ConverterPara {
lite::micro::MicroParam microParam;
ParallelSplitConfig parallel_split_config;
std::string device;
CpuOptionCfg cpuOptionCfgParam;
};
} // namespace mindspore
#endif // MINDSPORE_LITE_TOOLS_CONVERTER_CXX_API_CONVERTER_PARA_H_

View File

@@ -33,7 +33,7 @@ STATUS MatmulPacking(const mindspore::CNodePtr &cnode_ptr, const FuncGraphPtr &f
const lite::InnerContext *ctx);
mindspore::lite::InnerContext *InitInnerContextForAndroidArmCpu();
enum class BackendType : uint {
enum class BackendType : uint8_t {
kUnknownBackend = 0,
kAndroidArmCpuBackend,
};