weight packed 1.6

This commit is contained in:
parent e31e523712
commit 7ac492cf71
@@ -159,6 +159,13 @@ set(LITE_SRC
     ${LITE_DIR}/tools/converter/quantizer/fse_bit_stream.cc
 )
 
+if(MSLITE_ENABLE_SERVER_INFERENCE)
+    set(LITE_SRC
+        ${LITE_SRC}
+        ${LITE_DIR}/src/pack_weight_manager.cc
+    )
+endif()
+
 set(REGISTRY_SRC
     ${MICRO_DIR}/coder/opcoders/kernel_registry.cc
 )
@@ -131,6 +131,13 @@ set(LITE_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/cpu_info.cc
 )
 
+if(MSLITE_ENABLE_SERVER_INFERENCE)
+    set(LITE_SRC
+        ${LITE_SRC}
+        ${CMAKE_CURRENT_SOURCE_DIR}/pack_weight_manager.cc
+    )
+endif()
+
 if(MSLITE_ENABLE_CONTROLFLOW)
     file(GLOB CONTROL_FLOW_SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/control_flow/*.cc
@@ -90,7 +90,7 @@ std::vector<size_t> GetLinkedPostNodeIdx(const lite::Model *model, const size_t
 bool IsPackedOp(int op_type) {
   static const std::vector<int> packed_ops = {schema::PrimitiveType_Conv2DFusion,
                                               schema::PrimitiveType_Conv2dTransposeFusion,
-                                              schema::PrimitiveType_MatMulFusion};
+                                              schema::PrimitiveType_FullConnection, schema::PrimitiveType_MatMulFusion};
   return IsContain(packed_ops, op_type);
 }
 }  // namespace lite
@@ -28,6 +28,9 @@
 #include "src/common/graph_util.h"
 #include "src/common/file_utils.h"
 #include "src/tensor.h"
+#ifdef SERVER_INFERENCE
+#include "src/pack_weight_manager.h"
+#endif
 #ifdef ENABLE_V0
 #include "src/ops/compat/compat_register.h"
 #endif
@@ -105,6 +108,9 @@ int LiteModel::ConvertAttrToTensors() {
 #endif
 
 void LiteModel::Free() {
+#ifdef SERVER_INFERENCE
+  lite::PackWeightManager::GetInstance()->DeleteSavedModelPtr(this);
+#endif
   if (this->buf != nullptr) {
     delete[](this->buf);
     this->buf = nullptr;
@@ -592,7 +598,9 @@ Model *ImportFromBuffer(const char *model_buf, size_t size, bool take_buf) {
     MS_LOG(ERROR) << "new model fail!";
     return nullptr;
   }
-
+#ifdef SERVER_INFERENCE
+  lite::PackWeightManager::GetInstance()->StoreLiteModel(model_buf, model);
+#endif
   auto status = model->ConstructModel(model_buf, size, take_buf);
   if (status != RET_OK) {
     MS_LOG(ERROR) << "construct model failed.";
@@ -16,6 +16,9 @@
 
 #include "src/lite_session.h"
 #include <set>
+#ifdef SERVER_INFERENCE
+#include "src/pack_weight_manager.h"
+#endif
 #ifndef RUNTIME_PASS_CLIP
 #include "src/runtime/runtime_pass.h"
 #endif
@ -575,6 +578,35 @@ void LiteSession::FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kern
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef SERVER_INFERENCE
|
||||
int LiteSession::IniPackWeightData(Model *model) {
|
||||
auto lite_model = reinterpret_cast<LiteModel *>(model);
|
||||
auto kernel_num = model->all_nodes_.size();
|
||||
for (size_t i = 0; i < kernel_num; i++) {
|
||||
auto node = model->all_nodes_[i];
|
||||
auto node_type = node->node_type_;
|
||||
if (IsPackedOp(node_type)) {
|
||||
for (size_t j = 0; j < node->input_indices_.size(); j++) {
|
||||
auto tensor_index = node->input_indices_[j];
|
||||
auto src_tensor = lite_model->GetSchemaTensor(tensor_index);
|
||||
if (src_tensor == nullptr || src_tensor->handler() == nullptr || src_tensor->data() == nullptr ||
|
||||
src_tensor->length() == 0) {
|
||||
continue;
|
||||
}
|
||||
auto data = lite::PackWeightManager::GetInstance()->GetTensorData(lite_model, src_tensor, tensor_index);
|
||||
if (data == nullptr) {
|
||||
MS_LOG(DEBUG) << "data not packed.";
|
||||
continue;
|
||||
}
|
||||
this->tensors_[tensor_index]->set_data(data);
|
||||
this->tensors_[tensor_index]->set_own_data(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
#endif
|
||||
|
||||
int LiteSession::CompileGraph(Model *model) {
|
||||
auto ret = PreCheck(model);
|
||||
if (ret != RET_OK) {
|
||||
|
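Note on the hunk above: on a first session, GetTensorData only records the weight's origin address inside the manager and returns nullptr, so nothing is redirected; once some kernel has packed that weight into a shared block, later sessions get the block back here and point the tensor at it. A minimal sketch of that redirect, assuming the Tensor setters from src/tensor.h behave as their names suggest:

// Sketch only (not part of the commit): point a weight tensor at a block the
// PackWeightManager owns, without transferring ownership to the tensor.
void RedirectToSharedWeight(mindspore::lite::Tensor *tensor, void *shared_block) {
  tensor->set_data(shared_block);  // reads now hit the shared packed copy
  tensor->set_own_data(false);     // tensor teardown must not free manager-owned memory
}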
@@ -589,6 +621,13 @@ int LiteSession::CompileGraph(Model *model) {
     is_running_.store(false);
     return ret;
   }
+#ifdef SERVER_INFERENCE
+  ret = IniPackWeightData(model);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "IniPackWeightData failed.";
+    return RET_ERROR;
+  }
+#endif
   InitGraphInputTensors(model);
   InitGraphOutputTensors(model);
 
@@ -1666,6 +1705,9 @@ const char *lite::LiteSession::LoadModelByPath(const std::string &file, mindspor
     delete[] model_buf;
     model_buf = nullptr;
   }
+#ifdef SERVER_INFERENCE
+  lite::PackWeightManager::GetInstance()->InitWeightManagerByPath(file, model_buf);
+#endif
   return lite_buf;
 }
 
@@ -1687,6 +1729,9 @@ const char *lite::LiteSession::LoadModelByPath(const std::string &file, mindspor
     delete[] model_buf;
     model_buf = nullptr;
   }
+#ifdef SERVER_INFERENCE
+  lite::PackWeightManager::GetInstance()->InitWeightManagerByPath(file, model_buf);
+#endif
   return lite_buf;
 }
 
@@ -115,6 +115,9 @@ class LiteSession : public session::LiteSession {
                                 const std::vector<kernel::LiteKernel *> &kernels,
                                 const std::unordered_map<Tensor *, Tensor *> isolate_input_map = std::unordered_map<Tensor *, Tensor *>());
   static void FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels);
+#ifdef SERVER_INFERENCE
+  int IniPackWeightData(Model *model);
+#endif
 
  private:
   int PreCheck(Model *model);
new file: src/pack_weight_manager.cc
@@ -0,0 +1,159 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef SERVER_INFERENCE
+#include "src/pack_weight_manager.h"
+namespace mindspore::lite {
+namespace {
+constexpr size_t kMemAliginSize = 64;
+
+size_t RoundMemSize(size_t size) { return (size + kMemAliginSize - 1) & (~(kMemAliginSize - 1)); }
+}  // namespace
+PackWeightManager *PackWeightManager::GetInstance() {
+  static PackWeightManager instance;
+  return &instance;
+}
+
+void PackWeightManager::InitWeightManagerByPath(const std::string &model_path, const char *model_buf) {
+  MS_CHECK_TRUE_RET_VOID(model_buf != nullptr);
+  if (path_model_buf_.find(model_path) == path_model_buf_.end()) {
+    auto *model_const_weight = new (std::nothrow) ModelConstWeight();
+    if (model_const_weight == nullptr) {
+      return;
+    }
+    path_model_weight_[model_path] = model_const_weight;
+  }
+  path_model_buf_[model_path].push_back(model_buf);
+}
+
+STATUS PackWeightManager::StoreLiteModel(const char *model_buf, const Model *model) {
+  MS_CHECK_TRUE_RET(model_buf != nullptr, RET_ERROR);
+  MS_CHECK_TRUE_RET(model != nullptr, RET_ERROR);
+  for (auto &item : path_model_buf_) {
+    auto &model_bufs = item.second;
+    auto path = item.first;
+    if (find(model_bufs.begin(), model_bufs.end(), model_buf) != model_bufs.end()) {
+      path_model_weight_[path]->lite_models.push_back(model);
+      return RET_OK;
+    }
+  }
+
+  return RET_OK;
+}
+
+void *PackWeightManager::GetTensorData(const LiteModel *model, const SchemaTensorWrapper *origin_tensor,
+                                       size_t tensor_index) {
+  MS_CHECK_TRUE_RET(model != nullptr, nullptr);
+  for (auto &item : path_model_weight_) {
+    auto &path = item.first;
+    auto &model_weight = item.second;
+    auto &models = model_weight->lite_models;
+    if (find(models.begin(), models.end(), model) != models.end()) {
+      if (model_weight->packed_weight.find(tensor_index) != model_weight->packed_weight.end()) {
+        return model_weight->packed_weight[tensor_index];
+      }
+      path_model_weight_[path]->origin_weight[tensor_index] = origin_tensor->data();
+      path_model_weight_[path]->origin_data_index[origin_tensor->data()] = tensor_index;
+      return nullptr;
+    }
+  }
+  MS_LOG(DEBUG) << "tensor data not packed.";
+  return nullptr;
+}
+
+std::pair<PackStatus, void *> PackWeightManager::FindPackedTensor(ModelConstWeight *weight, const Tensor *tensor,
+                                                                  const size_t size) {
+  std::unique_lock<std::mutex> weight_lock(mtx_weight_);
+  MS_CHECK_TRUE_RET(tensor != nullptr, std::make_pair(MALLOC, nullptr));
+  auto &packed_weights = weight->packed_weight;
+  if (size > MAX_MALLOC_SIZE) {
+    MS_LOG(ERROR) << "malloc size more than MAX_MALLOC_SIZE";
+    return std::make_pair(MALLOC, nullptr);
+  }
+  if (weight->packed_data.find(tensor->data()) != weight->packed_data.end()) {
+    return std::make_pair(PACKED, tensor->data());
+  } else if (weight->origin_data_index.find(tensor->data()) != weight->origin_data_index.end()) {
+    auto origin_index = weight->origin_data_index[tensor->data()];
+    void *data = nullptr;
+#ifdef _WIN32
+    data = _aligned_malloc(size, kMemAliginSize);
+    if (data == nullptr) {
+      MS_LOG(ERROR) << "_aligned_malloc failed.";
+      return std::make_pair(MALLOC, nullptr);
+    }
+#else
+    auto ret = posix_memalign(&data, kMemAliginSize, size);
+    if (ret != 0) {
+      MS_LOG(ERROR) << "posix_memalign failed.";
+      return std::make_pair(MALLOC, nullptr);
+    }
+#endif
+    weight->packed_data.insert(data);
+    packed_weights.insert(std::make_pair(origin_index, data));
+    return std::make_pair(NOTPACK, packed_weights.at(origin_index));
+  }
+  return std::make_pair(MALLOC, nullptr);
+}
+
+std::pair<PackStatus, void *> PackWeightManager::GetPackedTensor(const Tensor *tensor, const size_t size) {
+  MS_CHECK_TRUE_RET(tensor != nullptr, std::make_pair(MALLOC, nullptr));
+  auto round_size = RoundMemSize(size);
+  for (auto &item : path_model_weight_) {
+    auto &model_weight = item.second;
+    auto packed_tensor_pair = FindPackedTensor(model_weight, tensor, round_size);
+    if (packed_tensor_pair.second != nullptr) {
+      return packed_tensor_pair;
+    }
+  }
+  MS_LOG(DEBUG) << "not const tensor, need pack in kernel.";
+  return std::make_pair(MALLOC, nullptr);
+}
+
+void PackWeightManager::DeleteSavedModelPtr(LiteModel *delete_model) {
+  std::unique_lock<std::mutex> weight_lock(mtx_weight_);
+  MS_CHECK_TRUE_RET_VOID(delete_model != nullptr);
+  for (auto &item : path_model_weight_) {
+    auto &weight = item.second;
+    auto it = find(weight->lite_models.begin(), weight->lite_models.end(), delete_model);
+    if (it != weight->lite_models.end()) {
+      weight->lite_models.erase(it);
+    }
+  }
+}
+
+void PackWeightManager::FreePackedWeight(ModelConstWeight *weight) {
+  if (weight == nullptr) {
+    return;
+  }
+  for (auto &&packed_data : weight->packed_data) {
+    auto data = const_cast<void *>(packed_data);
+    if (data != nullptr) {
+#ifdef _WIN32
+      _aligned_free(data);
+#else
+      free(data);
+#endif
+    }
+  }
+  weight->packed_weight.clear();
+  weight->packed_data.clear();
+  delete weight;
+}
+
+PackWeightManager::~PackWeightManager() {
+  for (auto &item : path_model_weight_) {
+    FreePackedWeight(item.second);
+  }
+  path_model_weight_.clear();
+}
+}  // namespace mindspore::lite
+#endif
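A quick illustration of the alignment helper above (a sketch, not committed code): RoundMemSize rounds a request up to the next multiple of kMemAliginSize (64), which is what posix_memalign and _aligned_malloc need here. The constant name below mirrors the identifier used in the commit.

#include <cassert>
#include <cstddef>

// Standalone check of the rounding identity used by RoundMemSize.
constexpr size_t kMemAliginSize = 64;
constexpr size_t RoundMemSize(size_t size) { return (size + kMemAliginSize - 1) & (~(kMemAliginSize - 1)); }

int main() {
  assert(RoundMemSize(1) == 64);     // minimum one aligned block
  assert(RoundMemSize(64) == 64);    // already aligned: unchanged
  assert(RoundMemSize(100) == 128);  // rounds up to the next 64-byte boundary
  return 0;
}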
new file: src/pack_weight_manager.h
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_PACK_WEIGHT_MANAGER_H_
+#define MINDSPORE_LITE_SRC_PACK_WEIGHT_MANAGER_H_
+#include <map>
+#include <string>
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include <set>
+#include <mutex>
+#include "src/tensor.h"
+#include "src/lite_session.h"
+namespace mindspore::lite {
+// tensor index <-> tensor data
+using OriginWeight = std::map<size_t, const void *>;
+using PackedWeight = std::map<size_t, void *>;
+struct ModelConstWeight {
+  PackedWeight packed_weight;
+  OriginWeight origin_weight;
+  std::vector<const Model *> lite_models;
+  std::map<const void *, size_t> origin_data_index;
+  std::set<void *> packed_data;
+};
+
+enum PackStatus : int8_t { NOTPACK = 1, PACKED = 2, MALLOC = 3 };
+
+class PackWeightManager {
+ public:
+  static PackWeightManager *GetInstance();
+  virtual ~PackWeightManager();
+
+  void InitWeightManagerByPath(const std::string &model_path, const char *model_buf);
+  void DeleteSavedModelPtr(LiteModel *delete_model);
+  STATUS StoreLiteModel(const char *model_buf, const Model *model);
+  void *GetTensorData(const LiteModel *model, const SchemaTensorWrapper *origin_tensor, size_t tensor_index);
+  std::pair<PackStatus, void *> GetPackedTensor(const Tensor *tensor, const size_t size);
+
+ private:
+  PackWeightManager() = default;
+  std::pair<PackStatus, void *> FindPackedTensor(ModelConstWeight *weight, const Tensor *tensor, const size_t size);
+  void FreePackedWeight(ModelConstWeight *weight);
+
+  std::map<const std::string, ModelConstWeight *> path_model_weight_;
+  std::map<const std::string, std::vector<const void *>> path_model_buf_;
+  std::mutex mtx_weight_;
+};
+}  // namespace mindspore::lite
+#endif  // MINDSPORE_LITE_SRC_PACK_WEIGHT_MANAGER_H_
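With the full interface visible, here is a hedged sketch (not code from this commit) of the call order the runtime changes above establish when two sessions are created from the same model file; model_buf, model, and weight stand in for objects the runtime owns at those points.

#include "src/pack_weight_manager.h"

// Illustrative only: the expected lifecycle across two sessions sharing a path.
void ShareWeightsSketch(const char *model_buf, const mindspore::lite::Model *model,
                        const mindspore::lite::Tensor *weight, size_t pack_size) {
  auto *mgr = mindspore::lite::PackWeightManager::GetInstance();
  mgr->InitWeightManagerByPath("model.ms", model_buf);  // LoadModelByPath: buffer recorded under its path
  mgr->StoreLiteModel(model_buf, model);                // ImportFromBuffer: model bound to that path
  // First session: the weight is known only by its origin data, so the kernel
  // receives {NOTPACK, fresh 64-byte-aligned block} and packs into it once.
  auto first = mgr->GetPackedTensor(weight, pack_size);
  // A later session on the same path sees a tensor already pointing at the
  // shared block and receives {PACKED, same block}, so packing is skipped.
  auto second = mgr->GetPackedTensor(weight, pack_size);
  (void)first;
  (void)second;
}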
@@ -50,10 +50,14 @@ ConvolutionBaseCPUKernel::~ConvolutionBaseCPUKernel() {
   if (addr_map.find(reinterpret_cast<uintptr_t>(packed_weight_)) != addr_map.end()) {
     FreeAlignedData(reinterpret_cast<void **>(&packed_weight_));
   } else if (!op_parameter_->is_train_session_) {
-    if (packed_weight_ != nullptr) {
+#ifdef SERVER_INFERENCE
+    if (packed_weight_ != nullptr && weight_is_packed_ == lite::MALLOC) {
+#endif
       free(packed_weight_);
       packed_weight_ = nullptr;
-    }
+#ifdef SERVER_INFERENCE
+    }
+#endif
   }
   if (addr_map.find(reinterpret_cast<uintptr_t>(bias_data_)) != addr_map.end()) {
     FreeAlignedData(reinterpret_cast<void **>(&bias_data_));
@@ -154,6 +158,12 @@ int ConvolutionBaseCPUKernel::InitConvWeightBias() {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
   if (!op_parameter_->is_train_session_) {
+#ifdef SERVER_INFERENCE
+    if (weight_is_packed_ == lite::PACKED) {
+      MS_LOG(DEBUG) << "not do weight pack.";
+      return RET_OK;
+    }
+#endif
     if (origin_weight_ != nullptr) {
       PackWeight();
     } else {
@@ -27,6 +27,9 @@
 #include <android/log.h>
 #endif
 #endif
+#ifdef SERVER_INFERENCE
+#include "src/pack_weight_manager.h"
+#endif
 #include "src/inner_kernel.h"
 #include "include/context.h"
 #include "src/runtime/kernel/arm/base/layout_transform.h"
@@ -77,6 +80,9 @@ class ConvolutionBaseCPUKernel : public InnerKernel {
   bool IsRepack() const { return is_repack_; }
   std::unordered_map<uintptr_t, void *> addr_map;
   void *packed_weight_ = nullptr;
+#ifdef SERVER_INFERENCE
+  lite::PackStatus weight_is_packed_ = lite::MALLOC;
+#endif
   void *bias_data_ = nullptr;
   const InnerContext *ctx_ = nullptr;
   ConvParameter *conv_param_ = nullptr;
@@ -15,7 +15,9 @@
  */
 
 #include "src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h"
-
+#ifdef SERVER_INFERENCE
+#include "src/pack_weight_manager.h"
+#endif
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
@@ -305,7 +307,16 @@ int Convolution1x1CPUKernel::MallocWeightBiasData() {
   int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
   if (!op_parameter_->is_train_session_) {
     CHECK_LESS_RETURN(MAX_MALLOC_SIZE, size);
+#ifdef SERVER_INFERENCE
+    auto packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(in_tensors_[1], size);
+    packed_weight_ = packed.second;
+    weight_is_packed_ = packed.first;
+    if (weight_is_packed_ == lite::MALLOC && packed_weight_ == nullptr) {
+      packed_weight_ = malloc(size);
+    }
+#else
     packed_weight_ = malloc(size);
+#endif
     if (packed_weight_ == nullptr) {
       MS_LOG(ERROR) << "Conv1x1 Malloc packed_weight_ error!";
       return RET_ERROR;
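The Conv1x1 hunk above is the template repeated in the depthwise, indirect, sliding-window, common, and Winograd convolution hunks below; only the size computation differs. The three-way PackStatus handling, distilled as a sketch using the member names from these hunks:

// Sketch, not committed code: how MallocWeightBiasData reacts to each status.
auto packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(in_tensors_[1], size);
packed_weight_ = packed.second;
weight_is_packed_ = packed.first;
if (weight_is_packed_ == lite::MALLOC && packed_weight_ == nullptr) {
  packed_weight_ = malloc(size);  // manager declined: kernel owns a private buffer
}
// lite::PACKED  -> block already filled by an earlier session: skip PackWeight()
// lite::NOTPACK -> shared block handed out empty: pack into it exactly once
// lite::MALLOC  -> private buffer: pack normally and free it in the destructor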
@@ -16,7 +16,9 @@
 
 #include "src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.h"
 #include "include/errorcode.h"
-
+#ifdef SERVER_INFERENCE
+#include "src/pack_weight_manager.h"
+#endif
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_INFER_INVALID;
 using mindspore::lite::RET_OK;
@@ -116,7 +118,17 @@ int ConvolutionDepthwiseCPUKernel::MallocWeightBiasData() {
   }
   if (!op_parameter_->is_train_session_) {
     CHECK_LESS_RETURN(MAX_MALLOC_SIZE, pack_weight_size * sizeof(float));
+#ifdef SERVER_INFERENCE
+    auto packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
+      in_tensors_[1], static_cast<size_t>(pack_weight_size) * sizeof(float));
+    packed_weight_ = packed.second;
+    weight_is_packed_ = packed.first;
+    if (weight_is_packed_ == lite::MALLOC && packed_weight_ == nullptr) {
+      packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    }
+#else
     packed_weight_ = malloc(pack_weight_size * sizeof(float));
+#endif
     if (packed_weight_ == nullptr) {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
@@ -16,7 +16,9 @@
 
 #include "src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.h"
 #include "include/errorcode.h"
-
+#ifdef SERVER_INFERENCE
+#include "src/pack_weight_manager.h"
+#endif
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_INFER_INVALID;
 using mindspore::lite::RET_OK;
@@ -199,7 +201,17 @@ int ConvolutionDepthwiseIndirectCPUKernel::MallocWeightBiasData() {
   int pack_weight_size = div_flag * batch_flag * weight_tensor->Height() * weight_tensor->Width();
   if (!op_parameter_->is_train_session_) {
     CHECK_LESS_RETURN(MAX_MALLOC_SIZE, pack_weight_size * sizeof(float));
+#ifdef SERVER_INFERENCE
+    auto packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
+      in_tensors_[1], static_cast<size_t>(pack_weight_size * sizeof(float)));
+    packed_weight_ = packed.second;
+    weight_is_packed_ = packed.first;
+    if (weight_is_packed_ == lite::MALLOC && packed_weight_ == nullptr) {
+      packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    }
+#else
     packed_weight_ = malloc(pack_weight_size * sizeof(float));
+#endif
     if (packed_weight_ == nullptr) {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
@@ -174,7 +174,17 @@ int ConvolutionDepthwiseSWCPUKernel::MallocWeightBiasData() {
   int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
   if (!op_parameter_->is_train_session_) {
     CHECK_LESS_RETURN(MAX_MALLOC_SIZE, pack_weight_size * sizeof(float));
+#ifdef SERVER_INFERENCE
+    auto packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
+      in_tensors_[1], static_cast<size_t>(pack_weight_size) * sizeof(float));
+    packed_weight_ = packed.second;
+    weight_is_packed_ = packed.first;
+    if (packed_weight_ == nullptr && weight_is_packed_ == lite::MALLOC) {
+      packed_weight_ = malloc(pack_weight_size * sizeof(float));
+    }
+#else
     packed_weight_ = malloc(pack_weight_size * sizeof(float));
+#endif
     if (packed_weight_ == nullptr) {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
@@ -21,6 +21,9 @@
 #include "src/kernel_registry.h"
 #include "nnacl/fp32/conv_common_fp32.h"
 #include "nnacl/fp32/matmul_fp32.h"
+#ifdef SERVER_INFERENCE
+#include "src/pack_weight_manager.h"
+#endif
 
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_INFER_INVALID;
@@ -210,12 +213,25 @@ int ConvolutionCPUKernel::MallocWeightBiasData() {
   size_t pack_weight_size = oc_block_num * in_channel * kernel_plane;
   if (!op_parameter_->is_train_session_) {
     CHECK_LESS_RETURN(MAX_MALLOC_SIZE, pack_weight_size * sizeof(float));
+#ifdef SERVER_INFERENCE
+    auto packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
+      in_tensors_[1], static_cast<size_t>(pack_weight_size) * sizeof(float));
+    packed_weight_ = packed.second;
+    weight_is_packed_ = packed.first;
+    if (weight_is_packed_ == lite::MALLOC && packed_weight_ == nullptr) {
+      packed_weight_ = malloc(pack_weight_size * sizeof(float));
+      memset(packed_weight_, 0, pack_weight_size * sizeof(float));
+    }
+#else
     packed_weight_ = malloc(pack_weight_size * sizeof(float));
+#endif
     if (packed_weight_ == nullptr) {
       MS_LOG(ERROR) << "malloc packed weight failed.";
       return RET_ERROR;
     }
+#ifndef SERVER_INFERENCE
     memset(packed_weight_, 0, pack_weight_size * sizeof(float));
+#endif
   }
 
   if (bias_data_ == nullptr) {
@@ -208,13 +208,25 @@ int ConvolutionWinogradCPUKernel::MallocWeightBiasData() {
   if (!op_parameter_->is_train_session_) {
     if (packed_weight_ == nullptr) {
       CHECK_LESS_RETURN(MAX_MALLOC_SIZE, trans_matrix_data_size);
+#ifdef SERVER_INFERENCE
+      auto packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(in_tensors_[1], trans_matrix_data_size);
+      packed_weight_ = packed.second;
+      weight_is_packed_ = packed.first;
+      if (weight_is_packed_ == lite::MALLOC && packed_weight_ == nullptr) {
+        packed_weight_ = malloc(trans_matrix_data_size);
+        memset(packed_weight_, 0, trans_matrix_data_size);
+      }
+#else
       packed_weight_ = malloc(trans_matrix_data_size);
+#endif
       if (packed_weight_ == nullptr) {
         MS_LOG(ERROR) << "malloc matrix_buffer failed.";
         return RET_MEMORY_FAILED;
       }
     }
+#ifndef SERVER_INFERENCE
     memset(packed_weight_, 0, trans_matrix_data_size);
+#endif
   }
 
   float matrix_a[64];
@@ -68,7 +68,19 @@ int MatmulFp32BaseCPUKernel::InitBufferA() {
     if (op_parameter_->is_train_session_) {
       a_pack_ptr_ = reinterpret_cast<float *>(workspace());
     } else {
+#ifdef SERVER_INFERENCE
+      if (!params_->a_const_) {
+        a_pack_ptr_ = reinterpret_cast<float *>(
+          ms_context_->allocator->Malloc(static_cast<size_t>(matrix_a_pack_size_) * sizeof(float)));
+      } else {
+        auto a_packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
+          in_tensors()[0], static_cast<size_t>(matrix_a_pack_size_) * sizeof(float));
+        a_pack_ptr_ = reinterpret_cast<float *>(a_packed.second);
+        a_is_packed_ = a_packed.first;
+      }
+#else
       a_pack_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_a_pack_size_ * sizeof(float)));
+#endif
     }
   }
   if (a_pack_ptr_ == nullptr) {
@@ -85,8 +97,20 @@ int MatmulFp32BaseCPUKernel::InitBufferB() {
   if (op_parameter_->is_train_session_) {
     b_pack_ptr_ = reinterpret_cast<float *>(workspace()) + matrix_a_pack_size_;
   } else {
+#ifdef SERVER_INFERENCE
+    if (params_->b_const_) {
+      auto b_packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
+        in_tensors()[1], static_cast<size_t>(matrix_b_pack_size_) * sizeof(float));
+      b_pack_ptr_ = reinterpret_cast<float *>(b_packed.second);
+      b_is_packed_ = b_packed.first;
+    } else {
+      b_pack_ptr_ = reinterpret_cast<float *>(
+        ms_context_->allocator->Malloc(static_cast<size_t>(matrix_b_pack_size_) * sizeof(float)));
+    }
+#else
     b_pack_ptr_ = reinterpret_cast<float *>(
       ms_context_->allocator->Malloc(static_cast<size_t>(matrix_b_pack_size_) * sizeof(float)));
+#endif
   }
   if (b_pack_ptr_ == nullptr) {
     MS_LOG(ERROR) << "malloc b_pack_ptr_ failed";
@@ -207,14 +231,26 @@ void MatmulFp32BaseCPUKernel::FreeBiasBuf() {
 
 void MatmulFp32BaseCPUKernel::FreeResizeBufA() {
   if (!vec_matmul_ && !op_parameter_->is_train_session_ && a_pack_ptr_ != nullptr && is_pack_) {
-    ms_context_->allocator->Free(a_pack_ptr_);
+#ifdef SERVER_INFERENCE
+    if (a_is_packed_ == lite::MALLOC) {
+#endif
+      ms_context_->allocator->Free(a_pack_ptr_);
+#ifdef SERVER_INFERENCE
+    }
+#endif
   }
   a_pack_ptr_ = nullptr;
 }
 
 void MatmulFp32BaseCPUKernel::FreeResizeBufB() {
   if (!op_parameter_->is_train_session_ && b_pack_ptr_ != nullptr && is_pack_) {
-    ms_context_->allocator->Free(b_pack_ptr_);
+#ifdef SERVER_INFERENCE
+    if (b_is_packed_ == lite::MALLOC) {
+#endif
+      ms_context_->allocator->Free(b_pack_ptr_);
+#ifdef SERVER_INFERENCE
+    }
+#endif
   }
   b_pack_ptr_ = nullptr;
 }
@@ -385,11 +421,17 @@ int MatmulFp32BaseCPUKernel::Prepare() {
     if (InitBufferA() != RET_OK) {
      return RET_ERROR;
     }
-    ret = InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data()));
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "InitMatrixA failed!";
-      return ret;
-    }
+#ifdef SERVER_INFERENCE
+    if (a_is_packed_ != lite::PACKED) {
+#endif
+      ret = InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data()));
+      if (ret != RET_OK) {
+        MS_LOG(ERROR) << "InitMatrixA failed!";
+        return ret;
+      }
+#ifdef SERVER_INFERENCE
+    }
+#endif
   }
   if (params_->b_const_) {
     auto b_tensor = in_tensors_[1];
@@ -397,10 +439,16 @@ int MatmulFp32BaseCPUKernel::Prepare() {
     if (InitBufferB() != RET_OK) {
       return RET_ERROR;
     }
-    if (InitMatrixB(static_cast<float *>(b_tensor->data())) != RET_OK) {
-      MS_LOG(ERROR) << "InitMatrixB failed!";
-      return RET_ERROR;
-    }
+#ifdef SERVER_INFERENCE
+    if (b_is_packed_ != lite::PACKED) {
+#endif
+      if (InitMatrixB(static_cast<float *>(b_tensor->data())) != RET_OK) {
+        MS_LOG(ERROR) << "InitMatrixB failed!";
+        return RET_ERROR;
+      }
+#ifdef SERVER_INFERENCE
+    }
+#endif
   }
   return RET_OK;
 }
@@ -19,6 +19,9 @@
 
 #include <vector>
 #include "src/inner_kernel.h"
+#ifdef SERVER_INFERENCE
+#include "src/pack_weight_manager.h"
+#endif
 #include "nnacl/matmul_parameter.h"
 #include "include/errorcode.h"
 #include "src/common/common.h"
@@ -77,6 +80,10 @@ class MatmulFp32BaseCPUKernel : public InnerKernel {
   MatMulParameter *params_ = nullptr;
   float *a_pack_ptr_ = nullptr;
   float *b_pack_ptr_ = nullptr;
+#ifdef SERVER_INFERENCE
+  lite::PackStatus a_is_packed_ = lite::MALLOC;
+  lite::PackStatus b_is_packed_ = lite::MALLOC;
+#endif
   int a_batch_ = 1;
   int b_batch_ = 1;
   std::vector<int> a_offset_;
@@ -119,6 +119,7 @@ set(LITE_SRC ${API_SRC}
     ${SRC_DIR}/errorcode.cc
     ${SRC_DIR}/weight_decoder.cc
    ${SRC_DIR}/huffman_decode.cc
+    ${SRC_DIR}/pack_weight_manager.cc
     ${SRC_DIR}/delegate/tensorrt/distribution/distribution_base.cc
 )
 