!31174 [MS][LITE][STABLE]optimize code | sync from master

Merge pull request !31174 from chenjianping/r1.6_dev
This commit is contained in:
i-robot 2022-03-14 03:28:29 +00:00 committed by Gitee
commit f188616162
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
18 changed files with 140 additions and 105 deletions

View File

@ -194,6 +194,14 @@ int ActorMgr::EnqueueMessage(const mindspore::ActorReference actor, std::unique_
int ActorMgr::Send(const AID &to, std::unique_ptr<MessageBase> msg, bool remoteLink, bool isExactNotRemote) {
// The destination is local
#ifdef BUILD_LITE
auto actor = GetActor(to);
if (actor != nullptr) {
return EnqueueMessage(actor, std::move(msg));
} else {
return ACTOR_NOT_FIND;
}
#else
if (IsLocalAddres(to)) {
auto actor = GetActor(to);
if (actor != nullptr) {
@ -223,6 +231,7 @@ int ActorMgr::Send(const AID &to, std::unique_ptr<MessageBase> msg, bool remoteL
return IO_NOT_FIND;
}
}
#endif
}
AID ActorMgr::Spawn(const ActorReference &actor, bool shareThread) {
@ -235,7 +244,7 @@ AID ActorMgr::Spawn(const ActorReference &actor, bool shareThread) {
MS_LOG(DEBUG) << "ACTOR was spawned,a=" << actor->GetAID().Name().c_str();
if (shareThread) {
auto mailbox = std::unique_ptr<MailBox>(new (std::nothrow) NonblockingMailBox());
auto mailbox = std::make_unique<NonblockingMailBox>();
auto hook = std::unique_ptr<std::function<void()>>(
new std::function<void()>([actor]() { ActorMgr::GetActorMgrRef()->SetActorReady(actor); }));
// the mailbox has this hook, the hook holds the actor reference, the actor has the mailbox. this is a cycle which

View File

@ -33,7 +33,7 @@ std::list<std::unique_ptr<MessageBase>> *BlockingMailBox::GetMsgs() {
while (enqueMailBox->empty()) {
cond.wait(ulk, [this] { return !this->enqueMailBox->empty(); });
}
SwapMailBox(&enqueMailBox, &dequeMailBox);
enqueMailBox->swap(*dequeMailBox);
ret = dequeMailBox;
}
return ret;
@ -62,7 +62,7 @@ std::list<std::unique_ptr<MessageBase>> *NonblockingMailBox::GetMsgs() {
released_ = true;
return nullptr;
}
SwapMailBox(&enqueMailBox, &dequeMailBox);
dequeMailBox->swap(*enqueMailBox);
ret = dequeMailBox;
released_ = false;
}

View File

@ -34,11 +34,6 @@ class MailBox {
virtual std::unique_ptr<MessageBase> GetMsg() = 0;
inline void SetNotifyHook(std::unique_ptr<std::function<void()>> &&hook) { notifyHook = std::move(hook); }
inline bool TakeAllMsgsEachTime() { return takeAllMsgsEachTime; }
// Exchange the two mailbox-list slots in place: after the call, *box1 points
// at the list formerly referenced by *box2 and vice versa. Only the pointers
// are swapped; no messages are copied or moved.
void SwapMailBox(std::list<std::unique_ptr<MessageBase>> **box1, std::list<std::unique_ptr<MessageBase>> **box2) {
  auto *former_first = *box1;
  *box1 = *box2;
  *box2 = former_first;
}
protected:
// if this flag is true, GetMsgs() should be invoked to take all enqueued msgs each time, otherwise we can only get

View File

@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.12)
project(Lite)
set(BUILD_LITE "on")
add_compile_definitions(BUILD_LITE)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/secure_option.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_link_option.cmake)

View File

@ -21,7 +21,13 @@
namespace mindspore {
namespace lite {
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#ifndef MS_UNLIKELY
#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64)
#define MS_UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define MS_UNLIKELY(x) x
#endif
#endif
enum NCHW_SHAPE { NCHW_N = 0, NCHW_C = 1, NCHW_H = 2, NCHW_W = 3 };
enum NHWC_SHAPE { NHWC_N = 0, NHWC_H = 1, NHWC_W = 2, NHWC_C = 3 };

View File

@ -41,7 +41,7 @@ int GetCoreNum() {
}
void SetNumaBindStrategy(std::vector<std::vector<int>> *all_model_bind_list, int thread_num, int node_id) {
if (UNLIKELY(thread_num == 0)) {
if (MS_UNLIKELY(thread_num == 0)) {
MS_LOG(ERROR) << "thread num is zero.";
return;
}

View File

@ -360,18 +360,20 @@ ThreadPool *InnerContext::thread_pool() const { return thread_pool_; }
bool InnerContext::device_and_pkg_support_fp16() const { return this->device_and_pkg_support_fp16_; }
std::set<void *> InnerContext::GetLinkInfo(void *pre) const {
if (link_info_.find(pre) == link_info_.end()) {
auto iter = link_info_.find(pre);
if (iter == link_info_.end()) {
MS_LOG(DEBUG) << "Not found precursor in link information.";
return {};
}
return link_info_.at(pre);
return iter->second;
}
std::unordered_map<void *, std::set<void *>> InnerContext::GetAllLinkInfo() const { return link_info_; }
void InnerContext::SetLinkInfo(void *pre, void *suc) {
if (link_info_.find(pre) != link_info_.end()) {
link_info_.at(pre).insert(suc);
auto iter = link_info_.find(pre);
if (iter != link_info_.end()) {
iter->second.insert(suc);
return;
}
std::set<void *> suc_set{suc};
@ -385,9 +387,10 @@ void InnerContext::SetAllLinkInfo(const std::unordered_map<void *, std::set<void
void InnerContext::ReplaceLinkInfoReceiverWithNewOne(void *new_receiver, void *old_receiver) {
for (auto &info : link_info_) {
auto &receivers = info.second;
if (receivers.find(old_receiver) != receivers.end()) {
auto iter = receivers.find(old_receiver);
if (iter != receivers.end()) {
receivers.erase(iter);
receivers.insert(new_receiver);
receivers.erase(old_receiver);
}
}
}

View File

@ -20,6 +20,7 @@
#include "mindrt/include/mindrt.hpp"
#include "src/lite_kernel_util.h"
#include "src/common/tensor_util.h"
#include "src/common/common.h"
#include "src/runtime/inner_allocator.h"
#include "src/runtime/kernel/arm/base/partial_fusion.h"
#ifndef CONTROLFLOW_TENSORLIST_CLIP
@ -39,14 +40,14 @@ void LiteOpActor::RunOpData(OpData<lite::Tensor> *inputs, OpContext<lite::Tensor
InitInputData();
auto ret = RunKernel(*(reinterpret_cast<const KernelCallBack *>(context->kernel_call_back_before_)),
*(reinterpret_cast<const KernelCallBack *>(context->kernel_call_back_after_)));
auto ret = kernel_->Execute(*(reinterpret_cast<const KernelCallBack *>(context->kernel_call_back_before_)),
*(reinterpret_cast<const KernelCallBack *>(context->kernel_call_back_after_)));
input_op_datas_.erase(op_uuid);
if (ret != RET_OK) {
input_op_datas_.erase(op_uuid);
MS_LOG(ERROR) << "run kernel failed, name: " << kernel_->name();
context->SetFailed(ret);
return;
}
input_op_datas_.erase(op_uuid);
AsyncOutput(context);
SetOutputData(context);
return;
@ -89,7 +90,7 @@ int LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *act
isolate_input_map_ = input_map;
std::vector<kernel::LiteKernel *> kernels{};
std::transform(actors->begin(), actors->end(), std::back_inserter(kernels),
[](std::shared_ptr<LiteOpActor> actor) { return actor->kernel_; });
[](const std::shared_ptr<LiteOpActor> &actor) { return actor->kernel_; });
size_t in_tensor_size = kernel_->in_tensors().size();
for (size_t i = 0; i < in_tensor_size; i++) {
Tensor *old_tensor = kernel_->in_tensors()[i];
@ -112,7 +113,8 @@ int LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *act
}
TypeId new_data_type = GetSubgraphInTensorDataType(kernel_, old_tensor);
Tensor *new_tensor = new Tensor(new_data_type, old_tensor->shape(), old_tensor->format(), old_tensor->category());
Tensor *new_tensor =
new (std::nothrow) Tensor(new_data_type, old_tensor->shape(), old_tensor->format(), old_tensor->category());
if (new_tensor == nullptr) {
MS_LOG(ERROR) << "new Tensor failed.";
return RET_NULL_PTR;
@ -235,38 +237,41 @@ int LiteOpActor::UpdateActorOutput() {
}
#endif
bool LiteOpActor::ArrowHasCompiled(const AID &actor_name, const size_t &to_index,
bool LiteOpActor::ArrowHasCompiled(const AID &actor_name, size_t to_index,
const std::unordered_map<AID, std::set<size_t>> &receiver_index_set) {
if (receiver_index_set.find(actor_name) != receiver_index_set.end()) {
return receiver_index_set.at(actor_name).find(to_index) != receiver_index_set.at(actor_name).end();
auto iter = receiver_index_set.find(actor_name);
if (iter != receiver_index_set.end()) {
return iter->second.find(to_index) != iter->second.end();
}
return false;
}
void LiteOpActor::MarkArrowAsCompiled(const AID *actor_name, const size_t *to_index,
void LiteOpActor::MarkArrowAsCompiled(const AID *actor_name, size_t to_index,
std::unordered_map<AID, std::set<size_t>> *receiver_index_set) {
if (receiver_index_set->find(*actor_name) == receiver_index_set->end()) {
std::set<size_t> tmp{*to_index};
std::set<size_t> tmp{to_index};
receiver_index_set->insert(std::pair<AID, std::set<size_t>>(*actor_name, tmp));
} else {
receiver_index_set->at(*actor_name).insert(*to_index);
receiver_index_set->at(*actor_name).insert(to_index);
}
}
int LiteOpActor::CreateCommonArrow(const std::unordered_map<void *, std::set<std::pair<AID, size_t>>> &receivers_map,
const std::set<void *> &receiver_tensors, const size_t &output_index,
std::unordered_map<AID, std::set<size_t>> *receiver_index_set) {
std::unordered_map<void *, std::set<std::pair<AID, size_t>>>::const_iterator iter;
for (auto receiver_tensor : receiver_tensors) {
if (receivers_map.find(receiver_tensor) == receivers_map.end()) {
iter = receivers_map.find(receiver_tensor);
if (iter == receivers_map.end()) {
MS_LOG(DEBUG) << "not a useful receiver.";
continue;
}
auto receiver_set = receivers_map.at(receiver_tensor);
auto receiver_set = iter->second;
for (auto item : receiver_set) {
if (ArrowHasCompiled(item.first, item.second, *receiver_index_set)) {
continue;
}
MarkArrowAsCompiled(&(item.first), &(item.second), receiver_index_set);
MarkArrowAsCompiled(&(item.first), item.second, receiver_index_set);
auto arrow = std::make_shared<DataArrow>(output_index, item.first, item.second);
MS_CHECK_TRUE_MSG(arrow != nullptr, RET_ERROR, "create arrow failed.");
output_data_arrows_.push_back(arrow);
@ -352,8 +357,9 @@ void LiteOpActor::InitInputData() {
}
void LiteOpActor::AsyncOutput(OpContext<Tensor> *context) {
for (size_t i = 0; i < output_data_arrows_.size(); i++) {
auto data = outputs_data_.at(i);
auto output_size = output_data_arrows_.size();
for (size_t i = 0; i < output_size; ++i) {
auto data = outputs_data_[i];
Async(output_data_arrows_[i]->to_op_id_, &mindspore::OpActor<Tensor>::RunOpData, data.get(), context);
}
}
@ -372,11 +378,11 @@ int LiteOpActor::PrepareOutputData() {
auto &arrow = output_data_arrows_[i];
auto data = std::make_shared<OpData<Tensor>>(this->GetAID(), (kernel_->out_tensors()).at(arrow->from_output_index_),
static_cast<int>(arrow->to_input_index_));
if (data == nullptr) {
if (MS_UNLIKELY(data == nullptr)) {
MS_LOG(ERROR) << "new output_data failed.";
return RET_NULL_PTR;
}
outputs_data_.at(i) = data;
outputs_data_[i] = data;
}
return RET_OK;
}
@ -389,6 +395,7 @@ std::vector<std::shared_ptr<LiteOpActor>> CreateOpActor(const std::vector<kernel
MS_LOG(ERROR) << "thread pool is nullptr";
return actors;
}
actors.reserve(kernels.size());
for (auto &kernel : kernels) {
/* make subgraph name (actor name) unique */
kernel->set_name(kernel->name() + "_" + to_string(actor_count++));

View File

@ -52,7 +52,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
}
void RunOpData(OpData<lite::Tensor> *input_data, OpContext<lite::Tensor> *context = nullptr) override;
virtual int CompileArrow(const std::unordered_map<void *, std::set<std::pair<AID, size_t>>> &receivers_map);
int RunKernel(const KernelCallBack &before, const KernelCallBack &after) {
int RunKernel(KernelCallBack before, KernelCallBack after) {
auto ret = kernel_->Execute(before, after);
if (RET_OK != ret) {
MS_LOG(ERROR) << "run kernel failed, name: " << kernel_->name();
@ -107,9 +107,9 @@ class LiteOpActor : public OpActor<lite::Tensor> {
const std::set<void *> &receiver_tensors, const size_t &output_index,
std::unordered_map<AID, std::set<size_t>> *receiver_index_set);
int CreateEmptyArrow(const size_t &output_index);
bool ArrowHasCompiled(const AID &actor_name, const size_t &to_index,
bool ArrowHasCompiled(const AID &actor_name, size_t to_index,
const std::unordered_map<AID, std::set<size_t>> &receiver_index_set);
void MarkArrowAsCompiled(const AID *actor_name, const size_t *to_index,
void MarkArrowAsCompiled(const AID *actor_name, size_t to_index,
std::unordered_map<AID, std::set<size_t>> *receiver_index_set);
private:

View File

@ -564,9 +564,9 @@ int LiteSession::IsolateOutputTensor() {
if (src_tensor->IsGraphInput()) {
continue;
}
Tensor *new_tensor =
new Tensor(src_tensor->data_type(), src_tensor->shape(), src_tensor->format(), Category::GRAPH_OUTPUT);
if (new_tensor == nullptr) {
Tensor *new_tensor = new (std::nothrow)
Tensor(src_tensor->data_type(), src_tensor->shape(), src_tensor->format(), Category::GRAPH_OUTPUT);
if (MS_UNLIKELY(new_tensor == nullptr)) {
MS_LOG(ERROR) << "duplicate new output failed.";
return RET_NULL_PTR;
}
@ -590,12 +590,14 @@ int LiteSession::IsolateOutputTensor() {
/* set new tensor for calculate */
for (auto subgraph : kernels_) {
/* subgraph input and output */
for (size_t i = 0; i < subgraph->in_tensors().size(); i++) {
auto in_size = subgraph->in_tensors().size();
for (size_t i = 0; i < in_size; ++i) {
if (subgraph->in_tensors()[i] == src_tensor) {
subgraph->set_in_tensor(new_tensor, i);
}
}
for (size_t i = 0; i < subgraph->out_tensors().size(); i++) {
auto out_size = subgraph->out_tensors().size();
for (size_t i = 0; i < out_size; ++i) {
if (subgraph->out_tensors()[i] == src_tensor) {
subgraph->set_out_tensor(new_tensor, i);
}
@ -607,14 +609,18 @@ int LiteSession::IsolateOutputTensor() {
#endif
/* node input and output */
auto nodes = reinterpret_cast<kernel::SubGraphKernel *>(subgraph)->nodes();
for (size_t i = 0; i < nodes.size(); i++) {
auto nodes_size = nodes.size();
for (size_t i = 0; i < nodes_size; ++i) {
auto node = nodes[i];
for (size_t j = 0; j < node->out_tensors().size(); j++) {
out_size = node->out_tensors().size();
for (size_t j = 0; j < out_size; ++j) {
if (node->out_tensors()[j] == src_tensor) {
node->set_out_tensor(new_tensor, j);
break;
}
}
for (size_t j = 0; j < node->in_tensors().size(); j++) {
in_size = node->in_tensors().size();
for (size_t j = 0; j < in_size; ++j) {
if (node->in_tensors()[j] == src_tensor) {
node->set_in_tensor(new_tensor, j);
}
@ -906,11 +912,7 @@ int LiteSession::RunGraph(const KernelCallBack &before, const KernelCallBack &af
return ret;
}
MS_ASSERT(this->context_ != nullptr);
if (before == nullptr && after == nullptr) {
ret = executor_->Run(this->inputs_, this->outputs_, this->kernels_);
} else {
ret = executor_->Run(this->inputs_, this->outputs_, this->kernels_, before, after);
}
ret = executor_->Run(this->inputs_, this->outputs_, this->kernels_, before, after);
if (ret != RET_OK) {
MS_LOG(ERROR) << "RunGraph failed : " << ret;
}
@ -1236,7 +1238,7 @@ void LiteSession::ResetInputsShape(const std::vector<std::vector<int>> &dims) {
}
int LiteSession::ReSizeKernels(const std::vector<kernel::LiteKernel *> &kernels,
const std::unordered_map<Tensor *, Tensor *> isolate_input_map) {
const std::unordered_map<Tensor *, Tensor *> &isolate_input_map) {
for (auto kernel : kernels) {
if (kernel == nullptr) {
MS_LOG(ERROR) << "input kernel is nullptr!";

View File

@ -117,7 +117,7 @@ class LiteSession : public session::LiteSession {
#endif
static int ReSizeKernels(
const std::vector<kernel::LiteKernel *> &kernels,
const std::unordered_map<Tensor *, Tensor *> isolate_input_map = std::unordered_map<Tensor *, Tensor *>());
const std::unordered_map<Tensor *, Tensor *> &isolate_input_map = std::unordered_map<Tensor *, Tensor *>());
static void FreePackOpWeight(const std::vector<kernel::LiteKernel *> &kernels);
#ifdef SERVER_INFERENCE
int IniPackWeightData(Model *model);

View File

@ -18,6 +18,7 @@
#include <memory>
#include "src/lite_mindrt.h"
#include "include/errorcode.h"
#include "src/common/common.h"
#include "src/common/tensor_util.h"
#ifdef ENABLE_FP16
#include "nnacl/base/cast_base.h"
@ -27,7 +28,8 @@
namespace mindspore::lite {
int MindrtExecutor::PrepareGraphInput(const std::vector<kernel::LiteKernel *> &kernels,
const std::vector<Tensor *> &inputs) {
for (size_t j = 0; j < kernels.size(); ++j) {
auto kernels_size = kernels.size();
for (size_t j = 0; j < kernels_size; ++j) {
auto in_tensor_size = kernels[j]->in_tensors().size();
for (size_t k = 0; k < in_tensor_size; ++k) {
auto tensor = kernels[j]->in_tensors()[k];
@ -40,7 +42,7 @@ int MindrtExecutor::PrepareGraphInput(const std::vector<kernel::LiteKernel *> &k
return RET_ERROR;
}
auto data = std::make_shared<OpData<Tensor>>(op_actors_[j]->GetAID(), inputs.at(idx), static_cast<int>(k));
if (data == nullptr) {
if (MS_UNLIKELY(data == nullptr)) {
MS_LOG(ERROR) << "new opdata failed.";
return RET_NULL_PTR;
}
@ -52,7 +54,8 @@ int MindrtExecutor::PrepareGraphInput(const std::vector<kernel::LiteKernel *> &k
int MindrtExecutor::PrepareGraphOutput(const std::vector<kernel::LiteKernel *> &kernels,
const std::vector<Tensor *> &outputs) {
for (size_t i = 0; i < outputs.size(); ++i) {
auto outputs_size = outputs.size();
for (size_t i = 0; i < outputs_size; ++i) {
Tensor *graph_output_tensor = outputs[i];
if (graph_output_tensor->IsGraphInput()) {
continue;
@ -66,8 +69,8 @@ int MindrtExecutor::PrepareGraphOutput(const std::vector<kernel::LiteKernel *> &
});
MS_ASSERT(current_output_map != isolate_output_map_->end());
Tensor *subgraph_output_tensor = current_output_map->first;
for (size_t j = 0; j < kernels.size(); ++j) {
auto kernels_size = kernels.size();
for (size_t j = 0; j < kernels_size; ++j) {
auto out_tensor_size = kernels[j]->out_tensors().size();
for (size_t k = 0; k < out_tensor_size; ++k) {
if (subgraph_output_tensor != kernels[j]->out_tensors()[k]) {
@ -75,7 +78,7 @@ int MindrtExecutor::PrepareGraphOutput(const std::vector<kernel::LiteKernel *> &
}
auto data =
std::make_shared<OpData<Tensor>>(op_actors_[j]->GetAID(), subgraph_output_tensor, static_cast<int>(k));
if (data == nullptr) {
if (MS_UNLIKELY(data == nullptr)) {
MS_LOG(ERROR) << "new opdata failed.";
return RET_NULL_PTR;
}
@ -114,8 +117,9 @@ std::unordered_map<void *, std::set<std::pair<AID, size_t>>> MindrtExecutor::Bui
for (size_t i = 0; i < input_tensors.size(); ++i) {
auto key = input_tensors[i];
auto pair = std::make_pair(op_actor->GetAID(), i);
if (receivers_map.find(key) != receivers_map.end()) {
receivers_map.at(key).insert(pair);
auto iter = receivers_map.find(key);
if (iter != receivers_map.end()) {
iter->second.emplace(pair);
} else {
std::set<std::pair<AID, size_t>> tmp_set{pair};
receivers_map[input_tensors[i]] = tmp_set;
@ -127,7 +131,7 @@ std::unordered_map<void *, std::set<std::pair<AID, size_t>>> MindrtExecutor::Bui
int MindrtExecutor::LinkActors() {
auto receivers_map = BuildReceiverMap();
for (auto op_actor : op_actors_) {
for (auto &&op_actor : op_actors_) {
auto ret = op_actor->CompileArrow(receivers_map);
if (ret != RET_OK) {
MS_LOG(ERROR) << "actor: " << op_actor->GetAID() << " compile arrow failed.";
@ -138,7 +142,7 @@ int MindrtExecutor::LinkActors() {
}
int MindrtExecutor::PostInitActors() {
for (auto actor : op_actors_) {
for (auto &&actor : op_actors_) {
auto ret = actor->PostInit();
if (ret != RET_OK) {
MS_LOG(ERROR) << "PrepareGraphOutput failed, actor aid: " << actor->GetAID();

View File

@ -18,7 +18,6 @@
#include "src/common/log_adapter.h"
#include "src/common/utils.h"
#include "src/common/common.h"
#include "src/runtime/numa_adapter.h"
using mindspore::numa::NUMAAdapter;
@ -49,13 +48,13 @@ void *MemOperator::Allocate(size_t rounded_size, int node_id, size_t *allocate_s
int64_t left = 0;
if (node_id >= 0) {
// allocate memory from numa node
MemoryInfo mem_info = NUMAAdapter::GetInstance()->GetNodeSize(node_id);
MemoryInfo mem_info = numa_instance_->GetNodeSize(node_id);
free_count = mem_info.free;
} else {
free_count = lite::GetFreeMemory();
}
if (UNLIKELY(static_cast<int64_t>(rounded_size) >= free_count)) {
if (MS_UNLIKELY(static_cast<int64_t>(rounded_size) >= free_count)) {
MS_LOG(ERROR) << "No enough memory left!node_id: " << node_id << ", request: " << rounded_size
<< ", free: " << free_count << ", least free request: " << least_free_memory_;
return nullptr;
@ -75,16 +74,16 @@ void *MemOperator::Allocate(size_t rounded_size, int node_id, size_t *allocate_s
data = _aligned_malloc(allocate_tmp_size, kMemAlginSize);
#else
if (node_id >= 0) {
data = NUMAAdapter::GetInstance()->Malloc(node_id, static_cast<size_t>(allocate_tmp_size));
data = numa_instance_->Malloc(node_id, static_cast<size_t>(allocate_tmp_size));
} else {
auto ret = posix_memalign(&data, kMemAlginSize, static_cast<size_t>(allocate_tmp_size));
if (UNLIKELY(ret != 0)) {
if (MS_UNLIKELY(ret != 0)) {
MS_LOG(ERROR) << "posix_memalign failed!ret: " << ret;
return nullptr;
}
}
#endif
if (UNLIKELY(data == nullptr)) {
if (MS_UNLIKELY(data == nullptr)) {
MS_LOG(ERROR) << "malloc data failed!";
return nullptr;
}
@ -147,7 +146,7 @@ void *MemOperator::Malloc(size_t size) {
// todo kAllocUnitSize can be replaced by config
size_t allocate_size;
void *data = Allocate(rounded_size, node_id_, &allocate_size);
if (UNLIKELY(data == nullptr)) {
if (MS_UNLIKELY(data == nullptr)) {
return nullptr;
}
all_datas_.emplace(data, allocate_size);
@ -169,7 +168,7 @@ void *MemOperator::Malloc(size_t size) {
// return memory to the memory pool
void MemOperator::Free(void *ptr) {
if (UNLIKELY(ptr == nullptr)) {
if (MS_UNLIKELY(ptr == nullptr)) {
return;
}
std::lock_guard<std::mutex> locker(mutex_);
@ -230,9 +229,10 @@ void MemOperator::EraseFreeBlock(const int64_t index) {
}
MemOperator::MemOperator(int node_id) {
if (node_id >= 0 && NUMAAdapter::GetInstance()->Available()) {
numa_instance_ = NUMAAdapter::GetInstance();
if (node_id >= 0 && numa_instance_->Available()) {
node_id_ = node_id;
auto mem_info = NUMAAdapter::GetInstance()->GetNodeSize(node_id_);
auto mem_info = numa_instance_->GetNodeSize(node_id_);
if (mem_info.total <= 0) {
return;
}
@ -247,7 +247,7 @@ MemOperator::MemOperator(int node_id) {
auto *block = GetBlock();
size_t allocate_size;
block->data_ = Allocate(kAllocUnitSize, node_id, &allocate_size);
if (UNLIKELY(block->data_ == nullptr)) {
if (MS_UNLIKELY(block->data_ == nullptr)) {
return;
}
all_datas_.emplace(block->data_, allocate_size);
@ -262,7 +262,7 @@ MemOperator::~MemOperator() {
_aligned_free(data.first);
#else
if (node_id_ >= 0) {
NUMAAdapter::GetInstance()->Free(data.first, data.second);
numa_instance_->Free(data.first, data.second);
} else {
free(data.first);
}
@ -328,7 +328,7 @@ std::shared_ptr<MemOperator> DynamicMemManager::GetMemOperator(const int node_id
iter = nodes_mem_.find(numa_node_id);
if (iter == nodes_mem_.end()) {
mem_oper = std::make_shared<MemOperator>(numa_node_id);
if (UNLIKELY(mem_oper == nullptr)) {
if (MS_UNLIKELY(mem_oper == nullptr)) {
MS_LOG(ERROR) << "make_shared MemOperator failed!";
return nullptr;
}

View File

@ -23,6 +23,7 @@
#include <map>
#include <unordered_map>
#include <deque>
#include "src/runtime/numa_adapter.h"
namespace mindspore {
struct Block {
@ -39,7 +40,7 @@ struct Block {
class MemOperator {
public:
explicit MemOperator(int node_id);
virtual ~MemOperator();
~MemOperator();
void *Malloc(size_t size);
void Free(void *ptr);
@ -62,6 +63,7 @@ class MemOperator {
// all data blocks
size_t block_count_ = 0;
int64_t garbage_block_;
std::shared_ptr<numa::NUMAAdapter> numa_instance_ = nullptr;
std::mutex mutex_;
std::vector<Block> blocks_;
// key: data size, value: Block index

View File

@ -23,6 +23,7 @@ namespace numa {
namespace {
static constexpr int kSuccess = 0;
static constexpr int kBitsPerByte = 8;
static constexpr auto kBitsPerMask = static_cast<int>(sizeof(uint64_t) * kBitsPerByte);
} // namespace
NUMAAdapter::NUMAAdapter() {
@ -34,7 +35,7 @@ NUMAAdapter::NUMAAdapter() {
}
numa_interfaces_.numa_available = reinterpret_cast<int (*)(void)>(dlsym(handle_, "numa_available"));
if (UNLIKELY(numa_interfaces_.numa_available == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_available == nullptr)) {
MS_LOG(ERROR) << "numa_available not found!";
}
if (numa_interfaces_.numa_available() < 0) {
@ -46,64 +47,64 @@ NUMAAdapter::NUMAAdapter() {
available_ = true;
numa_interfaces_.numa_num_configured_nodes =
reinterpret_cast<int (*)(void)>(dlsym(handle_, "numa_num_configured_nodes"));
if (UNLIKELY(numa_interfaces_.numa_num_configured_nodes == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_num_configured_nodes == nullptr)) {
MS_LOG(ERROR) << "numa_num_configured_nodes not found!";
available_ = false;
}
numa_interfaces_.numa_num_task_cpus = reinterpret_cast<int (*)(void)>(dlsym(handle_, "numa_num_task_cpus"));
if (UNLIKELY(numa_interfaces_.numa_num_task_cpus == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_num_task_cpus == nullptr)) {
MS_LOG(ERROR) << "numa_num_task_cpus not found!";
available_ = false;
}
numa_interfaces_.numa_node_to_cpus =
reinterpret_cast<int (*)(int node, struct bitmask *mask)>(dlsym(handle_, "numa_node_to_cpus"));
if (UNLIKELY(numa_interfaces_.numa_node_to_cpus == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_node_to_cpus == nullptr)) {
MS_LOG(ERROR) << "numa_node_to_cpus not found!";
available_ = false;
}
numa_interfaces_.numa_allocate_nodemask =
reinterpret_cast<struct bitmask *(*)(void)>(dlsym(handle_, "numa_allocate_nodemask"));
if (UNLIKELY(numa_interfaces_.numa_allocate_nodemask == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_allocate_nodemask == nullptr)) {
MS_LOG(ERROR) << "numa_allocate_nodemask not found!";
available_ = false;
}
numa_interfaces_.numa_bitmask_clearall =
reinterpret_cast<struct bitmask *(*)(struct bitmask *)>(dlsym(handle_, "numa_bitmask_clearall"));
if (UNLIKELY(numa_interfaces_.numa_bitmask_clearall == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_bitmask_clearall == nullptr)) {
MS_LOG(ERROR) << "numa_bitmask_clearall not found!";
available_ = false;
}
numa_interfaces_.numa_bitmask_setbit =
reinterpret_cast<struct bitmask *(*)(struct bitmask *, unsigned int)>(dlsym(handle_, "numa_bitmask_setbit"));
if (UNLIKELY(numa_interfaces_.numa_bitmask_setbit == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_bitmask_setbit == nullptr)) {
MS_LOG(ERROR) << "numa_bitmask_setbit not found!";
available_ = false;
}
numa_interfaces_.numa_bind = reinterpret_cast<void (*)(struct bitmask *)>(dlsym(handle_, "numa_bind"));
if (UNLIKELY(numa_interfaces_.numa_bind == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_bind == nullptr)) {
MS_LOG(ERROR) << "numa_bind not found!";
available_ = false;
}
numa_interfaces_.numa_bitmask_free =
reinterpret_cast<void (*)(struct bitmask *)>(dlsym(handle_, "numa_bitmask_free"));
if (UNLIKELY(numa_interfaces_.numa_bitmask_free == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_bitmask_free == nullptr)) {
MS_LOG(ERROR) << "numa_bitmask_free not found!";
available_ = false;
}
numa_interfaces_.numa_alloc_onnode =
reinterpret_cast<void *(*)(size_t size, int node)>(dlsym(handle_, "numa_alloc_onnode"));
if (UNLIKELY(numa_interfaces_.numa_alloc_onnode == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_alloc_onnode == nullptr)) {
MS_LOG(ERROR) << "numa_bitmask_free not found!";
available_ = false;
}
numa_interfaces_.numa_node_size64 =
reinterpret_cast<int64_t (*)(int node, int64_t *freep)>(dlsym(handle_, "numa_node_size64"));
if (UNLIKELY(numa_interfaces_.numa_node_size64 == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_node_size64 == nullptr)) {
MS_LOG(ERROR) << "numa_node_size64 not found!";
available_ = false;
}
numa_interfaces_.numa_free = reinterpret_cast<void (*)(void *start, size_t size)>(dlsym(handle_, "numa_free"));
if (UNLIKELY(numa_interfaces_.numa_free == nullptr)) {
if (MS_UNLIKELY(numa_interfaces_.numa_free == nullptr)) {
MS_LOG(ERROR) << "numa_free not found!";
available_ = false;
}
@ -119,7 +120,7 @@ void NUMAAdapter::Bind(int node_id) {
return;
}
auto bitmask = numa_interfaces_.numa_allocate_nodemask();
if (UNLIKELY(bitmask == nullptr)) {
if (MS_UNLIKELY(bitmask == nullptr)) {
MS_LOG(ERROR) << "bind numa_node " << node_id << " failed!";
return;
}
@ -172,7 +173,7 @@ std::vector<int> NUMAAdapter::GetCPUList(int node_id) {
return cpu_list;
}
int cpu_num = numa_interfaces_.numa_num_task_cpus();
if (UNLIKELY(cpu_num < 0)) {
if (MS_UNLIKELY(cpu_num < 0)) {
MS_LOG(ERROR) << "numa_num_task_cpus return " << cpu_num;
return cpu_list;
}
@ -180,12 +181,11 @@ std::vector<int> NUMAAdapter::GetCPUList(int node_id) {
int maskp_index = 0;
auto maskp = nodemask->maskp;
do {
if (UNLIKELY(maskp == nullptr)) {
if (MS_UNLIKELY(maskp == nullptr)) {
MS_LOG(ERROR) << "maskp is nullptr!";
break;
}
auto mask = *(maskp);
static constexpr auto kBitsPerMask = static_cast<int>(sizeof(decltype(mask)) * kBitsPerByte);
int step = static_cast<int>(maskp_index * kBitsPerMask);
for (int i = 0; i < kBitsPerMask; ++i) {
if (mask & 1) {
@ -219,6 +219,7 @@ NUMAAdapter::~NUMAAdapter() {
return;
}
(void)dlclose(handle_);
handle_ = nullptr;
}
} // namespace numa
} // namespace mindspore

View File

@ -16,12 +16,18 @@
#ifndef MINDSPORE_LITE_SRC_RUNTIME_NUMA_ADAPTER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_NUMA_ADAPTER_H_
#include <numa.h>
#include <cstdint>
#include <cstddef>
#include <vector>
#include <memory>
namespace mindspore {
namespace numa {
// Local declaration of a CPU/node bitmask, used with the libnuma entry points
// resolved via dlsym (e.g. numa_allocate_nodemask / numa_node_to_cpus).
// NOTE(review): this presumably mirrors libnuma's own `struct bitmask` layout,
// since pointers to it are passed across the dlsym'd interface — confirm
// against the numa.h of the targeted libnuma version.
struct bitmask {
  uint64_t size;   // presumably the mask length in bits — TODO confirm
  uint64_t *maskp; // mask storage; iterated as 64-bit words by GetCPUList
};
struct NUMAInterface {
int (*numa_available)(void);
int (*numa_num_configured_nodes)(void);
@ -44,12 +50,13 @@ struct MemoryInfo {
class NUMAAdapter {
public:
static NUMAAdapter *GetInstance() {
static NUMAAdapter instance;
return &instance;
static std::shared_ptr<NUMAAdapter> GetInstance() {
static std::shared_ptr<NUMAAdapter> instance = std::make_shared<NUMAAdapter>();
return instance;
}
virtual ~NUMAAdapter();
NUMAAdapter();
~NUMAAdapter();
inline bool Available() const { return false; }
void Bind(int node_id);
void *Malloc(int node_id, size_t size);
@ -60,8 +67,6 @@ class NUMAAdapter {
MemoryInfo GetNodeSize(int node_id);
private:
NUMAAdapter();
void *handle_; // numa.so handle
bool available_ = false;
NUMAInterface numa_interfaces_;

View File

@ -203,7 +203,7 @@ int Scheduler::HandleBuildinCpuKernelWeight(const kernel::SubGraphType belong_su
return RET_OK;
}
int Scheduler::InitKernels(std::vector<kernel::LiteKernel *> dst_kernels) {
int Scheduler::InitKernels(std::vector<kernel::LiteKernel *> &&dst_kernels) {
if (is_train_session_) {
return RET_OK;
}
@ -422,7 +422,7 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
}
#endif
ret = InitKernels(*dst_kernels);
ret = InitKernels(std::move(*dst_kernels));
if (ret != RET_OK) {
MS_LOG(ERROR) << "InitKernels failed.";
return ret;
@ -1558,7 +1558,7 @@ kernel::LiteKernel *FindAllSubGraphKernels(const std::vector<kernel::LiteKernel
}
} // namespace
int Scheduler::ConstructNormalSubGraphs(const std::vector<kernel::LiteKernel *> src_kernel,
int Scheduler::ConstructNormalSubGraphs(const std::vector<kernel::LiteKernel *> &src_kernel,
std::vector<kernel::LiteKernel *> *dst_kernel,
std::map<const kernel::LiteKernel *, bool> *is_kernel_finish) {
if (src_kernel.empty()) {

View File

@ -93,7 +93,7 @@ class Scheduler {
int FindProviderKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
const Model::Node *node, TypeId data_type, kernel::LiteKernel **kernel);
int InitKernels(std::vector<kernel::LiteKernel *> dst_kernels);
int InitKernels(std::vector<kernel::LiteKernel *> &&dst_kernels);
kernel::LiteKernel *SchedulePartialToKernel(const lite::Model::Node *src_node);
// schedule a partial node to a subgraph_kernel
std::vector<kernel::LiteKernel *> ScheduleSubGraphToSubGraphKernels(const int &subgraph_index);
@ -106,7 +106,7 @@ class Scheduler {
std::vector<lite::Tensor *> *in_tensors, std::vector<lite::Tensor *> *out_tensors,
TypeId prefer_data_type = kTypeUnknown);
// vector<LiteKernel/SubGraphKernel> --> vector<SubGraphKernel>
int ConstructNormalSubGraphs(const std::vector<kernel::LiteKernel *> src_kernel,
int ConstructNormalSubGraphs(const std::vector<kernel::LiteKernel *> &src_kernel,
std::vector<kernel::LiteKernel *> *dst_kernel,
std::map<const kernel::LiteKernel *, bool> *sinked_kernel_map);