!30550 【MS】【LITE】support NUMA

Merge pull request !30550 from chenjianping/master_dev
Authored by i-robot on 2022-02-25 06:16:00 +00:00, committed by Gitee
commit 87b47c76a9
19 changed files with 584 additions and 89 deletions

View File

@ -136,6 +136,7 @@ set(LITE_SRC
${LITE_SRC}
${CMAKE_CURRENT_SOURCE_DIR}/runtime/dynamic_mem_allocator.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/dynamic_mem_manager.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/numa_adapter.cc
${CMAKE_CURRENT_SOURCE_DIR}/pack_weight_manager.cc
)
endif()

View File

@ -21,6 +21,8 @@
namespace mindspore {
namespace lite {
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
enum NCHW_SHAPE { NCHW_N = 0, NCHW_C = 1, NCHW_H = 2, NCHW_W = 3 };
enum NHWC_SHAPE { NHWC_N = 0, NHWC_H = 1, NHWC_W = 2, NHWC_C = 3 };
enum HWCK_SHAPE { HWCK_H = 0, HWCK_W = 1, HWCK_C = 2, HWCK_K = 3 };
@ -57,6 +59,11 @@ static const char *const kMSCacheModelPath = "cache_model_path";
static const char *const kMSCacheVocabSize = "vocab_size";
static const char *const kMSCacheDeviceSize = "device_cache_size";
static const char *const kMSCacheSerializePath = "serialize_path";
// config
#ifdef SERVER_INFERENCE
static const char *const kConfigServerInference = "server_inference";
static const char *const kConfigNUMANodeId = "numa_node_id";
#endif
} // namespace lite
} // namespace mindspore
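
For orientation, these two keys are consumed later in this diff: model_worker.cc writes the node id via UpdateConfig, and ModelImpl::CreateLiteSession reads it back to call SetNodeId. A minimal sketch of the producing side (node id illustrative):

// Sketch mirroring the UpdateConfig call in model_worker.cc below:
// request that this model's session allocate on NUMA node 0.
model->UpdateConfig(lite::kConfigServerInference,
                    {lite::kConfigNUMANodeId, std::to_string(0)});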

View File

@ -27,6 +27,9 @@
#include <sys/types.h>
#include <sys/param.h>
#endif
#ifdef SERVER_INFERENCE
#include <sys/sysinfo.h>
#endif
namespace mindspore {
namespace lite {
@ -178,5 +181,17 @@ size_t GetMaxMallocSize() {
#endif
return max_malloc_size;
}
#ifdef SERVER_INFERENCE
int64_t GetFreeMemory() {
struct sysinfo info;
auto ret = sysinfo(&info);
if (ret != 0) {
MS_LOG(ERROR) << "sysinfo failed!ret = " << ret;
return 0;
}
return static_cast<int64_t>(info.freeram);
}
#endif
} // namespace lite
} // namespace mindspore
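
One caveat: struct sysinfo expresses sizes in multiples of info.mem_unit bytes, so returning freeram raw undercounts on kernels where mem_unit != 1. A hedged variant (hypothetical helper, not part of this change):

int64_t GetFreeMemoryBytes() {
  struct sysinfo info;
  if (sysinfo(&info) != 0) {
    return 0;
  }
  // sysinfo sizes are counted in units of info.mem_unit bytes.
  return static_cast<int64_t>(info.freeram) * static_cast<int64_t>(info.mem_unit);
}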

View File

@ -43,6 +43,9 @@ uint64_t GetTimeUs();
bool IsSupportSDot();
size_t GetMaxMallocSize();
#ifdef SERVER_INFERENCE
int64_t GetFreeMemory();
#endif
#ifdef __ANDROID__
uint32_t getHwCap(int hwcap_type);

View File

@ -34,6 +34,9 @@
#include "src/lite_session.h"
#include "src/common/file_utils.h"
#include "src/common/config_file.h"
#ifdef SERVER_INFERENCE
#include "src/common/common.h"
#endif
namespace mindspore {
namespace {
@ -684,7 +687,15 @@ lite::LiteSession *ModelImpl::CreateLiteSession(lite::InnerContext *context) {
delete context;
return nullptr;
}
#ifdef SERVER_INFERENCE
auto iter = config_info_.find(lite::kConfigServerInference);
if (iter != config_info_.end()) {
auto numa_iter = iter->second.find(lite::kConfigNUMANodeId);
if (numa_iter != iter->second.end()) {
context->SetNodeId(std::atoi(numa_iter->second.c_str()));
}
}
#endif
session->InitExecutionConfig(&execution_plan_);
session->SetConfigInfo(&config_info_);

View File

@ -22,6 +22,9 @@
#include "src/runtime/inner_allocator.h"
#include "src/common//file_utils.h"
#include "src/pack_weight_manager.h"
#include "src/runtime/numa_adapter.h"
#include "src/common/common.h"
namespace mindspore {
namespace {
constexpr int32_t kNumThreads = 4;
@ -36,6 +39,32 @@ int GetCoreNum() {
#endif
return core_num;
}
void SetNumaBindStrategy(std::vector<std::vector<int>> *all_model_bind_list, int thread_num, int node_id) {
if (UNLIKELY(thread_num == 0)) {
MS_LOG(ERROR) << "thread num is zero.";
return;
}
std::vector<int> cpu_list = numa::NUMAAdapter::GetInstance()->GetCPUList(node_id);
auto cpu_num = cpu_list.size();
if (cpu_num == 0) {
return;
}
std::vector<int> bind_id;
bind_id.reserve(thread_num);
all_model_bind_list->reserve(cpu_num / thread_num + 1);
bind_id.emplace_back(cpu_list[0]);
for (size_t i = 1; i < cpu_num; ++i) {
if (i % thread_num == 0) {
all_model_bind_list->emplace_back(bind_id);
bind_id.clear();
}
bind_id.emplace_back(cpu_list[i]);
}
if (!bind_id.empty()) {
all_model_bind_list->emplace_back(bind_id);
}
}
} // namespace
void ModelPool::SetBindStrategy(std::vector<std::vector<int>> *all_model_bind_list, int thread_num) {
@ -112,14 +141,26 @@ ModelPoolContex ModelPool::CreateModelContext(const std::shared_ptr<RunnerConfig
MS_LOG(ERROR) << "context is nullptr.";
return {};
}
if (model_context->GetThreadNum() == 0) {
MS_LOG(ERROR) << "thread num is zero.";
if (model_context->GetThreadNum() < 1) {
MS_LOG(ERROR) << "Invalid thread num " << model_context->GetThreadNum();
return {};
}
int node_id = -1;
if (numa::NUMAAdapter::GetInstance()->Available()) {
node_id = 0;
num_models_ =
numa::NUMAAdapter::GetInstance()->GetCPUList(node_id).size() / static_cast<int>(model_context->GetThreadNum());
} else {
num_models_ = GetCoreNum() / static_cast<int>(model_context->GetThreadNum());
}
ModelPoolContex model_pool_context;
std::vector<std::vector<int>> all_model_bind_list;
if (model_context->GetThreadAffinityMode() == lite::HIGHER_CPU) {
SetBindStrategy(&all_model_bind_list, static_cast<int>(model_context->GetThreadNum()));
if (numa::NUMAAdapter::GetInstance()->Available()) {
SetNumaBindStrategy(&all_model_bind_list, static_cast<int>(model_context->GetThreadNum()), node_id);
} else {
SetBindStrategy(&all_model_bind_list, static_cast<int>(model_context->GetThreadNum()));
}
} else if (model_context->GetThreadAffinityMode() == lite::MID_CPU) {
MS_LOG(ERROR) << "not support bind MID_CPU.";
return {};
@ -171,6 +212,7 @@ Status ModelPool::Init(const std::string &model_path, const std::shared_ptr<Runn
MS_LOG(ERROR) << "CreateModelContext failed, context is empty.";
return kLiteError;
}
size_t size = 0;
graph_buf_ = lite::ReadFile(model_path.c_str(), &size);
if (graph_buf_ == nullptr) {
@ -178,10 +220,14 @@ Status ModelPool::Init(const std::string &model_path, const std::shared_ptr<Runn
return kLiteError;
}
lite::PackWeightManager::GetInstance()->InitWeightManagerByBuf(graph_buf_);
int node_id = -1;
if (numa::NUMAAdapter::GetInstance()->Available()) {
node_id = 0;
}
std::shared_ptr<ModelThread> model_thread = nullptr;
for (size_t i = 0; i < num_models_; i++) {
model_thread = std::make_shared<ModelThread>();
auto status = model_thread->Init(graph_buf_, size, model_pool_context[i], dec_key, dec_mode);
auto status = model_thread->Init(graph_buf_, size, model_pool_context[i], dec_key, dec_mode, node_id);
if (status != kSuccess) {
MS_LOG(ERROR) << " model thread init failed.";
return kLiteError;
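
For intuition, SetNumaBindStrategy above slices one NUMA node's CPU list into groups of thread_num, one group per model worker. A hedged example (CPU ids illustrative):

// Illustrative: with GetCPUList(0) == {0,1,2,3,4,5,6,7} and thread_num == 4,
// all_model_bind_list becomes {{0,1,2,3}, {4,5,6,7}}: two workers, each
// pinned to four CPUs of NUMA node 0.
std::vector<std::vector<int>> all_model_bind_list;
SetNumaBindStrategy(&all_model_bind_list, /*thread_num=*/4, /*node_id=*/0);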

View File

@ -16,6 +16,7 @@
#include "src/cxx_api/model_pool/model_worker.h"
#include "src/common/log.h"
#include "src/common/utils.h"
#include "src/common/common.h"
namespace mindspore {
void ModelThread::Run() {
while (!PredictTaskQueue::GetInstance()->IsPredictTaskDone()) {
@ -59,9 +60,12 @@ void ModelThread::Run() {
}
Status ModelThread::Init(const char *model_buf, size_t size, const std::shared_ptr<Context> &model_context,
const Key &dec_key, const std::string &dec_mode) {
const Key &dec_key, const std::string &dec_mode, int node_id) {
model_ = std::make_shared<Model>();
mindspore::ModelType model_type = kMindIR;
if (node_id > -1) {
model_->UpdateConfig(lite::kConfigServerInference, {lite::kConfigNUMANodeId, std::to_string(node_id)});
}
auto status = model_->Build(model_buf, size, model_type, model_context, dec_key, dec_mode);
if (status != kSuccess) {
MS_LOG(ERROR) << "model build failed in ModelPool Init";

View File

@ -36,7 +36,7 @@ class ModelThread {
// the model pool is initialized once and can always accept model run requests
Status Init(const char *model_buf, size_t size, const std::shared_ptr<Context> &model_context,
const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm);
const Key &dec_key = {}, const std::string &dec_mode = kDecModeAesGcm, int node_id = -1);
std::vector<MSTensor> GetInputs();

View File

@ -15,6 +15,7 @@
*/
#include "src/inner_context.h"
#include <algorithm>
#include <memory>
#include "include/errorcode.h"
#include "src/common/log_adapter.h"
#include "src/common/log_util.h"
@ -138,7 +139,11 @@ int InnerContext::Init() {
}
if (this->allocator == nullptr) {
#ifdef SERVER_INFERENCE
this->allocator = std::make_shared<DynamicMemAllocator>(node_id_);
#else
this->allocator = mindspore::Allocator::Create();
#endif
CHECK_NULL_RETURN(this->allocator);
}
if (IsNpuEnabled()) {

View File

@ -20,7 +20,11 @@
#include <string>
#include <unordered_map>
#include "include/context.h"
#ifdef SERVER_INFERENCE
#include "src/runtime/dynamic_mem_allocator.h"
#else
#include "src/runtime/inner_allocator.h"
#endif
#include "thread/threadpool.h"
#include "nnacl/op_base.h"
#ifdef ENABLE_ARM
@ -82,6 +86,13 @@ struct InnerContext : public Context {
void ReplaceLinkInfoSenderWithNewOne(void *new_sender, void *old_sender);
#ifdef SERVER_INFERENCE
/// \brief Set NUMA node id.
///
/// \param[in] node Define the NUMA node id.
inline void SetNodeId(int node_id) { node_id_ = node_id; }
#endif
private:
bool IsAllDeviceTypeValid() const;
@ -99,6 +110,10 @@ struct InnerContext : public Context {
bool device_and_pkg_support_fp16_ = false;
#ifdef SERVER_INFERENCE
int node_id_ = -1;
#endif
ThreadPool *thread_pool_{nullptr};
// key is the precursor tensor's pointer, value is the group of successors' pointer.

View File

@ -40,6 +40,9 @@
#include "src/lite_model.h"
#include "src/weight_decoder.h"
#include "src/runtime/runtime_allocator.h"
#ifdef SERVER_INFERENCE
#include "src/runtime/dynamic_mem_allocator.h"
#endif
#include "src/lite_kernel_util.h"
#ifndef CUSTOM_KERNEL_REGISTRY_CLIP
#include "src/registry/register_kernel_impl.h"
@ -1078,6 +1081,7 @@ int LiteSession::Init(InnerContext *context) {
is_running_.store(false);
return ret;
}
is_running_.store(false);
return RET_OK;
}

View File

@ -13,18 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef MSLITE_ENABLE_SERVER_INFERENCE
#include "src/runtime/dynamic_mem_allocator.h"
#else
#include "src/runtime/inner_allocator.h"
#endif
namespace mindspore {
std::shared_ptr<Allocator> Allocator::Create() {
#ifdef MSLITE_ENABLE_SERVER_INFERENCE
return std::make_shared<DynamicMemAllocator>();
#else
return std::make_shared<DefaultAllocator>();
#endif
}
std::shared_ptr<Allocator> Allocator::Create() { return std::make_shared<DefaultAllocator>(); }
} // namespace mindspore

View File

@ -15,40 +15,43 @@
*/
#include "src/runtime/dynamic_mem_allocator.h"
#include <memory>
#include "src/runtime/dynamic_mem_manager.h"
#include "src/common/utils.h"
#include "src/common/log_adapter.h"
namespace mindspore {
void *DynamicMemAllocator::Malloc(size_t size) { return DynamicMemManager::GetInstance()->Malloc(size); }
void *DynamicMemAllocator::Malloc(size_t size) { return mem_oper_->Malloc(size); }
void DynamicMemAllocator::Free(void *ptr) { DynamicMemManager::GetInstance()->Free(ptr); }
void DynamicMemAllocator::Free(void *ptr) { mem_oper_->Free(ptr); }
int DynamicMemAllocator::RefCount(void *ptr) {
if (ptr == nullptr) {
return -1;
}
return DynamicMemManager::GetInstance()->RefCount(ptr);
return mem_oper_->RefCount(ptr);
}
int DynamicMemAllocator::SetRefCount(void *ptr, int ref_count) {
if (ptr == nullptr) {
return -1;
}
return DynamicMemManager::GetInstance()->SetRefCount(ptr, ref_count);
return mem_oper_->SetRefCount(ptr, ref_count);
}
int DynamicMemAllocator::IncRefCount(void *ptr, int ref_count) {
if (ptr == nullptr) {
return -1;
}
return DynamicMemManager::GetInstance()->IncRefCount(ptr, ref_count);
return mem_oper_->IncRefCount(ptr, ref_count);
}
int DynamicMemAllocator::DecRefCount(void *ptr, int ref_count) {
if (ptr == nullptr) {
return -1;
}
return DynamicMemManager::GetInstance()->DecRefCount(ptr, ref_count);
return mem_oper_->DecRefCount(ptr, ref_count);
}
DynamicMemAllocator::DynamicMemAllocator(int node_id) {
mem_oper_ = DynamicMemManager::GetInstance()->GetMemOperator(node_id);
}
} // namespace mindspore
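
A hedged usage sketch of the reworked allocator (node id illustrative; InnerContext::Init wires this up with its node_id_ earlier in this diff):

// Sketch: the allocator resolves its node's MemOperator once at construction
// instead of calling through a global singleton pool on every operation.
auto allocator = std::make_shared<DynamicMemAllocator>(/*node_id=*/0);
void *buf = allocator->Malloc(1024);  // served from node 0's memory pool
allocator->Free(buf);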

View File

@ -19,13 +19,15 @@
#include <mutex>
#include <map>
#include <memory>
#include <unordered_map>
#include "include/api/allocator.h"
#include "src/runtime/dynamic_mem_manager.h"
namespace mindspore {
class DynamicMemAllocator : public Allocator {
public:
DynamicMemAllocator() = default;
explicit DynamicMemAllocator(int node_id);
virtual ~DynamicMemAllocator() = default;
void *Malloc(size_t size) override;
void Free(void *ptr) override;
@ -33,6 +35,9 @@ class DynamicMemAllocator : public Allocator {
int SetRefCount(void *ptr, int ref_count) override;
int IncRefCount(void *ptr, int ref_count) override;
int DecRefCount(void *ptr, int ref_count) override;
private:
std::shared_ptr<MemOperator> mem_oper_;
};
} // namespace mindspore

View File

@ -17,6 +17,12 @@
#include "src/runtime/dynamic_mem_manager.h"
#include "src/common/log_adapter.h"
#include "src/common/utils.h"
#include "src/common/common.h"
#include "src/runtime/numa_adapter.h"
using mindspore::numa::NUMAAdapter;
using mindspore::numa::MemoryInfo;
namespace mindspore {
namespace {
@ -24,51 +30,69 @@ namespace {
static constexpr size_t kMemAlginSize = 64;
// The minimum unit size (512M) of memory block used for dynamic extend.
static constexpr size_t kAllocUnitSize = 536870912;
static constexpr auto kAllocUnitSize = 536870912;
static constexpr size_t kBlockSize = 1024;
static constexpr auto kBlockSize = 2048;
// invalid block index
static constexpr int64_t kInvalidIndex = -1;
static constexpr int kInvalidIndex = -1;
// invalid numa node id
static constexpr int kInvalidNodeId = -1;
static constexpr int kInvalidRefCount = -1;
static constexpr float kDefaultMemoryLeastRatio = 0.1;
size_t Rounded(size_t size) { return (size + kMemAlginSize - 1) & (~(kMemAlginSize - 1)); }
} // namespace
void *Allocate(size_t allocate_size) {
if (allocate_size > lite::GetMaxMallocSize()) {
MS_LOG(ERROR) << "MallocData out of max_size, size: " << allocate_size;
void *MemOperator::Allocate(size_t rounded_size, int node_id, size_t *allocate_size) {
int64_t allocate_tmp_size = static_cast<int64_t>(rounded_size < kAllocUnitSize ? kAllocUnitSize : rounded_size);
int64_t free_count = 0;
int64_t left = 0;
if (node_id >= 0) {
// allocate memory from numa node
MemoryInfo mem_info = NUMAAdapter::GetInstance()->GetNodeSize(node_id);
free_count = mem_info.free;
} else {
free_count = lite::GetFreeMemory();
}
if (UNLIKELY(static_cast<int64_t>(rounded_size) >= free_count)) {
MS_LOG(ERROR) << "No enough memory left!node_id: " << node_id << ", request: " << rounded_size
<< ", free: " << free_count << ", least free request: " << least_free_memory_;
return nullptr;
}
if (free_count < allocate_tmp_size) {
allocate_tmp_size = rounded_size;
}
left = free_count - allocate_tmp_size;
if (left <= least_free_memory_) {
MS_LOG(ERROR) << "No enough memory left!node_id: " << node_id << ", request: " << rounded_size
<< ", free: " << free_count << ", least free request: " << least_free_memory_;
return nullptr;
}
*allocate_size = allocate_tmp_size;
void *data = nullptr;
#ifdef _WIN32
data = _aligned_malloc(allocate_size, kMemAlginSize);
data = _aligned_malloc(allocate_tmp_size, kMemAlginSize);
#else
auto ret = posix_memalign(&data, kMemAlginSize, allocate_size);
if (UNLIKELY(ret != 0)) {
MS_LOG(ERROR) << "posix_memalign failed!ret: " << ret;
return nullptr;
if (node_id >= 0) {
data = NUMAAdapter::GetInstance()->Malloc(node_id, static_cast<size_t>(allocate_tmp_size));
} else {
auto ret = posix_memalign(&data, kMemAlginSize, static_cast<size_t>(allocate_tmp_size));
if (UNLIKELY(ret != 0)) {
MS_LOG(ERROR) << "posix_memalign failed!ret: " << ret;
return nullptr;
}
}
#endif
if (UNLIKELY(data == nullptr)) {
MS_LOG(ERROR) << "malloc data failed!";
return nullptr;
}
return data;
}
} // namespace
DynamicMemManager::DynamicMemManager() {
blocks_.resize(kBlockSize);
garbage_block_ = kInvalidIndex;
auto *block = GetBlock();
block->data_ = Allocate(kAllocUnitSize);
if (UNLIKELY(block->data_ == nullptr)) {
return;
}
all_datas_.emplace_back(block->data_);
block->size_ = kAllocUnitSize;
free_blocks_.emplace(kAllocUnitSize, block->index_);
}
Block *DynamicMemManager::GetBlock() {
Block *MemOperator::GetBlock() {
Block *block;
if (garbage_block_ != kInvalidIndex) {
block = &blocks_[garbage_block_];
@ -87,24 +111,25 @@ Block *DynamicMemManager::GetBlock() {
return block;
}
void DynamicMemManager::AddGarbageBlock(const int64_t index) {
void MemOperator::AddGarbageBlock(const int64_t index) {
blocks_[index].next_index_ = garbage_block_;
garbage_block_ = index;
}
// malloc memory for data storage
void *DynamicMemManager::Malloc(size_t size) {
void *MemOperator::Malloc(size_t size) {
auto rounded_size = Rounded(size);
std::lock_guard<std::mutex> locker(mutex_);
auto iter = free_blocks_.lower_bound(rounded_size);
if (iter != free_blocks_.end()) {
auto index = iter->second;
free_blocks_.erase(iter);
auto *block = &blocks_[index];
block->used_ = true;
datas_.emplace(block->data_, index);
if (block->size_ > rounded_size) {
blocks_[index].used_ = true;
auto data = blocks_[index].data_;
datas_.emplace(data, index);
if (blocks_[index].size_ > rounded_size) {
Block *block_next = GetBlock();
auto *block = &blocks_[index];
block_next->size_ = block->size_ - rounded_size;
block->size_ = rounded_size;
block_next->data_ = static_cast<int8_t *>(block->data_) + rounded_size;
@ -117,15 +142,15 @@ void *DynamicMemManager::Malloc(size_t size) {
block->next_index_ = block_next->index_;
free_blocks_.emplace(block_next->size_, block_next->index_);
}
return block->data_;
return data;
}
// todo kAllocUnitSize can be replaced by config
auto allocate_size = rounded_size < kAllocUnitSize ? kAllocUnitSize : rounded_size;
void *data = Allocate(allocate_size);
size_t allocate_size;
void *data = Allocate(rounded_size, node_id_, &allocate_size);
if (UNLIKELY(data == nullptr)) {
return nullptr;
}
all_datas_.emplace_back(data);
all_datas_.emplace(data, allocate_size);
Block *block = GetBlock();
block->size_ = rounded_size;
block->data_ = data;
@ -143,7 +168,7 @@ void *DynamicMemManager::Malloc(size_t size) {
}
// return memory to the memory pool
void DynamicMemManager::Free(void *ptr) {
void MemOperator::Free(void *ptr) {
if (UNLIKELY(ptr == nullptr)) {
return;
}
@ -194,7 +219,7 @@ void DynamicMemManager::Free(void *ptr) {
}
}
void DynamicMemManager::EraseFreeBlock(const int64_t index) {
void MemOperator::EraseFreeBlock(const int64_t index) {
auto range = free_blocks_.equal_range(blocks_[index].size_);
for (auto item = range.first; item != range.second; ++item) {
if (item->second == index) {
@ -204,23 +229,52 @@ void DynamicMemManager::EraseFreeBlock(const int64_t index) {
}
}
DynamicMemManager::~DynamicMemManager() {
MS_LOG(DEBUG) << "~DynamicMemManager() begin.";
MemOperator::MemOperator(int node_id) {
if (node_id >= 0 && NUMAAdapter::GetInstance()->Available()) {
node_id_ = node_id;
auto mem_info = NUMAAdapter::GetInstance()->GetNodeSize(node_id_);
if (mem_info.total <= 0) {
return;
}
least_free_memory_ = mem_info.total * kDefaultMemoryLeastRatio;
} else {
auto total = lite::GetMaxMallocSize();
least_free_memory_ = total * kDefaultMemoryLeastRatio;
}
blocks_.resize(kBlockSize);
garbage_block_ = kInvalidIndex;
auto *block = GetBlock();
size_t allocate_size;
block->data_ = Allocate(kAllocUnitSize, node_id, &allocate_size);
if (UNLIKELY(block->data_ == nullptr)) {
return;
}
all_datas_.emplace(block->data_, allocate_size);
block->size_ = allocate_size;
free_blocks_.emplace(allocate_size, block->index_);
}
MemOperator::~MemOperator() {
MS_LOG(DEBUG) << "~MemOperator() begin.";
for (auto &&data : all_datas_) {
#ifdef _WIN32
_aligned_free(data);
_aligned_free(data.first);
#else
free(data);
if (node_id_ >= 0) {
NUMAAdapter::GetInstance()->Free(data.first, data.second);
} else {
free(data.first);
}
#endif
data = nullptr;
}
free_blocks_.clear();
all_datas_.clear();
blocks_.clear();
MS_LOG(DEBUG) << "~DynamicMemManager() end.";
MS_LOG(DEBUG) << "~MemOperator() end.";
}
int DynamicMemManager::SetRefCount(void *ptr, int ref_count) {
int MemOperator::SetRefCount(void *ptr, int ref_count) {
std::lock_guard<std::mutex> locker(mutex_);
auto iter = datas_.find(ptr);
if (iter != datas_.end()) {
@ -228,10 +282,10 @@ int DynamicMemManager::SetRefCount(void *ptr, int ref_count) {
blocks_[index].ref_count_ = ref_count;
return ref_count;
}
return -1;
return kInvalidRefCount;
}
int DynamicMemManager::IncRefCount(void *ptr, int ref_count) {
int MemOperator::IncRefCount(void *ptr, int ref_count) {
std::lock_guard<std::mutex> locker(mutex_);
auto iter = datas_.find(ptr);
if (iter != datas_.end()) {
@ -239,10 +293,10 @@ int DynamicMemManager::IncRefCount(void *ptr, int ref_count) {
blocks_[index].ref_count_ += ref_count;
return blocks_[index].ref_count_;
}
return -1;
return kInvalidRefCount;
}
int DynamicMemManager::DecRefCount(void *ptr, int ref_count) {
int MemOperator::DecRefCount(void *ptr, int ref_count) {
std::lock_guard<std::mutex> locker(mutex_);
auto iter = datas_.find(ptr);
if (iter != datas_.end()) {
@ -250,15 +304,39 @@ int DynamicMemManager::DecRefCount(void *ptr, int ref_count) {
blocks_[index].ref_count_ -= ref_count;
return blocks_[index].ref_count_;
}
return -1;
return kInvalidRefCount;
}
int DynamicMemManager::RefCount(void *ptr) {
int MemOperator::RefCount(void *ptr) {
std::lock_guard<std::mutex> locker(mutex_);
auto iter = datas_.find(ptr);
if (iter != datas_.end()) {
return blocks_[iter->second].ref_count_;
}
return -1;
return kInvalidRefCount;
}
std::shared_ptr<MemOperator> DynamicMemManager::GetMemOperator(const int node_id) {
std::map<int, std::shared_ptr<MemOperator>>::iterator iter;
int numa_node_id = node_id;
if (numa_node_id < 0) {
numa_node_id = kInvalidNodeId;
}
std::lock_guard<std::mutex> locker(mutex_);
std::shared_ptr<MemOperator> mem_oper = nullptr;
iter = nodes_mem_.find(numa_node_id);
if (iter == nodes_mem_.end()) {
mem_oper = std::make_shared<MemOperator>(numa_node_id);
if (UNLIKELY(mem_oper == nullptr)) {
MS_LOG(ERROR) << "make_shared MemOperator failed!";
return nullptr;
}
MS_LOG(DEBUG) << "new mem_oper, node_id " << numa_node_id;
nodes_mem_.insert({numa_node_id, mem_oper});
} else {
mem_oper = iter->second;
}
return mem_oper;
}
} // namespace mindspore
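
Putting the manager changes together: DynamicMemManager shrinks to a registry that caches one MemOperator per NUMA node, and each operator keeps a best-fit free list (free_blocks_) keyed by rounded block size. A hedged sketch (sizes illustrative):

// Sketch: node ids < 0 collapse to kInvalidNodeId and share one plain
// (non-NUMA) operator; every valid node id gets its own cached pool.
auto oper = DynamicMemManager::GetInstance()->GetMemOperator(/*node_id=*/0);
void *p = oper->Malloc(100);  // Rounded(100) == 128 with kMemAlginSize == 64
oper->SetRefCount(p, 1);
oper->Free(p);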

View File

@ -24,8 +24,6 @@
#include <unordered_map>
#include <deque>
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
namespace mindspore {
struct Block {
// used_ may be true when ref_count_ == 0
@ -38,14 +36,10 @@ struct Block {
int64_t next_index_ = -1;
};
class DynamicMemManager {
class MemOperator {
public:
static DynamicMemManager *GetInstance() {
static DynamicMemManager instance;
return &instance;
}
virtual ~DynamicMemManager();
explicit MemOperator(int node_id);
virtual ~MemOperator();
void *Malloc(size_t size);
void Free(void *ptr);
@ -53,14 +47,18 @@ class DynamicMemManager {
int IncRefCount(void *ptr, int ref_count);
int DecRefCount(void *ptr, int ref_count);
int RefCount(void *ptr);
inline void set_node_id(int node_id) { node_id_ = node_id; }
inline int node_id(void) const { return node_id_; }
private:
DynamicMemManager();
Block *GetBlock();
void EraseFreeBlock(const int64_t index);
void AddGarbageBlock(const int64_t index);
void *Allocate(size_t rounded_size, int node_id, size_t *allocate_size);
private:
int node_id_ = -1;
int64_t least_free_memory_ = 0;
// all data blocks
size_t block_count_ = 0;
int64_t garbage_block_;
@ -70,7 +68,21 @@ class DynamicMemManager {
std::multimap<size_t, int64_t> free_blocks_;
// key: data addr, value: Block index
std::unordered_map<void *, int64_t> datas_;
std::vector<void *> all_datas_;
std::unordered_map<void *, size_t> all_datas_;
};
class DynamicMemManager {
public:
static DynamicMemManager *GetInstance() {
static DynamicMemManager instance;
return &instance;
}
std::shared_ptr<MemOperator> GetMemOperator(const int node_id);
private:
std::map<int, std::shared_ptr<MemOperator>> nodes_mem_;
std::mutex mutex_;
};
} // namespace mindspore

View File

@ -0,0 +1,224 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/numa_adapter.h"
#include <dlfcn.h>
#include "src/common/log_adapter.h"
#include "src/common/common.h"
namespace mindspore {
namespace numa {
namespace {
static constexpr int kSuccess = 0;
static constexpr int kBitsPerByte = 8;
} // namespace
NUMAAdapter::NUMAAdapter() {
available_ = false;
handle_ = dlopen("libnuma.so.1.0.0", RTLD_LAZY | RTLD_LOCAL);
if (handle_ == nullptr) {
MS_LOG(WARNING) << "dlopen libnuma failed, NUMA is not supported.";
return;
}
numa_interfaces_.numa_available = reinterpret_cast<int (*)(void)>(dlsym(handle_, "numa_available"));
if (UNLIKELY(numa_interfaces_.numa_available == nullptr)) {
MS_LOG(ERROR) << "numa_available not found!";
(void)dlclose(handle_);
handle_ = nullptr;
return;
}
if (numa_interfaces_.numa_available() < 0) {
MS_LOG(ERROR) << "numa is not available!";
(void)dlclose(handle_);
handle_ = nullptr;
return;
}
available_ = true;
numa_interfaces_.numa_num_configured_nodes =
reinterpret_cast<int (*)(void)>(dlsym(handle_, "numa_num_configured_nodes"));
if (UNLIKELY(numa_interfaces_.numa_num_configured_nodes == nullptr)) {
MS_LOG(ERROR) << "numa_num_configured_nodes not found!";
available_ = false;
}
numa_interfaces_.numa_num_task_cpus = reinterpret_cast<int (*)(void)>(dlsym(handle_, "numa_num_task_cpus"));
if (UNLIKELY(numa_interfaces_.numa_num_task_cpus == nullptr)) {
MS_LOG(ERROR) << "numa_num_task_cpus not found!";
available_ = false;
}
numa_interfaces_.numa_node_to_cpus =
reinterpret_cast<int (*)(int node, struct bitmask *mask)>(dlsym(handle_, "numa_node_to_cpus"));
if (UNLIKELY(numa_interfaces_.numa_node_to_cpus == nullptr)) {
MS_LOG(ERROR) << "numa_node_to_cpus not found!";
available_ = false;
}
numa_interfaces_.numa_allocate_nodemask =
reinterpret_cast<struct bitmask *(*)(void)>(dlsym(handle_, "numa_allocate_nodemask"));
if (UNLIKELY(numa_interfaces_.numa_allocate_nodemask == nullptr)) {
MS_LOG(ERROR) << "numa_allocate_nodemask not found!";
available_ = false;
}
numa_interfaces_.numa_bitmask_clearall =
reinterpret_cast<struct bitmask *(*)(struct bitmask *)>(dlsym(handle_, "numa_bitmask_clearall"));
if (UNLIKELY(numa_interfaces_.numa_bitmask_clearall == nullptr)) {
MS_LOG(ERROR) << "numa_bitmask_clearall not found!";
available_ = false;
}
numa_interfaces_.numa_bitmask_setbit =
reinterpret_cast<struct bitmask *(*)(struct bitmask *, unsigned int)>(dlsym(handle_, "numa_bitmask_setbit"));
if (UNLIKELY(numa_interfaces_.numa_bitmask_setbit == nullptr)) {
MS_LOG(ERROR) << "numa_bitmask_setbit not found!";
available_ = false;
}
numa_interfaces_.numa_bind = reinterpret_cast<void (*)(struct bitmask *)>(dlsym(handle_, "numa_bind"));
if (UNLIKELY(numa_interfaces_.numa_bind == nullptr)) {
MS_LOG(ERROR) << "numa_bind not found!";
available_ = false;
}
numa_interfaces_.numa_bitmask_free =
reinterpret_cast<void (*)(struct bitmask *)>(dlsym(handle_, "numa_bitmask_free"));
if (UNLIKELY(numa_interfaces_.numa_bitmask_free == nullptr)) {
MS_LOG(ERROR) << "numa_bitmask_free not found!";
available_ = false;
}
numa_interfaces_.numa_alloc_onnode =
reinterpret_cast<void *(*)(size_t size, int node)>(dlsym(handle_, "numa_alloc_onnode"));
if (UNLIKELY(numa_interfaces_.numa_alloc_onnode == nullptr)) {
MS_LOG(ERROR) << "numa_bitmask_free not found!";
available_ = false;
}
numa_interfaces_.numa_node_size64 =
reinterpret_cast<int64_t (*)(int node, int64_t *freep)>(dlsym(handle_, "numa_node_size64"));
if (UNLIKELY(numa_interfaces_.numa_node_size64 == nullptr)) {
MS_LOG(ERROR) << "numa_node_size64 not found!";
available_ = false;
}
numa_interfaces_.numa_free = reinterpret_cast<void (*)(void *start, size_t size)>(dlsym(handle_, "numa_free"));
if (UNLIKELY(numa_interfaces_.numa_free == nullptr)) {
MS_LOG(ERROR) << "numa_free not found!";
available_ = false;
}
if (!available_) {
(void)dlclose(handle_);
handle_ = nullptr;
return;
}
}
void NUMAAdapter::Bind(int node_id) {
if (!Available() || node_id < 0) {
return;
}
auto bitmask = numa_interfaces_.numa_allocate_nodemask();
if (UNLIKELY(bitmask == nullptr)) {
MS_LOG(ERROR) << "bind numa_node " << node_id << " failed!";
return;
}
(void)numa_interfaces_.numa_bitmask_setbit(bitmask, node_id);
numa_interfaces_.numa_bind(bitmask);
numa_interfaces_.numa_bitmask_free(bitmask);
}
void *NUMAAdapter::Malloc(int node_id, size_t size) {
if (!Available() || node_id < 0) {
return nullptr;
}
return numa_interfaces_.numa_alloc_onnode(size, node_id);
}
void NUMAAdapter::Free(void *data, size_t size) {
if (!Available() || data == nullptr) {
return;
}
numa_interfaces_.numa_free(data, size);
}
int NUMAAdapter::NodesNum() {
if (!Available()) {
return 0;
}
return numa_interfaces_.numa_num_configured_nodes();
}
int NUMAAdapter::CPUNum() {
if (!Available()) {
return 0;
}
return numa_interfaces_.numa_num_task_cpus();
}
std::vector<int> NUMAAdapter::GetCPUList(int node_id) {
std::vector<int> cpu_list;
if (!Available() || node_id < 0) {
return cpu_list;
}
struct bitmask *nodemask = numa_interfaces_.numa_allocate_nodemask();
if (nodemask == nullptr) {
MS_LOG(ERROR) << "allocate nodemask failed!";
return cpu_list;
}
auto ret = numa_interfaces_.numa_node_to_cpus(node_id, nodemask);
if (ret != kSuccess || nodemask->maskp == nullptr) {
MS_LOG(ERROR) << "numa_node_to_cpus failed! ret = " << ret;
numa_interfaces_.numa_bitmask_free(nodemask);
return cpu_list;
}
int cpu_num = numa_interfaces_.numa_num_task_cpus();
if (UNLIKELY(cpu_num < 0)) {
MS_LOG(ERROR) << "numa_num_task_cpus return " << cpu_num;
numa_interfaces_.numa_bitmask_free(nodemask);
return cpu_list;
}
int index = 0;
int maskp_index = 0;
auto maskp = nodemask->maskp;
do {
if (UNLIKELY(maskp == nullptr)) {
MS_LOG(ERROR) << "maskp is nullptr!";
break;
}
auto mask = *(maskp);
static constexpr auto kBitsPerMask = static_cast<int>(sizeof(decltype(mask)) * kBitsPerByte);
int step = static_cast<int>(maskp_index * kBitsPerMask);
for (int i = 0; i < kBitsPerMask; ++i) {
if (mask & 1) {
cpu_list.emplace_back(i + step);
}
mask >>= 1;
}
index += kBitsPerMask;
if (index >= cpu_num) {
break;
}
++maskp;  // advance to the next mask word
++maskp_index;
} while (true);
numa_interfaces_.numa_bitmask_free(nodemask);
return cpu_list;
}
MemoryInfo NUMAAdapter::GetNodeSize(int node_id) {
MemoryInfo mem_info;
if (!Available() || node_id < 0) {
return mem_info;
}
mem_info.total = numa_interfaces_.numa_node_size64(node_id, &mem_info.free);
return mem_info;
}
NUMAAdapter::~NUMAAdapter() {
if (handle_ == nullptr) {
return;
}
(void)dlclose(handle_);
}
} // namespace numa
} // namespace mindspore
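
The loop in GetCPUList above decodes the node mask one machine word at a time. A hypothetical standalone helper (not part of this diff) showing the same bit-to-CPU mapping:

// Bit i of mask word w maps to CPU w * bits_per_word + i; for example, a
// first word of 0b1011 decodes to CPUs {0, 1, 3}.
std::vector<int> DecodeMaskWord(unsigned long word, int base_cpu) {
  std::vector<int> cpus;
  for (int i = 0; word != 0; ++i, word >>= 1) {
    if (word & 1UL) {
      cpus.push_back(base_cpu + i);
    }
  }
  return cpus;
}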

View File

@ -0,0 +1,71 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_NUMA_ADAPTER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_NUMA_ADAPTER_H_
#include <numa.h>
#include <cstddef>
#include <vector>
namespace mindspore {
namespace numa {
struct NUMAInterface {
int (*numa_available)(void);
int (*numa_num_configured_nodes)(void);
int (*numa_num_task_cpus)();
int (*numa_node_to_cpus)(int node, struct bitmask *mask);
struct bitmask *(*numa_allocate_nodemask)(void);
struct bitmask *(*numa_bitmask_clearall)(struct bitmask *);
struct bitmask *(*numa_bitmask_setbit)(struct bitmask *, unsigned int);
void (*numa_bind)(struct bitmask *);
void (*numa_bitmask_free)(struct bitmask *);
void *(*numa_alloc_onnode)(size_t size, int node);
int64_t (*numa_node_size64)(int node, int64_t *freep);
void (*numa_free)(void *start, size_t size);
};
struct MemoryInfo {
int64_t total = 0;
int64_t free = 0;
};
class NUMAAdapter {
public:
static NUMAAdapter *GetInstance() {
static NUMAAdapter instance;
return &instance;
}
virtual ~NUMAAdapter();
inline bool Available() const { return available_; }
void Bind(int node_id);
void *Malloc(int node_id, size_t size);
void Free(void *data, size_t size);
int NodesNum();
int CPUNum();
std::vector<int> GetCPUList(int node_id);
MemoryInfo GetNodeSize(int node_id);
private:
NUMAAdapter();
void *handle_ = nullptr;  // libnuma dlopen handle
bool available_ = false;
NUMAInterface numa_interfaces_;
};
} // namespace numa
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_NUMA_ADAPTER_H_
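
End to end, the adapter dlopens libnuma at runtime so builds and machines without it still work (Available() then reports false). A hedged usage sketch of the public surface (node id and size illustrative):

// Sketch: typical call sequence against the singleton adapter.
auto *numa = mindspore::numa::NUMAAdapter::GetInstance();
if (numa->Available()) {
  numa->Bind(0);                       // bind the current task to node 0
  void *p = numa->Malloc(0, 1 << 20);  // 1 MiB allocated on node 0
  if (p != nullptr) {
    numa->Free(p, 1 << 20);
  }
}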

View File

@ -131,6 +131,7 @@ set(LITE_SRC
${LITE_SRC}
${SRC_DIR}/runtime/dynamic_mem_allocator.cc
${SRC_DIR}/runtime/dynamic_mem_manager.cc
${SRC_DIR}/runtime/numa_adapter.cc
)
endif()