!15446 [MD] Cache: add dynamic cache memory sizing support

From: @tina_mengting_zhang Reviewed-by: Signed-off-by:
2021-04-23 21:19:11 +08:00 · 2021-04-23 21:19:11 +08:00 · 87742310cc
parent ad87aca10d a46ef22314
commit 87742310cc
5 changed files with 61 additions and 2 deletions
--- a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_hw.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_hw.cc
@ -233,5 +233,32 @@ bool CacheServerHW::numa_enabled() {
  return false;
 #endif
 }
+
+uint64_t CacheServerHW::GetAvailableMemory() {
+  std::ifstream mem_file(kMemInfoFileName);
+  if (mem_file.fail()) {
+    MS_LOG(WARNING) << "Fail to open file: " << kMemInfoFileName;
+    return 0;
+  }
+
+  std::string line;
+  uint64_t mem_available_in_kb = 0L;
+  while (std::getline(mem_file, line)) {
+    // get title
+    std::string::size_type position = line.find(":");
+    std::string title = line.substr(0, position);
+
+    // get the value of MemAvailable
+    if (title == "MemAvailable") {
+      std::string::size_type pos1 = line.find_last_of(" ");
+      std::string::size_type pos2 = line.find_last_of(" ", pos1 - 1);
+      mem_available_in_kb = std::stol(line.substr(pos2, pos1 - pos2));
+      break;
+    }
+  }
+  mem_file.close();
+
+  return mem_available_in_kb * 1024;
+}
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_hw.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_hw.h
@ -90,8 +90,12 @@ class CacheServerHW {
  /// \return the size (in bytes) of the physical RAM on the machine.
  static int64_t GetTotalSystemMemory();

+  /// \brief Get the size (in bytes) of available memory on the machine by reading from file /proc/meminfo.
+  static uint64_t GetAvailableMemory();
+
 private:
  constexpr static char kSysNodePath[] = "/sys/devices/system/node";
+  constexpr static char kMemInfoFileName[] = "/proc/meminfo";
  int32_t num_cpus_;
  std::map<numa_id_t, cpu_set_t> numa_cpuset_;
  std::map<numa_id_t, int32_t> numa_cpu_cnt_;
--- a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_numa.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_numa.h
@ -63,6 +63,9 @@ class NumaMemoryPool : public MemoryPool {
  /// \brief Return maximum available memory
  int64_t GetAvailableMemory() const { return memory_cap_; }

+  /// \brief Return the configured or computed memory cap ratio
+  float GetMemoryCapRatio() const { return memory_cap_ratio_; }
+
 private:
  std::shared_ptr<CacheServerHW> hw_;
  float memory_cap_ratio_;
--- a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_pool.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_pool.cc
@ -22,7 +22,12 @@
 namespace mindspore {
 namespace dataset {
 CachePool::CachePool(std::shared_ptr<NumaMemoryPool> mp, const std::string &root)
-    : mp_(std::move(mp)), root_(root), subfolder_(Services::GetUniqueID()), sm_(nullptr), tree_(nullptr) {}
+    : mp_(std::move(mp)), root_(root), subfolder_(Services::GetUniqueID()), sm_(nullptr), tree_(nullptr) {
+  // Initialize soft memory cap to the current available memory on the machine.
+  soft_mem_limit_ = CacheServerHW::GetAvailableMemory();
+  temp_mem_usage_ = 0;
+  min_avail_mem_ = CacheServerHW::GetTotalSystemMemory() * (1.0 - mp_->GetMemoryCapRatio());
+}

 Status CachePool::DoServiceStart() {
  tree_ = std::make_shared<data_index>();
@ -83,8 +88,23 @@ Status CachePool::Insert(CachePool::key_type key, const std::vector<ReadableSlic
    sz += v.GetSize();
  }
  bl.sz = sz;
-  rc = mp_->Allocate(sz, reinterpret_cast<void **>(&bl.ptr));
+  // If required memory size exceeds the available size, it gives OOM status. To avoid cache server process got killed
+  // or crashing the machine, set lower bound memory, which means stopping cache once the rest available memory is less
+  // than the lower bound. (The default is 20% of physical RAM)
+  if (soft_mem_limit_ - temp_mem_usage_ - static_cast<uint64_t>(sz) < min_avail_mem_) {
+    MS_LOG(WARNING) << "Memory usage will exceed the upper bound limit of: " << min_avail_mem_
+                    << ". The cache server will not cache any more data.";
+    rc = Status(StatusCode::kMDOutOfMemory, __LINE__, __FILE__);
+  } else {
+    rc = mp_->Allocate(sz, reinterpret_cast<void **>(&bl.ptr));
+    // Adjust the soft limit and usage counting when every 100M memory are used.
+    if (temp_mem_usage_ + sz >= kMemoryCapAdjustInterval) {
+      soft_mem_limit_ = CacheServerHW::GetAvailableMemory();
+      temp_mem_usage_ = 0;
+    }
+  }
  if (rc.IsOk()) {
+    temp_mem_usage_ += sz;
    // Write down which numa node where we allocate from. It only make sense if the policy is kOnNode.
    if (CacheServerHW::numa_enabled()) {
      auto &cs = CacheServer::GetInstance();
--- a/mindspore/ccsrc/minddata/dataset/engine/cache/cache_pool.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/cache/cache_pool.h
@ -149,6 +149,11 @@ class CachePool : public Service {
  const std::string subfolder_;
  std::shared_ptr<StorageManager> sm_;
  std::shared_ptr<data_index> tree_;
+  std::atomic<uint64_t> soft_mem_limit_;  // the available memory in the machine
+  std::atomic<uint64_t> temp_mem_usage_;  // temporary count on the amount of memory usage by cache every 100Mb (because
+                                          // we will adjust soft_mem_limit_ every 100Mb based on this parameter)
+  uint64_t min_avail_mem_;                // lower bound of the available memory
+  const int kMemoryCapAdjustInterval = 104857600;
 };
 }  // namespace dataset
 }  // namespace mindspore