!21205 [MS][LITE][TOD] memory optimization

Merge pull request !21205 from yonibaehr/export_yoni
2021-08-04 07:19:25 +00:00 · 2021-08-04 07:19:25 +00:00 · 7383457756
parent c77fbcddc3 208c620cea
commit 7383457756
9 changed files with 272 additions and 6 deletions
--- a/mindspore/lite/src/CMakeLists.txt
+++ b/mindspore/lite/src/CMakeLists.txt
@ -133,6 +133,7 @@ set(TRAIN_SRC
        ${CMAKE_CURRENT_SOURCE_DIR}/train/accuracy_monitor.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/train/classification_train_accuracy_monitor.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/train/train_export.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/train/opt_allocator.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/../tools/common/storage.cc
        )
 if(ENABLE_V0)
--- a/mindspore/lite/src/tensor.cc
+++ b/mindspore/lite/src/tensor.cc
@ -316,8 +316,10 @@ void Tensor::FreeData() {
    this->data_ = nullptr;
  } else {
    allocator_->Free(this->data_);
+    if (!IS_STATIC_ALLOCATOR(allocator_) || (allocator_->RefCount(this->data_) != 0)) {
      this->data_ = nullptr;
    }
+  }
 }

 void *Tensor::ReallocData() {
--- a/mindspore/lite/src/tensor.h
+++ b/mindspore/lite/src/tensor.h
@ -34,12 +34,15 @@

 namespace mindspore {
 namespace lite {
+
+// Sentinel value an allocator returns from RefCount(nullptr) to identify itself as a static allocator.
+// Parenthesized so the negative literal cannot bind to a neighbouring operator at the expansion site.
+#define STATIC_ALLOCATION (-271964)
+// True when `allocator` is non-null and answers RefCount(nullptr) with the STATIC_ALLOCATION sentinel.
+// Note: `allocator` is evaluated twice, so pass an expression without side effects.
+#define IS_STATIC_ALLOCATOR(allocator) \
+  (((allocator) != nullptr) && ((allocator)->RefCount(nullptr) == STATIC_ALLOCATION))
 struct LiteQuantParam {
  double scale;
  int32_t zeroPoint;
  float var_corr{1};
  float mean_corr{0};
-  bool inited;
+  bool inited{false};
  std::vector<float> clusters{};
  int bitNum;
  int roundType;
@ -133,7 +136,6 @@ class Tensor : public mindspore::tensor::MSTensor {
  void set_format(mindspore::Format format) override { this->format_ = format; }

  mindspore::Format format() const override { return this->format_; }
-
  virtual int ref_count() const { return ref_count_; }

  virtual int init_ref_count() const { return this->init_ref_count_; }
--- a/mindspore/lite/src/train/opt_allocator.cc
+++ b/mindspore/lite/src/train/opt_allocator.cc
@ -0,0 +1,90 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/train/opt_allocator.h"
+#include <limits>
+#include "nnacl/op_base.h"
+
+namespace mindspore {
+
+// Best-fit lookup in the free list (arena_ maps block address -> block size).
+// Returns the address of the smallest free block that can hold `size` bytes; among equally sized
+// candidates the lowest address wins. Returns std::numeric_limits<size_t>::max() when nothing fits.
+size_t OptAllocator::FindFree(size_t size) {
+  constexpr size_t kNone = std::numeric_limits<size_t>::max();
+  size_t best_addr = kNone;
+  size_t best_size = kNone;
+  for (auto block = arena_.cbegin(); block != arena_.cend(); ++block) {
+    const size_t block_size = block->second;
+    if (block_size < size) {
+      continue;  // too small for the request
+    }
+    if (block_size < best_size) {
+      best_size = block_size;
+      best_addr = block->first;
+    }
+  }
+  return best_addr;
+}
+
+// Coalesces the free block starting at `addr` with the free blocks that touch it on either side,
+// so adjacent free ranges are represented by a single arena_ entry.
+void OptAllocator::Reorder(size_t addr) {
+  const size_t length = arena_[addr];
+  // Absorb the free block that begins exactly where this one ends.
+  auto upper = arena_.find(addr + length);
+  if (upper != arena_.end()) {
+    const size_t merged = length + upper->second;
+    arena_[addr] = merged;
+    arena_.erase(upper);
+  }
+  // Fold this block into the preceding free block when that one ends exactly at `addr`.
+  auto pos = arena_.lower_bound(addr);
+  if (pos == arena_.begin()) {
+    return;  // no free block at a lower address
+  }
+  --pos;
+  if (pos->first + pos->second != addr) {
+    return;  // the lower block is not adjacent
+  }
+  pos->second += arena_[addr];
+  arena_.erase(addr);
+}
+
+// Reserves a block of at least `size` bytes in the planned address space and returns its offset.
+// The size is rounded up to a multiple of the alignment. A free block is reused when one fits
+// (best fit, splitting off the remainder); otherwise the heap high-water mark is raised.
+size_t OptAllocator::Malloc(size_t size) {
+  // A zero alignment would divide by zero inside UP_DIV.
+  const size_t align = (align_size_ == 0) ? 1 : align_size_;
+  // A zero-byte request must still own a distinct block. Otherwise it would share its offset with
+  // the next allocation (or silently drop a free block), and freeing it would release memory that
+  // a live neighbour is still using.
+  if (size == 0) {
+    size = 1;
+  }
+  size = UP_DIV(size, align) * align;
+  size_t addr = FindFree(size);
+  if (addr == std::numeric_limits<size_t>::max()) {
+    // free block not found: grow the heap
+    addr = heap_;
+    if (!arena_.empty()) {
+      const auto last = arena_.rbegin();
+      // When the highest free block reaches the end of the heap, extend it instead of
+      // appending after it, so its bytes become part of the new allocation.
+      if (last->first + last->second >= heap_) {
+        addr = last->first;
+        arena_.erase(addr);
+      }
+    }
+    heap_ = addr + size;
+  } else {
+    const size_t block_size = arena_[addr];
+    arena_.erase(addr);
+    // Return the unused tail of the chosen block to the free list.
+    if (block_size > size) {
+      arena_[addr + size] = block_size - size;
+    }
+  }
+  alloc_[addr] = size;
+  return addr;
+}
+
+// Returns the block previously handed out at `addr` to the free list and merges it with adjacent
+// free blocks. Addresses that are not currently allocated (unknown or already freed) are ignored:
+// looking them up with operator[] would insert a zero-length entry into both maps and corrupt the
+// free list.
+void OptAllocator::Free(size_t addr) {
+  auto live = alloc_.find(addr);
+  if (live == alloc_.end()) {
+    return;
+  }
+  arena_[addr] = live->second;
+  alloc_.erase(live);
+  Reorder(addr);
+}
+}  // namespace mindspore
--- a/mindspore/lite/src/train/opt_allocator.h
+++ b/mindspore/lite/src/train/opt_allocator.h
@ -0,0 +1,41 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_
+#define MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_
+
+#include <map>
+#include "include/api/allocator.h"
+
+namespace mindspore {
+// Offset planner: hands out offsets inside a single, not-yet-allocated buffer.
+// Malloc/Free operate on offsets only; total_size() reports how large the backing buffer must be
+// to satisfy every request made so far.
+class OptAllocator {
+ public:
+  // `aligned_size` is the granularity every request is rounded up to. Zero is treated as 1
+  // because the rounding divides by this value.
+  explicit OptAllocator(size_t aligned_size = 32) : align_size_(aligned_size == 0 ? 1 : aligned_size) {}
+  ~OptAllocator() = default;
+  // Reserves `size` bytes and returns the offset of the reserved block.
+  size_t Malloc(size_t size);
+  // Releases the block that was returned by Malloc at `offset`.
+  void Free(size_t offset);
+  // High-water mark of the planned address space, in bytes.
+  size_t total_size() const { return heap_; }
+
+ private:
+  size_t FindFree(size_t size);
+  void Reorder(size_t addr);
+  std::map<size_t, size_t> arena_;  // free blocks: offset -> size
+  std::map<size_t, size_t> alloc_;  // live allocations: offset -> size
+  size_t heap_ = 0;
+  size_t align_size_;
+};
+};      // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_
--- a/mindspore/lite/src/train/static_allocator.h
+++ b/mindspore/lite/src/train/static_allocator.h
@ -0,0 +1,52 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_
+#define MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_
+
+namespace mindspore {
+// Allocator that distinguishes pointers inside one externally provided buffer (registered with
+// SetContex) from ordinary heap pointers. Pointers inside the buffer are never freed here;
+// everything else is forwarded to malloc/free. Reference counting is not tracked.
+class StaticAllocator : public Allocator {
+ public:
+  // Registers the buffer [buf, buf + size) whose addresses are treated as statically allocated.
+  void SetContex(void *buf, size_t size) {
+    start_buf_ = buf;
+    size_ = size;
+  }
+  // Reference counts are not maintained; these always report 0.
+  int SetRefCount(void *ptr, int ref_count) override { return 0; }
+  int DecRefCount(void *ptr, int ref_count) override { return 0; }
+  int IncRefCount(void *ptr, int ref_count) override { return 0; }
+  // Sum of all sizes requested through Malloc (never reduced by Free).
+  size_t total_size() const { return total_size_; }
+  void Clear() {}
+  void *Malloc(size_t size) override {
+    total_size_ += size;
+    return malloc(size);
+  }
+  // Frees `ptr` unless it lies inside the registered buffer.
+  void Free(void *ptr) override {
+    if (RefCount(ptr) != 0) free(ptr);
+  }
+
+  // Returns STATIC_ALLOCATION for nullptr (used to identify this allocator type), 0 when `ptr`
+  // lies inside the registered buffer, and 1 otherwise.
+  int RefCount(void *ptr) override {
+    if (ptr == nullptr) return STATIC_ALLOCATION;
+    char *ptrc = reinterpret_cast<char *>(ptr);
+    char *bufc = reinterpret_cast<char *>(start_buf_);
+    // No buffer registered yet: every pointer is an ordinary heap pointer.
+    if (bufc == nullptr) return 1;
+    const bool outside = (ptrc < bufc) || (ptrc - bufc >= static_cast<ptrdiff_t>(size_));
+    return outside ? 1 : 0;
+  }
+
+ private:
+  // Initialized so RefCount/Free are well defined before SetContex is called.
+  void *start_buf_ = nullptr;
+  size_t size_ = 0;
+  size_t total_size_ = 0;
+};
+};      // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_
--- a/mindspore/lite/src/train/train_session.cc
+++ b/mindspore/lite/src/train/train_session.cc
@ -39,6 +39,8 @@
 #include "src/train/optimizer_kernel.h"
 #include "src/train/train_utils.h"
 #include "src/train/train_export.h"
+#include "src/train/opt_allocator.h"
+#include "src/train/static_allocator.h"
 #include "src/train/train_populate_parameter.h"
 #include "src/train/train_populate_parameter_v0.h"

@ -68,6 +70,7 @@ int TrainSession::Init(const Context *context, const TrainCfg *train_cfg) {
    }
    cfg_ = *train_cfg;
  }
+  allocator_ = context->allocator;
  return lite::LiteSession::Init(context);
 }

@ -159,6 +162,51 @@ int TrainSession::InitCallBack() {
  return RET_OK;
 }

+// Plans offsets for the output tensors of `kernels` inside one shared buffer and points each
+// planned tensor at its slot. Does nothing unless the session uses a static allocator.
+// Kernels are walked in the given order: each output is reserved when its producer is visited and
+// released once the number of consuming kernels seen equals the tensor's init_ref_count, so the
+// space can be reused by later outputs.
+// Returns RET_OK on success, RET_ERROR when the shared buffer cannot be allocated.
+int TrainSession::AllocTensors(const std::vector<kernel::LiteKernel *> &kernels) {
+  if (!IS_STATIC_ALLOCATOR(allocator_)) return RET_OK;
+  OptAllocator allocator;
+  std::unordered_map<lite::Tensor *, int> ref_count;
+  std::unordered_map<lite::Tensor *, size_t> offset_map;
+  for (auto kernel : kernels) {
+    for (auto tensor : kernel->out_tensors()) {
+      size_t size = tensor->Size();
+      size_t offset = allocator.Malloc(size);
+      offset_map[tensor] = offset;
+      ref_count[tensor] = tensor->init_ref_count();
+    }
+    for (auto tensor : kernel->in_tensors()) {
+      if (tensor->category() != lite::Tensor::VAR) {
+        continue;
+      }
+      // Inputs that no planned kernel produced have no slot to release; skip them instead of
+      // default-inserting entries into the maps.
+      auto counter = ref_count.find(tensor);
+      if (counter == ref_count.end()) {
+        continue;
+      }
+      counter->second -= 1;
+      if (counter->second == 0) {
+        allocator.Free(offset_map[tensor]);
+      }
+    }
+  }
+  // Set Tensor data
+  // NOTE(review): the buffer is sized from the first plan only and reused by later calls. If a
+  // later plan (e.g. after Train()/Eval() changed the ref counts) needs more bytes than the first
+  // one, the offsets below would exceed the buffer. Tracking the allocated size needs a new
+  // member, so it is not handled here - confirm and fix at class level.
+  if (tensors_data_ == nullptr) {
+    auto size = allocator.total_size();
+    // malloc(0) is allowed to return nullptr; request at least one byte so an empty plan is not
+    // mistaken for an allocation failure.
+    auto buf = malloc(size > 0 ? size : 1);
+    if (buf == nullptr) {
+      MS_LOG(ERROR) << "cannot allocate buffer size" << size;
+      return RET_ERROR;
+    }
+    // IS_STATIC_ALLOCATOR succeeded above, so the allocator is treated as a StaticAllocator;
+    // static_cast is the correct cast for a base-to-derived conversion.
+    StaticAllocator *alloc = static_cast<StaticAllocator *>(allocator_.get());
+    alloc->SetContex(buf, size);
+    tensors_data_ = buf;
+  }
+  // Bind every planned tensor (not only those reachable from train_kernels_) to its slot.
+  char *base = static_cast<char *>(tensors_data_);
+  for (const auto &planned : offset_map) {
+    planned.first->set_data(static_cast<void *>(base + planned.second));
+  }
+  return RET_OK;
+}
+
 // Unconditionally returns lite::RET_ERROR; `model` is ignored.
 int TrainSession::CompileGraph(lite::Model *model) {
  return lite::RET_ERROR;
 }

 int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) {
@ -194,10 +242,21 @@ int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) {
    MS_LOG(ERROR) << "failed to allocate space";
    return RET_ERROR;
  }
+  ret = AllocTensors(train_kernels_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "failed to allocate space";
+    return RET_ERROR;
+  }
  return RET_OK;
 }

-TrainSession::~TrainSession() { FreeWorkSpace(); }
+// Releases the workspace and, when one was allocated, the shared tensor data buffer.
+TrainSession::~TrainSession() {
+  FreeWorkSpace();
+  if (tensors_data_ == nullptr) {
+    return;
+  }
+  free(tensors_data_);
+  tensors_data_ = nullptr;
+}

 int TrainSession::ExecKernels(const KernelCallBack &before, const KernelCallBack &after,
                              const std::vector<kernel::LiteKernel *> &run_kernels) {
@ -420,6 +479,12 @@ int TrainSession::Train() {
      lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1);
    }
  }
+  // allocate tensors
+  auto ret = AllocTensors(train_kernels_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "failed to allocate tensor space";
+    return RET_ERROR;
+  }
  return RET_OK;
 }

@ -446,6 +511,11 @@ int TrainSession::Eval() {
      lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1);
    }
  }
+  auto ret = AllocTensors(inference_kernels_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "failed to allocate space";
+    return RET_ERROR;
+  }
  return RET_OK;
 }

@ -781,7 +851,12 @@ session::LiteSession *session::TrainSession::CreateTrainSession(const std::strin
    MS_LOG(ERROR) << "create session failed";
    return nullptr;
  }
-
+  if (context->allocator == nullptr) {
+    const_cast<lite::Context *>(context)->allocator = std::shared_ptr<Allocator>(new (std::nothrow) StaticAllocator());
+    if (context->allocator == nullptr) {
+      MS_LOG(ERROR) << " cannot convert to static allocation";
+    }
+  }
  auto ret = session->Init(context, cfg);
  if (ret != mindspore::lite::RET_OK) {
    MS_LOG(ERROR) << "init session failed";
--- a/mindspore/lite/src/train/train_session.h
+++ b/mindspore/lite/src/train/train_session.h
@ -147,6 +147,7 @@ class TrainSession : virtual public lite::LiteSession {
  void FreeRestoreTensors();
  bool AllInputsNeedScale(kernel::LiteKernel *kernel);
  void FreeWorkSpace();
+  int AllocTensors(const std::vector<kernel::LiteKernel *> &kernels);

  std::map<Tensor *, Tensor *> restored_origin_tensors_;
  int virtual_batch_idx_ = 0;
@ -155,6 +156,8 @@ class TrainSession : virtual public lite::LiteSession {
  void *workspace_ = nullptr;
  SchedCallBack sched_mix_precision_callback_;
  bool train_mode_ = false;
+  void *tensors_data_ = nullptr;
+  std::shared_ptr<Allocator> allocator_;
 };

 }  // namespace lite
--- a/mindspore/lite/tools/benchmark_train/net_train.cc
+++ b/mindspore/lite/tools/benchmark_train/net_train.cc
@ -603,7 +603,7 @@ int NetTrain::InitCallbackParameter() {
    }
    op_call_times_total_++;
    op_begin_ = GetTimeUs();
-    if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign")) {
+    if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign") || callParam.node_type == "SGD") {
      for (auto tensor : before_outputs) {
        std::fill(reinterpret_cast<int8_t *>(tensor->MutableData()),
                  reinterpret_cast<int8_t *>(tensor->MutableData()) + tensor->Size(), 0);