From 208c620ceaf18d2a299c0dea868b5b2162f7aeca Mon Sep 17 00:00:00 2001 From: yoni Date: Tue, 3 Aug 2021 16:42:58 +0300 Subject: [PATCH] memory optimization --- mindspore/lite/src/CMakeLists.txt | 1 + mindspore/lite/src/tensor.cc | 4 +- mindspore/lite/src/tensor.h | 6 +- mindspore/lite/src/train/opt_allocator.cc | 90 +++++++++++++++++++ mindspore/lite/src/train/opt_allocator.h | 41 +++++++++ mindspore/lite/src/train/static_allocator.h | 52 +++++++++++ mindspore/lite/src/train/train_session.cc | 79 +++++++++++++++- mindspore/lite/src/train/train_session.h | 3 + .../lite/tools/benchmark_train/net_train.cc | 2 +- 9 files changed, 272 insertions(+), 6 deletions(-) create mode 100644 mindspore/lite/src/train/opt_allocator.cc create mode 100644 mindspore/lite/src/train/opt_allocator.h create mode 100644 mindspore/lite/src/train/static_allocator.h diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index 571714c701b..9d186047eb5 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -133,6 +133,7 @@ set(TRAIN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/train/accuracy_monitor.cc ${CMAKE_CURRENT_SOURCE_DIR}/train/classification_train_accuracy_monitor.cc ${CMAKE_CURRENT_SOURCE_DIR}/train/train_export.cc + ${CMAKE_CURRENT_SOURCE_DIR}/train/opt_allocator.cc ${CMAKE_CURRENT_SOURCE_DIR}/../tools/common/storage.cc ) if(ENABLE_V0) diff --git a/mindspore/lite/src/tensor.cc b/mindspore/lite/src/tensor.cc index 93822eb96e3..8dc10b2e0af 100644 --- a/mindspore/lite/src/tensor.cc +++ b/mindspore/lite/src/tensor.cc @@ -316,7 +316,9 @@ void Tensor::FreeData() { this->data_ = nullptr; } else { allocator_->Free(this->data_); - this->data_ = nullptr; + if (!IS_STATIC_ALLOCATOR(allocator_) || (allocator_->RefCount(this->data_) != 0)) { + this->data_ = nullptr; + } } } diff --git a/mindspore/lite/src/tensor.h b/mindspore/lite/src/tensor.h index 1933aeec957..92b058df935 100644 --- a/mindspore/lite/src/tensor.h +++ 
b/mindspore/lite/src/tensor.h @@ -34,12 +34,15 @@ namespace mindspore { namespace lite { + +#define STATIC_ALLOCATION -271964 +#define IS_STATIC_ALLOCATOR(allocator) ((allocator != nullptr) && (allocator->RefCount(nullptr) == STATIC_ALLOCATION)) struct LiteQuantParam { double scale; int32_t zeroPoint; float var_corr{1}; float mean_corr{0}; - bool inited; + bool inited{false}; std::vector<float> clusters{}; int bitNum; int roundType; @@ -133,7 +136,6 @@ class Tensor : public mindspore::tensor::MSTensor { void set_format(mindspore::Format format) override { this->format_ = format; } mindspore::Format format() const override { return this->format_; } - virtual int ref_count() const { return ref_count_; } virtual int init_ref_count() const { return this->init_ref_count_; } diff --git a/mindspore/lite/src/train/opt_allocator.cc b/mindspore/lite/src/train/opt_allocator.cc new file mode 100644 index 00000000000..5e4ec9edf1a --- /dev/null +++ b/mindspore/lite/src/train/opt_allocator.cc @@ -0,0 +1,90 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "src/train/opt_allocator.h" +#include <limits> +#include "nnacl/op_base.h" + +namespace mindspore { + +size_t OptAllocator::FindFree(size_t size) { + size_t min_size = std::numeric_limits<size_t>::max(); + size_t min_addr = std::numeric_limits<size_t>::max(); + for (auto const &itr : arena_) { + // best fit + if (itr.second >= size) { + if (min_size > itr.second) { + min_size = itr.second; + min_addr = itr.first; + } + } + } + return min_addr; +} + +void OptAllocator::Reorder(size_t addr) { + size_t length = arena_[addr]; + size_t post = addr + length; + // connect to upper block + auto it = arena_.find(post); + if (it != arena_.end()) { + size_t post_size = it->second; + arena_[addr] = length + post_size; + arena_.erase(post); + } + // connect to lower block + auto itr = arena_.lower_bound(addr); + if (itr != arena_.begin()) { + itr--; + size_t last = itr->first; + if ((last + arena_[last]) == addr) { + arena_[last] = arena_[last] + arena_[addr]; + arena_.erase(addr); + } + } +} + +size_t OptAllocator::Malloc(size_t size) { + size = UP_DIV(size, align_size_) * align_size_; + size_t addr = FindFree(size); + // free block not found + if (addr == std::numeric_limits<size_t>::max()) { + if (!arena_.empty()) { + addr = arena_.rbegin()->first; + if (addr + arena_[addr] < heap_) { + addr = heap_; + } else { + arena_.erase(addr); + } + } else { + addr = heap_; + } + heap_ = addr + size; + } else { + if (arena_[addr] > size) { + arena_[addr + size] = arena_[addr] - size; + } + arena_.erase(addr); + } + alloc_[addr] = size; + return addr; +} + +void OptAllocator::Free(size_t addr) { + arena_[addr] = alloc_[addr]; + alloc_.erase(addr); + Reorder(addr); +} +} // namespace mindspore diff --git a/mindspore/lite/src/train/opt_allocator.h b/mindspore/lite/src/train/opt_allocator.h new file mode 100644 index 00000000000..142165c33aa --- /dev/null +++ b/mindspore/lite/src/train/opt_allocator.h @@ -0,0 +1,41 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_ +#define MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_ + +#include <map> +#include "include/api/allocator.h" + +namespace mindspore { +class OptAllocator { + public: + explicit OptAllocator(size_t aligned_size = 32) : align_size_(aligned_size) {} + ~OptAllocator() {} + size_t Malloc(size_t size); + void Free(size_t offset); + size_t total_size() { return heap_; } + + private: + size_t FindFree(size_t size); + void Reorder(size_t addr); + std::map<size_t, size_t> arena_; + std::map<size_t, size_t> alloc_; + size_t heap_ = 0; + size_t align_size_; +}; +}; // namespace mindspore +#endif // MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_ diff --git a/mindspore/lite/src/train/static_allocator.h b/mindspore/lite/src/train/static_allocator.h new file mode 100644 index 00000000000..d78e13ba905 --- /dev/null +++ b/mindspore/lite/src/train/static_allocator.h @@ -0,0 +1,52 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_ +#define MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_ +
+namespace mindspore { +class StaticAllocator : public Allocator { + public: + void SetContex(void *buf, size_t size) { + start_buf_ = buf; + size_ = size; + } + int SetRefCount(void *ptr, int ref_count) override { return 0; } + int DecRefCount(void *ptr, int ref_count) override { return 0; } + int IncRefCount(void *ptr, int ref_count) override { return 0; } + size_t total_size() { return total_size_; } + void Clear() {} + void *Malloc(size_t size) override { + total_size_ += size; + return malloc(size); + } + void Free(void *ptr) override { + if (RefCount(ptr) != 0) free(ptr); + } + + int RefCount(void *ptr) override { + if (ptr == nullptr) return STATIC_ALLOCATION; + char *ptrc = reinterpret_cast<char *>(ptr); + char *bufc = reinterpret_cast<char *>(start_buf_); + return ((ptrc < bufc) || (ptrc - bufc >= static_cast<ptrdiff_t>(size_)) ? 
1 : 0); + } + + private: + void *start_buf_; + size_t size_; + size_t total_size_ = 0; +}; +}; // namespace mindspore +#endif // MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_ diff --git a/mindspore/lite/src/train/train_session.cc b/mindspore/lite/src/train/train_session.cc index 2cf97f2718d..d6601eafcde 100644 --- a/mindspore/lite/src/train/train_session.cc +++ b/mindspore/lite/src/train/train_session.cc @@ -39,6 +39,8 @@ #include "src/train/optimizer_kernel.h" #include "src/train/train_utils.h" #include "src/train/train_export.h" +#include "src/train/opt_allocator.h" +#include "src/train/static_allocator.h" #include "src/train/train_populate_parameter.h" #include "src/train/train_populate_parameter_v0.h" @@ -68,6 +70,7 @@ int TrainSession::Init(const Context *context, const TrainCfg *train_cfg) { } cfg_ = *train_cfg; } + allocator_ = context->allocator; return lite::LiteSession::Init(context); } @@ -159,6 +162,51 @@ int TrainSession::InitCallBack() { return RET_OK; } +int TrainSession::AllocTensors(const std::vector<kernel::LiteKernel *> &kernels) { + if (!IS_STATIC_ALLOCATOR(allocator_)) return RET_OK; + OptAllocator allocator; + std::unordered_map<lite::Tensor *, int> ref_count; + std::unordered_map<lite::Tensor *, size_t> offset_map; + for (auto kernel : kernels) { + for (auto tensor : kernel->out_tensors()) { + size_t size = tensor->Size(); + size_t offset = allocator.Malloc(size); + offset_map[tensor] = offset; + ref_count[tensor] = tensor->init_ref_count(); + } + for (auto tensor : kernel->in_tensors()) { + if (tensor->category() == lite::Tensor::VAR) { + int count = ref_count[tensor] - 1; + ref_count[tensor] = count; + if (count == 0) { + allocator.Free(offset_map[tensor]); + } + } + } + } + // Set Tensor data + if (tensors_data_ == nullptr) { + auto size = allocator.total_size(); + auto buf = malloc(size); + if (buf == nullptr) { + MS_LOG(ERROR) << "cannot allocate buffer size" << size; + return RET_ERROR; + } + StaticAllocator *alloc = reinterpret_cast<StaticAllocator *>(allocator_.get()); + alloc->SetContex(buf, size); + tensors_data_ = 
buf; + } + for (auto kernel : train_kernels_) { + for (auto tensor : kernel->out_tensors()) { + auto it = offset_map.find(tensor); + if (it != offset_map.end()) { + tensor->set_data(reinterpret_cast<void *>(reinterpret_cast<char *>(tensors_data_) + it->second)); + } + } + } + return RET_OK; +} + int TrainSession::CompileGraph(lite::Model *model) { return lite::RET_ERROR; } int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) { @@ -194,10 +242,21 @@ int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) { MS_LOG(ERROR) << "failed to allocate space"; return RET_ERROR; } + ret = AllocTensors(train_kernels_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "failed to allocate space"; + return RET_ERROR; + } return RET_OK; } -TrainSession::~TrainSession() { FreeWorkSpace(); } +TrainSession::~TrainSession() { + FreeWorkSpace(); + if (tensors_data_ != nullptr) { + free(tensors_data_); + tensors_data_ = nullptr; + } +} int TrainSession::ExecKernels(const KernelCallBack &before, const KernelCallBack &after, const std::vector<kernel::LiteKernel *> &run_kernels) { @@ -420,6 +479,12 @@ int TrainSession::Train() { lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1); } } + // allocate tensors + auto ret = AllocTensors(train_kernels_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "failed to allocate tensor space"; + return RET_ERROR; + } return RET_OK; } @@ -446,6 +511,11 @@ int TrainSession::Eval() { lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1); } } + auto ret = AllocTensors(inference_kernels_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "failed to allocate space"; + return RET_ERROR; + } return RET_OK; } @@ -781,7 +851,12 @@ session::LiteSession *session::TrainSession::CreateTrainSession(const std::strin MS_LOG(ERROR) << "create session failed"; return nullptr; } - + if (context->allocator == nullptr) { + const_cast<Context *>(context)->allocator = std::shared_ptr<Allocator>(new (std::nothrow) StaticAllocator()); + if (context->allocator == nullptr) { + MS_LOG(ERROR) << " cannot convert to static 
allocation"; + } + } auto ret = session->Init(context, cfg); if (ret != mindspore::lite::RET_OK) { MS_LOG(ERROR) << "init session failed"; diff --git a/mindspore/lite/src/train/train_session.h b/mindspore/lite/src/train/train_session.h index a21ab9f07f3..257d29180cc 100644 --- a/mindspore/lite/src/train/train_session.h +++ b/mindspore/lite/src/train/train_session.h @@ -147,6 +147,7 @@ class TrainSession : virtual public lite::LiteSession { void FreeRestoreTensors(); bool AllInputsNeedScale(kernel::LiteKernel *kernel); void FreeWorkSpace(); + int AllocTensors(const std::vector<kernel::LiteKernel *> &kernels); std::map<Tensor *, Tensor *> restored_origin_tensors_; int virtual_batch_idx_ = 0; @@ -155,6 +156,8 @@ class TrainSession : virtual public lite::LiteSession { void *workspace_ = nullptr; SchedCallBack sched_mix_precision_callback_; bool train_mode_ = false; + void *tensors_data_ = nullptr; + std::shared_ptr<Allocator> allocator_; }; } // namespace lite diff --git a/mindspore/lite/tools/benchmark_train/net_train.cc b/mindspore/lite/tools/benchmark_train/net_train.cc index da96eaf47b2..76164f076a3 100644 --- a/mindspore/lite/tools/benchmark_train/net_train.cc +++ b/mindspore/lite/tools/benchmark_train/net_train.cc @@ -603,7 +603,7 @@ int NetTrain::InitCallbackParameter() { } op_call_times_total_++; op_begin_ = GetTimeUs(); - if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign")) { + if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign") || callParam.node_type == "SGD") { for (auto tensor : before_outputs) { std::fill(reinterpret_cast<uint8_t *>(tensor->MutableData()), reinterpret_cast<uint8_t *>(tensor->MutableData()) + tensor->Size(), 0);