From 208c620ceaf18d2a299c0dea868b5b2162f7aeca Mon Sep 17 00:00:00 2001 From: yoni Date: Tue, 3 Aug 2021 16:42:58 +0300 Subject: [PATCH] memory optimization --- mindspore/lite/src/CMakeLists.txt | 1 + mindspore/lite/src/tensor.cc | 4 +- mindspore/lite/src/tensor.h | 6 +- mindspore/lite/src/train/opt_allocator.cc | 90 +++++++++++++++++++ mindspore/lite/src/train/opt_allocator.h | 41 +++++++++ mindspore/lite/src/train/static_allocator.h | 52 +++++++++++ mindspore/lite/src/train/train_session.cc | 79 +++++++++++++++- mindspore/lite/src/train/train_session.h | 3 + .../lite/tools/benchmark_train/net_train.cc | 2 +- 9 files changed, 272 insertions(+), 6 deletions(-) create mode 100644 mindspore/lite/src/train/opt_allocator.cc create mode 100644 mindspore/lite/src/train/opt_allocator.h create mode 100644 mindspore/lite/src/train/static_allocator.h diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index 571714c701b..9d186047eb5 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -133,6 +133,7 @@ set(TRAIN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/train/accuracy_monitor.cc ${CMAKE_CURRENT_SOURCE_DIR}/train/classification_train_accuracy_monitor.cc ${CMAKE_CURRENT_SOURCE_DIR}/train/train_export.cc + ${CMAKE_CURRENT_SOURCE_DIR}/train/opt_allocator.cc ${CMAKE_CURRENT_SOURCE_DIR}/../tools/common/storage.cc ) if(ENABLE_V0) diff --git a/mindspore/lite/src/tensor.cc b/mindspore/lite/src/tensor.cc index 93822eb96e3..8dc10b2e0af 100644 --- a/mindspore/lite/src/tensor.cc +++ b/mindspore/lite/src/tensor.cc @@ -316,7 +316,9 @@ void Tensor::FreeData() { this->data_ = nullptr; } else { allocator_->Free(this->data_); - this->data_ = nullptr; + if (!IS_STATIC_ALLOCATOR(allocator_) || (allocator_->RefCount(this->data_) != 0)) { + this->data_ = nullptr; + } } } diff --git a/mindspore/lite/src/tensor.h b/mindspore/lite/src/tensor.h index 1933aeec957..92b058df935 100644 --- a/mindspore/lite/src/tensor.h +++ 
b/mindspore/lite/src/tensor.h @@ -34,12 +34,15 @@ namespace mindspore { namespace lite { + +#define STATIC_ALLOCATION -271964 +#define IS_STATIC_ALLOCATOR(allocator) ((allocator != nullptr) && (allocator->RefCount(nullptr) == STATIC_ALLOCATION)) struct LiteQuantParam { double scale; int32_t zeroPoint; float var_corr{1}; float mean_corr{0}; - bool inited; + bool inited{false}; std::vector<float> clusters{}; int bitNum; int roundType; @@ -133,7 +136,6 @@ class Tensor : public mindspore::tensor::MSTensor { void set_format(mindspore::Format format) override { this->format_ = format; } mindspore::Format format() const override { return this->format_; } - virtual int ref_count() const { return ref_count_; } virtual int init_ref_count() const { return this->init_ref_count_; } diff --git a/mindspore/lite/src/train/opt_allocator.cc b/mindspore/lite/src/train/opt_allocator.cc new file mode 100644 index 00000000000..5e4ec9edf1a --- /dev/null +++ b/mindspore/lite/src/train/opt_allocator.cc @@ -0,0 +1,90 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "src/train/opt_allocator.h" +#include <limits> +#include "nnacl/op_base.h" + +namespace mindspore { + +size_t OptAllocator::FindFree(size_t size) { + size_t min_size = std::numeric_limits<size_t>::max(); + size_t min_addr = std::numeric_limits<size_t>::max(); + for (auto const &itr : arena_) { + // best fit + if (itr.second >= size) { + if (min_size > itr.second) { + min_size = itr.second; + min_addr = itr.first; + } + } + } + return min_addr; +} + +void OptAllocator::Reorder(size_t addr) { + size_t length = arena_[addr]; + size_t post = addr + length; + // connect to upper block + auto it = arena_.find(post); + if (it != arena_.end()) { + size_t post_size = it->second; + arena_[addr] = length + post_size; + arena_.erase(post); + } + // connect to lower block + auto itr = arena_.lower_bound(addr); + if (itr != arena_.begin()) { + itr--; + size_t last = itr->first; + if ((last + arena_[last]) == addr) { + arena_[last] = arena_[last] + arena_[addr]; + arena_.erase(addr); + } + } +} + +size_t OptAllocator::Malloc(size_t size) { + size = UP_DIV(size, align_size_) * align_size_; + size_t addr = FindFree(size); + // free block not found + if (addr == std::numeric_limits<size_t>::max()) { + if (!arena_.empty()) { + addr = arena_.rbegin()->first; + if (addr + arena_[addr] < heap_) { + addr = heap_; + } else { + arena_.erase(addr); + } + } else { + addr = heap_; + } + heap_ = addr + size; + } else { + if (arena_[addr] > size) { + arena_[addr + size] = arena_[addr] - size; + } + arena_.erase(addr); + } + alloc_[addr] = size; + return addr; +} + +void OptAllocator::Free(size_t addr) { + arena_[addr] = alloc_[addr]; + alloc_.erase(addr); + Reorder(addr); +} +} // namespace mindspore diff --git a/mindspore/lite/src/train/opt_allocator.h b/mindspore/lite/src/train/opt_allocator.h new file mode 100644 index 00000000000..142165c33aa --- /dev/null +++ b/mindspore/lite/src/train/opt_allocator.h @@ -0,0 +1,41 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_ +#define MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_ + +#include <map> +#include "include/api/allocator.h" + +namespace mindspore { +class OptAllocator { + public: + explicit OptAllocator(size_t aligned_size = 32) : align_size_(aligned_size) {} + ~OptAllocator() {} + size_t Malloc(size_t size); + void Free(size_t offset); + size_t total_size() { return heap_; } + + private: + size_t FindFree(size_t size); + void Reorder(size_t addr); + std::map<size_t, size_t> arena_; + std::map<size_t, size_t> alloc_; + size_t heap_ = 0; + size_t align_size_; +}; +}; // namespace mindspore +#endif // MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_ diff --git a/mindspore/lite/src/train/static_allocator.h b/mindspore/lite/src/train/static_allocator.h new file mode 100644 index 00000000000..d78e13ba905 --- /dev/null +++ b/mindspore/lite/src/train/static_allocator.h @@ -0,0 +1,52 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_ +#define MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_ +
+namespace mindspore { +class StaticAllocator : public Allocator { + public: + void SetContex(void *buf, size_t size) { + start_buf_ = buf; + size_ = size; + } + int SetRefCount(void *ptr, int ref_count) override { return 0; } + int DecRefCount(void *ptr, int ref_count) override { return 0; } + int IncRefCount(void *ptr, int ref_count) override { return 0; } + size_t total_size() { return total_size_; } + void Clear() {} + void *Malloc(size_t size) override { + total_size_ += size; + return malloc(size); + } + void Free(void *ptr) override { + if (RefCount(ptr) != 0) free(ptr); + } + + int RefCount(void *ptr) override { + if (ptr == nullptr) return STATIC_ALLOCATION; + char *ptrc = reinterpret_cast<char *>(ptr); + char *bufc = reinterpret_cast<char *>(start_buf_); + return ((ptrc < bufc) || (ptrc - bufc >= static_cast<ptrdiff_t>(size_)) ? 
1 : 0); + } + + private: + void *start_buf_; + size_t size_; + size_t total_size_ = 0; +}; +}; // namespace mindspore +#endif // MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_ diff --git a/mindspore/lite/src/train/train_session.cc b/mindspore/lite/src/train/train_session.cc index 2cf97f2718d..d6601eafcde 100644 --- a/mindspore/lite/src/train/train_session.cc +++ b/mindspore/lite/src/train/train_session.cc @@ -39,6 +39,8 @@ #include "src/train/optimizer_kernel.h" #include "src/train/train_utils.h" #include "src/train/train_export.h" +#include "src/train/opt_allocator.h" +#include "src/train/static_allocator.h" #include "src/train/train_populate_parameter.h" #include "src/train/train_populate_parameter_v0.h" @@ -68,6 +70,7 @@ int TrainSession::Init(const Context *context, const TrainCfg *train_cfg) { } cfg_ = *train_cfg; } + allocator_ = context->allocator; return lite::LiteSession::Init(context); } @@ -159,6 +162,51 @@ int TrainSession::InitCallBack() { return RET_OK; } +int TrainSession::AllocTensors(const std::vector<kernel::LiteKernel *> &kernels) { + if (!IS_STATIC_ALLOCATOR(allocator_)) return RET_OK; + OptAllocator allocator; + std::unordered_map<lite::Tensor *, int> ref_count; + std::unordered_map<lite::Tensor *, size_t> offset_map; + for (auto kernel : kernels) { + for (auto tensor : kernel->out_tensors()) { + size_t size = tensor->Size(); + size_t offset = allocator.Malloc(size); + offset_map[tensor] = offset; + ref_count[tensor] = tensor->init_ref_count(); + } + for (auto tensor : kernel->in_tensors()) { + if (tensor->category() == lite::Tensor::VAR) { + int count = ref_count[tensor] - 1; + ref_count[tensor] = count; + if (count == 0) { + allocator.Free(offset_map[tensor]); + } + } + } + } + // Set Tensor data + if (tensors_data_ == nullptr) { + auto size = allocator.total_size(); + auto buf = malloc(size); + if (buf == nullptr) { + MS_LOG(ERROR) << "cannot allocate buffer size" << size; + return RET_ERROR; + } + StaticAllocator *alloc = reinterpret_cast<StaticAllocator *>(allocator_.get()); + alloc->SetContex(buf, size); + tensors_data_ = 
buf; + } + for (auto kernel : train_kernels_) { + for (auto tensor : kernel->out_tensors()) { + auto it = offset_map.find(tensor); + if (it != offset_map.end()) { + tensor->set_data(reinterpret_cast<void *>(reinterpret_cast<char *>(tensors_data_) + it->second)); + } + } + } + return RET_OK; +} + int TrainSession::CompileGraph(lite::Model *model) { return lite::RET_ERROR; } int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) { @@ -194,10 +242,21 @@ int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) { MS_LOG(ERROR) << "failed to allocate space"; return RET_ERROR; } + ret = AllocTensors(train_kernels_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "failed to allocate space"; + return RET_ERROR; + } return RET_OK; } -TrainSession::~TrainSession() { FreeWorkSpace(); } +TrainSession::~TrainSession() { + FreeWorkSpace(); + if (tensors_data_ != nullptr) { + free(tensors_data_); + tensors_data_ = nullptr; + } +} int TrainSession::ExecKernels(const KernelCallBack &before, const KernelCallBack &after, const std::vector<kernel::LiteKernel *> &run_kernels) { @@ -420,6 +479,12 @@ int TrainSession::Train() { lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1); } } + // allocate tensors + auto ret = AllocTensors(train_kernels_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "failed to allocate tensor space"; + return RET_ERROR; + } return RET_OK; } @@ -446,6 +511,11 @@ int TrainSession::Eval() { lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1); } } + auto ret = AllocTensors(inference_kernels_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "failed to allocate space"; + return RET_ERROR; + } return RET_OK; } @@ -781,7 +851,12 @@ session::LiteSession *session::TrainSession::CreateTrainSession(const std::strin MS_LOG(ERROR) << "create session failed"; return nullptr; } - + if (context->allocator == nullptr) { + const_cast<Context *>(context)->allocator = std::shared_ptr<Allocator>(new (std::nothrow) StaticAllocator()); + if (context->allocator == nullptr) { + MS_LOG(ERROR) << " cannot convert to static 
allocation"; + } + } auto ret = session->Init(context, cfg); if (ret != mindspore::lite::RET_OK) { MS_LOG(ERROR) << "init session failed"; diff --git a/mindspore/lite/src/train/train_session.h b/mindspore/lite/src/train/train_session.h index a21ab9f07f3..257d29180cc 100644 --- a/mindspore/lite/src/train/train_session.h +++ b/mindspore/lite/src/train/train_session.h @@ -147,6 +147,7 @@ class TrainSession : virtual public lite::LiteSession { void FreeRestoreTensors(); bool AllInputsNeedScale(kernel::LiteKernel *kernel); void FreeWorkSpace(); + int AllocTensors(const std::vector<kernel::LiteKernel *> &kernels); std::map<Tensor *, Tensor *> restored_origin_tensors_; int virtual_batch_idx_ = 0; @@ -155,6 +156,8 @@ class TrainSession : virtual public lite::LiteSession { void *workspace_ = nullptr; SchedCallBack sched_mix_precision_callback_; bool train_mode_ = false; + void *tensors_data_ = nullptr; + std::shared_ptr<Allocator> allocator_; }; } // namespace lite diff --git a/mindspore/lite/tools/benchmark_train/net_train.cc b/mindspore/lite/tools/benchmark_train/net_train.cc index da96eaf47b2..76164f076a3 100644 --- a/mindspore/lite/tools/benchmark_train/net_train.cc +++ b/mindspore/lite/tools/benchmark_train/net_train.cc @@ -603,7 +603,7 @@ int NetTrain::InitCallbackParameter() { } op_call_times_total_++; op_begin_ = GetTimeUs(); - if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign")) { + if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign") || callParam.node_type == "SGD") { for (auto tensor : before_outputs) { std::fill(reinterpret_cast<uint8_t *>(tensor->MutableData()), reinterpret_cast<uint8_t *>(tensor->MutableData()) + tensor->Size(), 0);