memory optimization

This commit is contained in:
yoni 2021-08-03 16:42:58 +03:00
parent b917ceca36
commit 208c620cea
9 changed files with 272 additions and 6 deletions

View File

@ -133,6 +133,7 @@ set(TRAIN_SRC
${CMAKE_CURRENT_SOURCE_DIR}/train/accuracy_monitor.cc
${CMAKE_CURRENT_SOURCE_DIR}/train/classification_train_accuracy_monitor.cc
${CMAKE_CURRENT_SOURCE_DIR}/train/train_export.cc
${CMAKE_CURRENT_SOURCE_DIR}/train/opt_allocator.cc
${CMAKE_CURRENT_SOURCE_DIR}/../tools/common/storage.cc
)
if(ENABLE_V0)

View File

@ -316,8 +316,10 @@ void Tensor::FreeData() {
this->data_ = nullptr;
} else {
allocator_->Free(this->data_);
if (!IS_STATIC_ALLOCATOR(allocator_) || (allocator_->RefCount(this->data_) != 0)) {
this->data_ = nullptr;
}
}
}
void *Tensor::ReallocData() {

View File

@ -34,12 +34,15 @@
namespace mindspore {
namespace lite {
// Magic value that IS_STATIC_ALLOCATOR expects RefCount(nullptr) to return when the
// allocator is a static one. Parenthesized so the leading minus cannot bind to
// neighbouring tokens after expansion.
#define STATIC_ALLOCATION (-271964)
// Evaluates to true when `allocator` is non-null and its RefCount(nullptr) equals
// STATIC_ALLOCATION. The argument is parenthesized so compound expressions (for example a
// conditional expression) expand correctly. Note that it is evaluated twice, so it must
// not have side effects.
#define IS_STATIC_ALLOCATOR(allocator) \
  (((allocator) != nullptr) && ((allocator)->RefCount(nullptr) == STATIC_ALLOCATION))
struct LiteQuantParam {
double scale;
int32_t zeroPoint;
float var_corr{1};
float mean_corr{0};
bool inited;
bool inited{false};
std::vector<float> clusters{};
int bitNum;
int roundType;
@ -133,7 +136,6 @@ class Tensor : public mindspore::tensor::MSTensor {
void set_format(mindspore::Format format) override { this->format_ = format; }
mindspore::Format format() const override { return this->format_; }
virtual int ref_count() const { return ref_count_; }
virtual int init_ref_count() const { return this->init_ref_count_; }

View File

@ -0,0 +1,90 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/train/opt_allocator.h"
#include <limits>
#include "nnacl/op_base.h"
namespace mindspore {
// Best-fit search over the free list.
// Walks every entry of arena_ (start address -> length) and returns the start address of
// the smallest block whose length is at least `size`. When several blocks share that
// length, the first one met during the walk is kept. Returns
// std::numeric_limits<size_t>::max() when no block is large enough.
size_t OptAllocator::FindFree(size_t size) {
  const size_t not_found = std::numeric_limits<size_t>::max();
  size_t best_addr = not_found;
  size_t best_len = not_found;
  for (auto block = arena_.begin(); block != arena_.end(); ++block) {
    const size_t len = block->second;
    const bool fits = len >= size;
    const bool tighter = len < best_len;
    if (fits && tighter) {
      best_len = len;
      best_addr = block->first;
    }
  }
  return best_addr;
}
// Coalesces the free block that starts at `addr` with its neighbours in arena_.
// First the block that begins exactly where this one ends is absorbed into it; then, if
// the closest block below ends exactly at `addr`, this block is absorbed into that one.
void OptAllocator::Reorder(size_t addr) {
  const size_t block_len = arena_[addr];

  // Merge with the block that starts right after this one.
  const auto above = arena_.find(addr + block_len);
  if (above != arena_.end()) {
    arena_[addr] = block_len + above->second;
    arena_.erase(above);
  }

  // Merge into the block that ends right where this one starts.
  auto below = arena_.lower_bound(addr);
  if (below == arena_.begin()) {
    return;
  }
  --below;
  if (below->first + below->second != addr) {
    return;
  }
  below->second += arena_[addr];
  arena_.erase(addr);
}
// Reserves `size` bytes (rounded up to a multiple of align_size_) in the planned address
// space and returns the offset of the reservation.
//
// A free block is taken from arena_ when FindFree() finds one; any unused tail of that
// block stays in arena_. Otherwise the heap grows: if the highest free block reaches the
// current heap end, the reservation starts at that block (which is consumed); if not, it
// starts at heap_. The reservation is recorded in alloc_ so Free() can look up its length.
size_t OptAllocator::Malloc(size_t size) {
  // A zero-byte request must still own a slot. Without this, the reservation has length
  // 0, so the next allocation can be handed the same offset and overwrite this alloc_
  // entry, and a matching free block would be erased from arena_ without being used.
  if (size == 0) {
    size = 1;
  }
  size = UP_DIV(size, align_size_) * align_size_;

  size_t addr = FindFree(size);
  if (addr == std::numeric_limits<size_t>::max()) {
    // No free block is large enough: extend the heap.
    addr = heap_;
    if (!arena_.empty()) {
      const auto tail = arena_.rbegin();
      if (tail->first + tail->second >= heap_) {
        // The highest free block touches the heap end, so grow from its start.
        addr = tail->first;
        arena_.erase(addr);
      }
    }
    heap_ = addr + size;
  } else {
    // Carve the request out of the front of the chosen block and keep the remainder free.
    const auto hole = arena_.find(addr);
    const size_t hole_len = hole->second;
    arena_.erase(hole);
    if (hole_len > size) {
      arena_[addr + size] = hole_len - size;
    }
  }
  alloc_[addr] = size;
  return addr;
}
// Returns the reservation that starts at `addr` to the free list and merges it with
// adjacent free blocks.
//
// An address that is not a live reservation (never returned by Malloc, or already freed)
// is ignored. Using operator[] for the lookup would insert a zero-length entry into
// alloc_ and write a zero-length block into arena_, which can overwrite the length of a
// real free block that starts at the same address.
void OptAllocator::Free(size_t addr) {
  const auto live = alloc_.find(addr);
  if (live == alloc_.end()) {
    return;
  }
  arena_[addr] = live->second;
  alloc_.erase(live);
  Reorder(addr);
}
} // namespace mindspore

View File

@ -0,0 +1,41 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_
#define MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_
#include <map>
#include "include/api/allocator.h"
namespace mindspore {
// Offset planner: hands out offsets inside a virtual, contiguous address space instead of
// real memory. Malloc()/Free() work on offsets; total_size() reports the high-water mark
// (heap_), which a caller can use to size one real buffer.
class OptAllocator {
 public:
  // `aligned_size` is the granularity reservations are rounded to. A value of 0 is
  // treated as 1 (no alignment).
  // NOTE(review): Malloc rounds with UP_DIV(size, align_size_), which presumably divides
  // by the alignment, so 0 must never reach it.
  explicit OptAllocator(size_t aligned_size = 32) : align_size_(aligned_size == 0 ? 1 : aligned_size) {}
  ~OptAllocator() = default;

  // Reserves `size` bytes and returns the offset of the reservation.
  size_t Malloc(size_t size);
  // Releases the reservation that starts at `offset`.
  void Free(size_t offset);
  // Current value of heap_.
  size_t total_size() const { return heap_; }

 private:
  size_t FindFree(size_t size);
  void Reorder(size_t addr);

  std::map<size_t, size_t> arena_;  // free blocks: start offset -> length
  std::map<size_t, size_t> alloc_;  // live reservations: start offset -> length
  size_t heap_ = 0;
  size_t align_size_;
};
}; // namespace mindspore
#endif // MINDSPORE_LITE_SRC_TRAIN_OPT_ALLOCATOR_H_

View File

@ -0,0 +1,52 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_
#define MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_
namespace mindspore {
// Allocator that distinguishes pointers inside one externally supplied buffer (registered
// with SetContex) from pointers it obtained itself through malloc().
//  - RefCount(nullptr) returns STATIC_ALLOCATION.
//  - RefCount(ptr) returns 0 for a pointer inside the registered buffer, 1 otherwise.
//  - Free(ptr) only calls free() on pointers outside the registered buffer.
//  - The ref-count setters do nothing and return 0.
class StaticAllocator : public Allocator {
 public:
  // Registers the buffer [buf, buf + size) as the static region.
  void SetContex(void *buf, size_t size) {
    start_buf_ = buf;
    size_ = size;
  }
  int SetRefCount(void *ptr, int ref_count) override { return 0; }
  int DecRefCount(void *ptr, int ref_count) override { return 0; }
  int IncRefCount(void *ptr, int ref_count) override { return 0; }
  // Sum of all sizes requested through Malloc(); it is never decreased.
  size_t total_size() { return total_size_; }
  void Clear() {}
  // Allocations requested through this interface come from the C heap.
  void *Malloc(size_t size) override {
    total_size_ += size;
    return malloc(size);
  }
  // Memory inside the registered buffer is left alone; anything else goes to free().
  // free(nullptr) is a no-op, so a null pointer is harmless here.
  void Free(void *ptr) override {
    if (RefCount(ptr) != 0) free(ptr);
  }
  int RefCount(void *ptr) override {
    if (ptr == nullptr) return STATIC_ALLOCATION;
    // No buffer registered yet, so nothing can be inside it. Returning early also avoids
    // pointer arithmetic against a null base.
    if (start_buf_ == nullptr) return 1;
    char *ptrc = reinterpret_cast<char *>(ptr);
    char *bufc = reinterpret_cast<char *>(start_buf_);
    return ((ptrc < bufc) || (ptrc - bufc >= static_cast<ptrdiff_t>(size_)) ? 1 : 0);
  }

 private:
  // Initialized so that RefCount()/Free() are well defined before SetContex() is called.
  void *start_buf_ = nullptr;
  size_t size_ = 0;
  size_t total_size_ = 0;
};
}; // namespace mindspore
#endif // MINDSPORE_LITE_SRC_TRAIN_STATIC_ALLOCATOR_H_

View File

@ -39,6 +39,8 @@
#include "src/train/optimizer_kernel.h"
#include "src/train/train_utils.h"
#include "src/train/train_export.h"
#include "src/train/opt_allocator.h"
#include "src/train/static_allocator.h"
#include "src/train/train_populate_parameter.h"
#include "src/train/train_populate_parameter_v0.h"
@ -68,6 +70,7 @@ int TrainSession::Init(const Context *context, const TrainCfg *train_cfg) {
}
cfg_ = *train_cfg;
}
allocator_ = context->allocator;
return lite::LiteSession::Init(context);
}
@ -159,6 +162,51 @@ int TrainSession::InitCallBack() {
return RET_OK;
}
// Plans a static memory layout for the output tensors of `kernels` and binds every planned
// tensor to its slot inside one session-owned buffer (tensors_data_).
//
// Does nothing and returns RET_OK unless allocator_ is a static allocator. The kernels are
// walked in order: each output tensor gets an offset from an OptAllocator, and an offset
// is released once the VAR input that owns it has been consumed init_ref_count() times, so
// later tensors can reuse the space.
//
// Returns RET_OK on success and RET_ERROR when the buffer cannot be allocated.
int TrainSession::AllocTensors(const std::vector<kernel::LiteKernel *> &kernels) {
  if (!IS_STATIC_ALLOCATOR(allocator_)) {
    return RET_OK;
  }
  OptAllocator planner;
  std::unordered_map<lite::Tensor *, int> ref_count;
  std::unordered_map<lite::Tensor *, size_t> offset_map;
  for (auto kernel : kernels) {
    for (auto tensor : kernel->out_tensors()) {
      offset_map[tensor] = planner.Malloc(tensor->Size());
      ref_count[tensor] = tensor->init_ref_count();
    }
    for (auto tensor : kernel->in_tensors()) {
      if (tensor->category() != lite::Tensor::VAR) {
        continue;
      }
      // Only tensors produced earlier in this kernel list own a planned slot. Looking them
      // up with find() avoids inserting bogus entries (offset 0) for other tensors.
      const auto planned = offset_map.find(tensor);
      if (planned == offset_map.end()) {
        continue;
      }
      if (--ref_count[tensor] == 0) {
        planner.Free(planned->second);
      }
    }
  }
  // Set Tensor data
  if (tensors_data_ == nullptr) {
    auto size = planner.total_size();
    auto buf = malloc(size);
    if (buf == nullptr) {
      MS_LOG(ERROR) << "cannot allocate buffer size" << size;
      return RET_ERROR;
    }
    StaticAllocator *alloc = reinterpret_cast<StaticAllocator *>(allocator_.get());
    alloc->SetContex(buf, size);
    tensors_data_ = buf;
  }
  // NOTE(review): the buffer is sized by the first plan only. If a later call produces a
  // larger plan, the offsets below can point past the end of tensors_data_. Fixing that
  // requires remembering the allocated size in the session - TODO confirm and add.
  //
  // Bind exactly the tensors planned above (the outputs of `kernels`), not the outputs of
  // some other kernel list.
  char *base = static_cast<char *>(tensors_data_);
  for (const auto &planned : offset_map) {
    planned.first->set_data(base + planned.second);
  }
  return RET_OK;
}
int TrainSession::CompileGraph(lite::Model *model) { return lite::RET_ERROR; }
int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) {
@ -194,10 +242,21 @@ int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) {
MS_LOG(ERROR) << "failed to allocate space";
return RET_ERROR;
}
ret = AllocTensors(train_kernels_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "failed to allocate space";
return RET_ERROR;
}
return RET_OK;
}
TrainSession::~TrainSession() { FreeWorkSpace(); }
// Releases the workspace first, then the buffer referenced by tensors_data_.
TrainSession::~TrainSession() {
  FreeWorkSpace();
  // free() is a no-op for a null pointer, so no explicit null check is needed.
  free(tensors_data_);
  tensors_data_ = nullptr;
}
int TrainSession::ExecKernels(const KernelCallBack &before, const KernelCallBack &after,
const std::vector<kernel::LiteKernel *> &run_kernels) {
@ -420,6 +479,12 @@ int TrainSession::Train() {
lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1);
}
}
// allocate tensors
auto ret = AllocTensors(train_kernels_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "failed to allocate tensor space";
return RET_ERROR;
}
return RET_OK;
}
@ -446,6 +511,11 @@ int TrainSession::Eval() {
lite_tensor->set_init_ref_count(lite_tensor->init_ref_count() + 1);
}
}
auto ret = AllocTensors(inference_kernels_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "failed to allocate space";
return RET_ERROR;
}
return RET_OK;
}
@ -781,7 +851,12 @@ session::LiteSession *session::TrainSession::CreateTrainSession(const std::strin
MS_LOG(ERROR) << "create session failed";
return nullptr;
}
if (context->allocator == nullptr) {
const_cast<lite::Context *>(context)->allocator = std::shared_ptr<Allocator>(new (std::nothrow) StaticAllocator());
if (context->allocator == nullptr) {
MS_LOG(ERROR) << " cannot convert to static allocation";
}
}
auto ret = session->Init(context, cfg);
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << "init session failed";

View File

@ -147,6 +147,7 @@ class TrainSession : virtual public lite::LiteSession {
void FreeRestoreTensors();
bool AllInputsNeedScale(kernel::LiteKernel *kernel);
void FreeWorkSpace();
int AllocTensors(const std::vector<kernel::LiteKernel *> &kernels);
std::map<Tensor *, Tensor *> restored_origin_tensors_;
int virtual_batch_idx_ = 0;
@ -155,6 +156,8 @@ class TrainSession : virtual public lite::LiteSession {
void *workspace_ = nullptr;
SchedCallBack sched_mix_precision_callback_;
bool train_mode_ = false;
void *tensors_data_ = nullptr;
std::shared_ptr<Allocator> allocator_;
};
} // namespace lite

View File

@ -603,7 +603,7 @@ int NetTrain::InitCallbackParameter() {
}
op_call_times_total_++;
op_begin_ = GetTimeUs();
if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign")) {
if ((callParam.node_type == "Adam") || (callParam.node_type == "Assign") || callParam.node_type == "SGD") {
for (auto tensor : before_outputs) {
std::fill(reinterpret_cast<int8_t *>(tensor->MutableData()),
reinterpret_cast<int8_t *>(tensor->MutableData()) + tensor->Size(), 0);