memory reuse code cleanup

LaiYongqiang 2021-06-18 09:41:55 +08:00
parent 8301489439
commit d4d6fb940d
13 changed files with 65 additions and 953 deletions

View File

@ -55,6 +55,7 @@ endif()
if(DEBUG_MODE)
set(CMAKE_BUILD_TYPE "Debug")
add_compile_definitions(MEM_REUSE_DEBUG)
else()
set(CMAKE_BUILD_TYPE "Release")
endif()
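
For orientation: with this change a DEBUG_MODE build (for example configuring with -DDEBUG_MODE=ON, assuming cmake is invoked directly rather than through a wrapper script) gets both a Debug build type and the MEM_REUSE_DEBUG compile definition that gates the MemReuseChecker instrumentation appearing in the hunks below. A minimal, self-contained sketch of that gating pattern (illustrative only, not MindSpore code):

#include <iostream>

void AssignOffsets() {
  // ... normal allocation work ...
#ifdef MEM_REUSE_DEBUG
  // compiled in only when the MEM_REUSE_DEBUG definition above is present
  std::cout << "MEM_REUSE_DEBUG build: dumping membuf info" << std::endl;
#endif
}

int main() {
  AssignOffsets();
  return 0;
}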

View File

@ -111,6 +111,26 @@ class MemReuseUtil {
std::unordered_map<AnfNodePtr, session::KernelWithIndex> visit_kernel_with_return_type_in0pos_skip_nop_cache_;
};
using MemReuseUtilPtr = std::shared_ptr<MemReuseUtil>;
enum Status { kUnused, kReused };
enum MemType { kNew, kInStreamReuse, kBetweenStreamReuse, kKernelDependenceReuse };
class Membuf {
public:
Membuf() = default;
Membuf(Status status, size_t size, size_t offset, int index, MemType type, const KernelDefPtr &used_kernel)
: status_(status), size_(size), offset_(offset), index_(index), type_(type), used_kernel_(used_kernel) {}
~Membuf() = default;
// Memory block status flags
Status status_ = kUnused;
size_t size_{0};
size_t offset_{0};
// Index of the tensor stored in this memory block at a certain moment
int index_{0};
MemType type_{kNew};
KernelDefPtr used_kernel_;
};
using MembufPtr = std::shared_ptr<Membuf>;
} // namespace memreuse
} // namespace mindspore
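
The Membuf record moved into mem_reuse.h above is plain bookkeeping for one block of the reused region: status_ flips between kUnused and kReused, offset_/size_ describe where the block sits, index_ remembers which tensor currently occupies it, and type_ records how the reuse was justified. A small self-contained sketch with simplified types (the kernel pointer replaced by a string so it compiles on its own; not the MindSpore classes):

#include <cstddef>
#include <cstdio>
#include <vector>

enum Status { kUnused, kReused };
enum MemType { kNew, kInStreamReuse, kBetweenStreamReuse, kKernelDependenceReuse };

// Simplified stand-in for Membuf; offsets are bytes from the base of the reused region.
struct MembufLite {
  Status status;
  size_t size;
  size_t offset;
  int index;  // tensor index currently stored in this block
  MemType type;
  const char *used_kernel;
};

int main() {
  std::vector<MembufLite> bufs;
  // a freshly appended block starts right after the previous one (cf. AddNewMembufPtr)
  bufs.push_back({kReused, 1024, 0, 0, kNew, "kernel_a"});
  bufs.push_back({kReused, 512, bufs.back().offset + bufs.back().size, 1, kNew, "kernel_b"});
  // releasing a block only flips its status; offset and size stay for later reuse
  bufs[0].status = kUnused;
  for (const auto &b : bufs) {
    std::printf("offset=%zu size=%zu status=%d index=%d\n", b.offset, b.size, static_cast<int>(b.status), b.index);
  }
  return 0;
}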

View File

@ -1,536 +0,0 @@
/**
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"
#include "backend/optimizer/mem_reuse/mem_reuse_checker.h"
#ifdef ENABLE_D
#include "runtime/device/ascend/ascend_stream_assign.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace memreuse {
void BestFitMemReuse::InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr) {
MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
set_tensor_ptr_list(mem_reuse_util_ptr->total_refs_list());
set_workspace_ptr_list(mem_reuse_util_ptr->total_wk_ref_list());
set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list());
// check info Correctness
for (auto &tensor : tensor_ptr_list_) {
tensor->size_ = AlignCommonMemorySize(tensor->size_);
}
// align workspace size to 512 and set refcount to 1
for (auto &wk : wk_tensor_list_) {
wk->size_ = AlignCommonMemorySize(wk->size_);
wk->ref_count_ = 1;
}
#ifdef ENABLE_D
stream_groups_ = device::ascend::AscendStreamAssign::GetInstance().get_stream_group();
#endif
}
void BestFitMemReuse::InitKernelDependence() {
for (const auto &kernel : op_ptr_list_) {
std::set<KernelDefPtr> front;
std::queue<KernelDefPtr> to_visit;
to_visit.push(kernel);
// find all kernels before current kernel
while (!to_visit.empty()) {
auto curr = to_visit.front();
to_visit.pop();
if (front.count(curr)) {
continue;
}
front.insert(curr);
auto iter = kernel_front_map_.find(curr);
if (iter != kernel_front_map_.end()) {
auto visited_front = iter->second;
front.insert(visited_front.begin(), visited_front.end());
continue;
}
for (const auto &input : curr->input_kernels()) {
to_visit.push(input);
}
}
kernel_front_map_[kernel] = front;
}
}
bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr &mem_buf) {
// determine whether the kernel_curr can reuse kernel_prev's output tensor membuf
MS_EXCEPTION_IF_NULL(kernel_curr);
MS_EXCEPTION_IF_NULL(mem_buf);
auto kernel_prev = mem_buf->used_kernel_;
MS_EXCEPTION_IF_NULL(kernel_prev);
#ifdef ENABLE_DEBUGGER
auto debugger_ = mindspore::Debugger::GetInstance();
if (debugger_->DebuggerBackendEnabled()) {
std::string current_kernel_name = kernel_curr->scope_full_name();
if (debugger_->DebugServicesIsWatchPoint(current_kernel_name)) {
return false;
}
}
#endif
auto curr_stream_id = kernel_curr->stream_id();
auto prev_stream_id = kernel_prev->stream_id();
if (curr_stream_id == prev_stream_id) {
mem_buf->type_ = kInStreamReuse;
return true;
}
bool reuse_between_streams = true;
for (auto &stream_group : stream_groups_) {
size_t cur_index = UINT32_MAX;
size_t prev_index = UINT32_MAX;
for (size_t index = 0; index < stream_group.size(); index++) {
if (curr_stream_id == stream_group[index]) {
cur_index = index;
continue;
}
if (prev_stream_id == stream_group[index]) {
prev_index = index;
continue;
}
}
if ((prev_index != UINT32_MAX) && (cur_index == UINT32_MAX || (prev_index > cur_index))) {
// can't reuse if the previous stream and the current stream are not in the same group,
// or if the previous stream is behind the current stream
reuse_between_streams = false;
break;
}
}
if (reuse_between_streams) {
mem_buf->type_ = kBetweenStreamReuse;
return true;
}
auto iter = kernel_front_map_.find(kernel_curr);
if (iter == kernel_front_map_.end()) {
MS_LOG(EXCEPTION) << kernel_curr->scope_full_name() << " is not init.";
}
auto kernel_curr_front = iter->second;
auto depend_count = kernel_curr_front.count(kernel_prev);
if (depend_count) {
mem_buf->type_ = kKernelDependenceReuse;
return true;
}
return false;
}
void BestFitMemReuse::AssignCommonNodeOutputOffset() {
MS_EXCEPTION_IF_NULL(current_kernel_);
for (const auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (tensor_desc->type_ == kRefNodeInput) {
total_refinput_size += tensor_desc->size_;
} else if (tensor_desc->type_ == kRefNodeOutput) {
total_refoutput_size += tensor_desc->size_;
// no need to alloc refnode output's memory
continue;
} else if (tensor_desc->type_ == kCommNotReuse) {
total_comm_not_reuse_size += tensor_desc->size_;
} else if (tensor_desc->type_ == kCommReuse) {
// get the aligned size for a communication op's single input
tensor_desc->size_ = AlignCommunicationMemorySize(tensor_desc->size_);
total_comm_reuse_size += tensor_desc->size_;
}
auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_);
if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second;
// find the most suitable membuf in the membuf list and reuse it
ReuseExistMembuf(tensor_desc.get(), membuf_index, kDynamicMem);
} else {
// no membuf can be reused, append a new membuf to membuf_ptr_list
AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
}
// skip the left align border so the communication op's single input starts at the data region
if (tensor_desc->type_ == kCommReuse) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
}
}
void BestFitMemReuse::AssignCommunicationNodeOutputOffset() {
size_t total_kernel_output_size = 0;
// get all output size
MS_EXCEPTION_IF_NULL(current_kernel_);
for (const auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (tensor_desc->type_ == kCommReuse) {
total_comm_reuse_size += tensor_desc->size_;
total_comm_output_reuse_size += tensor_desc->size_;
total_kernel_output_size += tensor_desc->size_;
} else {
MS_LOG(ERROR) << "All communication op's outputs should be memory reuse, Kernel:"
<< current_kernel_->scope_full_name() << " output index:" << tensor_idx
<< " tensor_type:" << tensor_desc->type_;
continue;
}
}
total_kernel_output_size = AlignCommunicationMemorySize(total_kernel_output_size);
// add a left align border to the first output and a right align border to the last output so the border memory is allocated
size_t output_index = 0;
auto output_ref_indexes = current_kernel_->GetOutputRefIndexs();
for (const auto &out_index : output_ref_indexes) {
size_t index = GetTensorIndex(out_index);
auto descption = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(descption);
if (output_index == 0) {
descption->size_ += kDefaultMemAlignSize;
}
if ((output_index == 0) && (output_ref_indexes.size() == 1)) {
// add right align border for single output
descption->size_ += kDefaultMemAlignSize;
}
output_index++;
}
auto reusable_membuf_map = GetReusableMembufMap(total_kernel_output_size);
if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second;
output_index = 0;
for (const auto &idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(idx);
auto desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(desc);
ReuseExistMembuf(desc.get(), membuf_index + output_index, kDynamicMem);
// skip the left align border so the communication op's first output starts at the data region
if (output_index == 0) {
desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
}
} else {
// no membuf can be reused, append a new membuf to membuf_ptr_list
output_index = 0;
for (const auto &tensor_index : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_index);
auto desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(desc);
AddNewMembufPtr(desc.get(), kDynamicMem);
// skip the align border offset so the first output starts at the data region
if (output_index == 0) {
desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
}
}
}
void BestFitMemReuse::AssignNodeOutputOffset() {
if (current_kernel_->type_ == kCommunicationNode) {
AssignCommunicationNodeOutputOffset();
} else {
AssignCommonNodeOutputOffset();
}
}
void BestFitMemReuse::AssignNodeWorkspaceOffset() {
for (auto &wk_idx : current_kernel_->GetWorkspaceRefIndexs()) {
size_t index = GetWorkspaceIndex(wk_idx);
auto wk_ref = wk_tensor_list_[index];
MS_EXCEPTION_IF_NULL(wk_ref);
auto re_wk_membuf_map = GetReusableMembufMap(wk_ref->size_);
if (!re_wk_membuf_map.empty()) {
auto membuf_index = re_wk_membuf_map.begin()->second;
ReuseExistMembuf(wk_ref.get(), membuf_index, kWorkspaceMem);
} else {
AddNewMembufPtr(wk_ref.get(), kWorkspaceMem);
}
}
}
void BestFitMemReuse::ReuseExistMembuf(KernelRefCount *tensor_desc, size_t membuf_index, int flag) {
MS_EXCEPTION_IF_NULL(tensor_desc);
CheckMembufIndx(membuf_index);
auto membuf = membuf_ptr_list_[membuf_index];
MS_EXCEPTION_IF_NULL(membuf);
// split first, then update the membuf info
if (IsSplit(tensor_desc->size_, membuf->size_)) {
// split the membuf, and insert a new membuf after this membuf
SplitMembuf(tensor_desc, membuf_index);
}
// update membuf status, and set tensor offset
UpdateMembufInfo(tensor_desc, membuf.get(), flag);
}
std::map<size_t, size_t> BestFitMemReuse::GetReusableMembufMap(size_t tensor_size) {
std::map<size_t, size_t> size_map;
for (size_t i = 0; i < membuf_ptr_list_.size(); ++i) {
auto membuf = membuf_ptr_list_[i];
auto index = i;
bool is_membuf_ok = membuf->status_ == kUnused && membuf->size_ >= tensor_size;
if (is_membuf_ok && IsUsable(current_kernel_, membuf)) {
(void)size_map.insert(std::make_pair(membuf->size_, index));
break;
}
}
return size_map;
}
void BestFitMemReuse::UpdateMembufInfo(KernelRefCount *tensor_desc, Membuf *membuf, int flag) {
MS_EXCEPTION_IF_NULL(tensor_desc);
MS_EXCEPTION_IF_NULL(membuf);
auto real_index = GetRealIndex(IntToSize(tensor_desc->index_), flag);
membuf->status_ = kReused;
membuf->index_ = real_index;
membuf->used_kernel_ = current_kernel_;
tensor_desc->offset_ = membuf->offset_;
}
bool BestFitMemReuse::IsSplit(size_t tensor_size, size_t membuf_size) const { return tensor_size < membuf_size; }
void BestFitMemReuse::SplitMembuf(const KernelRefCount *tensor_desc, size_t membuf_index) {
MS_EXCEPTION_IF_NULL(tensor_desc);
CheckMembufIndx(membuf_index);
auto membuf = membuf_ptr_list_[membuf_index];
MS_EXCEPTION_IF_NULL(membuf);
auto bias = membuf->size_ - tensor_desc->size_;
membuf->size_ = tensor_desc->size_;
// check whether the split membuf can be merged later
auto new_membuf = std::make_shared<Membuf>(kUnused, bias, membuf->offset_ + membuf->size_, kInvalidIndex,
membuf->type_, current_kernel_);
(void)membuf_ptr_list_.insert(membuf_ptr_list_.begin() + SizeToInt(membuf_index + 1), new_membuf);
}
void BestFitMemReuse::AddNewMembufPtr(KernelRefCount *tensor_desc, int flag) {
MS_EXCEPTION_IF_NULL(tensor_desc);
size_t membuf_offset = 0;
if (!membuf_ptr_list_.empty()) {
membuf_offset = membuf_ptr_list_.back()->offset_ + membuf_ptr_list_.back()->size_;
}
auto membuf_size = tensor_desc->size_;
auto real_index = GetRealIndex(IntToSize(tensor_desc->index_), flag);
auto membuf = std::make_shared<Membuf>(kReused, membuf_size, membuf_offset, real_index, kNew, current_kernel_);
membuf_ptr_list_.push_back(membuf);
tensor_desc->offset_ = membuf_offset;
}
void BestFitMemReuse::UpdateNodeInputAndMembuf() {
// process node input tensor
for (const auto &tensor_idx : current_kernel_->GetInputRefIndexs()) {
size_t tensor_index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[tensor_index];
MS_EXCEPTION_IF_NULL(tensor_desc);
tensor_desc->ref_count_--;
if (tensor_desc->ref_count_ == 0) {
ReleaseMembuf(tensor_index, kDynamicMem);
} else if (tensor_desc->ref_count_ < 0) {
MS_LOG(EXCEPTION) << "tensor: " << tensor_desc->index_ << " refcount: " << tensor_desc->ref_count_
<< " check error";
}
}
}
void BestFitMemReuse::ReleaseNodeUnusedOutput() {
for (const auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t tensor_index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[tensor_index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (tensor_desc->ref_count_ == 0) {
ReleaseMembuf(tensor_index, kDynamicMem);
} else if (tensor_desc->ref_count_ < 0) {
MS_LOG(EXCEPTION) << "tensor: " << tensor_desc->index_ << " refcount: " << tensor_desc->ref_count_
<< " check error";
}
}
}
void BestFitMemReuse::ReleasePreNodeWorkspace(const KernelDef *kernel_def_ptr) {
for (auto &workspace_index : kernel_def_ptr->GetWorkspaceRefIndexs()) {
size_t index = GetWorkspaceIndex(workspace_index);
auto wk_tensor = wk_tensor_list_[index];
wk_tensor->ref_count_--;
if (wk_tensor->ref_count_ == 0) {
ReleaseMembuf(index, kWorkspaceMem);
} else if (wk_tensor->ref_count_ < 0) {
MS_LOG(EXCEPTION) << "tensor: " << wk_tensor->index_ << " refcount: " << wk_tensor->ref_count_ << " check error";
}
}
}
void BestFitMemReuse::ReleaseMembuf(size_t tensor_index, int flag) {
if (membuf_ptr_list_.empty()) {
return;
}
auto real_index = GetRealIndex(tensor_index, flag);
auto membuf_iter = std::find_if(membuf_ptr_list_.begin(), membuf_ptr_list_.end(),
[real_index](const MembufPtr &membuf) { return membuf->index_ == real_index; });
if (membuf_iter == membuf_ptr_list_.end()) {
return;
}
auto membuf = (*membuf_iter);
MS_EXCEPTION_IF_NULL(membuf);
membuf->status_ = kUnused;
if (membuf_iter != membuf_ptr_list_.end() - 1) {
auto next_iter = membuf_iter + 1;
auto membuf_next = (*next_iter);
MS_EXCEPTION_IF_NULL(membuf_next);
if (membuf_next->status_ == kUnused) {
bool is_merge = IsUsable(current_kernel_, membuf_next);
if (is_merge) {
membuf->size_ += membuf_next->size_;
(void)membuf_ptr_list_.erase(next_iter);
}
}
}
if (membuf_iter != membuf_ptr_list_.begin()) {
auto prev_iter = membuf_iter - 1;
auto membuf_prev = (*prev_iter);
MS_EXCEPTION_IF_NULL(membuf_prev);
if (membuf_prev->status_ == kUnused) {
bool is_merge = IsUsable(current_kernel_, membuf_prev);
if (is_merge) {
membuf->size_ += membuf_prev->size_;
membuf->offset_ = membuf_prev->offset_;
(void)membuf_ptr_list_.erase(prev_iter);
}
}
}
}
size_t BestFitMemReuse::AlignCommonMemorySize(size_t size) const {
// align memory size to 512 bytes
return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}
size_t BestFitMemReuse::AlignCommunicationMemorySize(size_t size) const {
// align memory size to 512 bytes and add communication memory: left align border - data - right align border
return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
kDefaultMemAlignSize;
}
size_t BestFitMemReuse::GetAllocatedSize() {
size_t AllocatedSize = kTotalSize;
if (membuf_ptr_list_.empty()) {
return AllocatedSize;
}
AllocatedSize = membuf_ptr_list_.back()->offset_ + membuf_ptr_list_.back()->size_;
MS_LOG(INFO) << "MemReuse Allocated Dynamic Size: " << AllocatedSize;
return AllocatedSize;
}
bool BestFitMemReuse::IsRelease() {
// unable_used_node contains the node types whose output tensors cannot be released,
// even if their refcount is equal to zero.
std::unordered_set<std::string> unable_used_node = {
prim::kPrimBatchNorm->name(),
prim::kPrimBatchNormGrad->name(),
};
return unable_used_node.find(current_kernel_->kernel_name()) == unable_used_node.end();
}
size_t BestFitMemReuse::GetTensorIndex(int index) const {
if (index < 0 || IntToSize(index) >= tensor_ptr_list_.size()) {
MS_LOG(WARNING) << "current cnode: " << current_kernel_->scope_full_name();
MS_LOG(EXCEPTION) << "invalid tensor index";
}
return IntToSize(index);
}
size_t BestFitMemReuse::GetWorkspaceIndex(int index) const {
if (index < 0 || IntToSize(index) >= wk_tensor_list_.size()) {
MS_LOG(WARNING) << "current cnode: " << current_kernel_->scope_full_name();
MS_LOG(EXCEPTION) << "invalid tensor index";
}
return IntToSize(index);
}
int BestFitMemReuse::GetRealIndex(size_t index, int flag) const {
if (flag == kDynamicMem) {
return SizeToInt(index);
} else if (flag == kWorkspaceMem) {
return kWorkspaceIndexFactor * SizeToInt(index + 1);
} else {
MS_LOG(EXCEPTION) << "flag " << flag << " is invalid";
}
}
void BestFitMemReuse::CheckMembufIndx(size_t membuf_index) const {
if (membuf_index >= membuf_ptr_list_.size()) {
MS_LOG(WARNING) << "current cnode: " << current_kernel_->scope_full_name();
MS_LOG(EXCEPTION) << "invalid membuf index: " << membuf_index << ", real size: " << membuf_ptr_list_.size();
}
}
void BestFitMemReuse::Reuse(const MemReuseUtil *mem_reuse_util_ptr) {
MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
InitMemReuseInfo(mem_reuse_util_ptr);
InitKernelDependence();
KernelDefPtr pre_op = nullptr;
#ifdef MEM_REUSE_DEBUG
size_t op_num = 0;
#endif
for (const auto &op_def_ptr : op_ptr_list_) {
current_kernel_ = op_def_ptr;
// release pre_op_def
if (pre_op != nullptr) {
ReleasePreNodeWorkspace(pre_op.get());
}
MemReuseChecker::GetInstance().IsAddNewMembuf_ = false;
// process node output tensor
AssignNodeOutputOffset();
#ifdef MEM_REUSE_DEBUG
if (MemReuseChecker::GetInstance().IsAddNewMembuf_) {
MemReuseChecker::GetInstance().SetAddNewMembuInfos(op_def_ptr.get(), membuf_ptr_list_, op_num);
}
#endif
// deal with the current op's workspace
AssignNodeWorkspaceOffset();
pre_op = op_def_ptr;
// update node input tensor refcount, and membuf list status
UpdateNodeInputAndMembuf();
// release node output tensors whose refcount has dropped to zero
if (IsRelease()) {
ReleaseNodeUnusedOutput();
}
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().SetMembuInfos(op_def_ptr.get(), membuf_ptr_list_);
++op_num;
#endif
}
MS_LOG(INFO) << "Special Tensor total size: RefInput: " << total_refinput_size
<< " RefOutput: " << total_refoutput_size << " CommReuse: " << total_comm_reuse_size
<< " CommOutputReuse: " << total_comm_output_reuse_size
<< " CommNotReuse: " << total_comm_not_reuse_size;
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().ExportMembufInfoIR();
MemReuseChecker::GetInstance().ExportAddNewMmebufIR();
MemReuseChecker::GetInstance().set_kernel_front_map(kernel_front_map_);
MemReuseChecker::GetInstance().ExportKernelDependence();
#endif
}
} // namespace memreuse
} // namespace mindspore
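
Since the whole best-fit allocator is deleted here in favour of somas, a short reminder of what its core loop did may help when reading the hunk above: blocks are kept back-to-back in offset order; assigning a tensor either reuses (and splits) the first fitting unused block (SplitMembuf) or appends a new block at the end (AddNewMembufPtr), and releasing a tensor merges the freed block with unused neighbours (ReleaseMembuf). A self-contained sketch of that split/merge bookkeeping, simplified (no stream groups, no kernel-dependence checks, plain structs instead of the MindSpore types):

#include <cassert>
#include <cstddef>
#include <vector>

enum Status { kUnused, kReused };

struct Block {
  Status status;
  size_t size;
  size_t offset;
};

struct BlockList {
  std::vector<Block> blocks;  // kept ordered by offset, back-to-back

  // Reuse the first unused block that fits (splitting off the surplus),
  // or append a new block at the current end of the region.
  size_t Assign(size_t size) {
    for (size_t i = 0; i < blocks.size(); ++i) {
      Block &b = blocks[i];
      if (b.status == kUnused && b.size >= size) {
        size_t surplus = b.size - size;
        size_t off = b.offset;
        b.size = size;
        b.status = kReused;
        if (surplus > 0) {  // split: the surplus becomes a new unused block right after
          blocks.insert(blocks.begin() + i + 1, Block{kUnused, surplus, off + size});
        }
        return off;
      }
    }
    size_t offset = blocks.empty() ? 0 : blocks.back().offset + blocks.back().size;
    blocks.push_back({kReused, size, offset});
    return offset;
  }

  // Release the block at `offset` and merge it with adjacent unused blocks.
  void Release(size_t offset) {
    for (size_t i = 0; i < blocks.size(); ++i) {
      if (blocks[i].offset != offset) continue;
      blocks[i].status = kUnused;
      if (i + 1 < blocks.size() && blocks[i + 1].status == kUnused) {
        blocks[i].size += blocks[i + 1].size;
        blocks.erase(blocks.begin() + i + 1);
      }
      if (i > 0 && blocks[i - 1].status == kUnused) {
        blocks[i - 1].size += blocks[i].size;
        blocks.erase(blocks.begin() + i);
      }
      return;
    }
  }

  size_t TotalSize() const {
    return blocks.empty() ? 0 : blocks.back().offset + blocks.back().size;
  }
};

int main() {
  BlockList list;
  size_t a = list.Assign(2048);  // new block at offset 0
  size_t b = list.Assign(1024);  // new block at offset 2048
  list.Release(a);               // offset 0 becomes reusable
  size_t c = list.Assign(512);   // reuses and splits the released 2048-byte block
  assert(a == 0 && b == 2048 && c == 0);
  assert(list.TotalSize() == 3072);  // no growth: the released block was reused
  return 0;
}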

View File

@ -1,174 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_REUSE_ALLOCATOR_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_REUSE_ALLOCATOR_H_
#include <cmath>
#include <map>
#include <list>
#include <memory>
#include <vector>
#include <numeric>
#include <algorithm>
#include <utility>
#include <fstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <set>
#include <queue>
#include "backend/optimizer/mem_reuse/kernel_refcount.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"
namespace mindspore {
namespace memreuse {
static constexpr int kWorkspaceIndexFactor = -1000;
static constexpr int kDynamicMem = -1;
static constexpr int kWorkspaceMem = 1;
static constexpr size_t kTotalSize = 0;
enum Status { kUnused, kReused };
enum MemType { kNew, kInStreamReuse, kBetweenStreamReuse, kKernelDependenceReuse };
class Membuf {
public:
Membuf() = default;
Membuf(Status status, size_t size, size_t offset, int index, MemType type, const KernelDefPtr &used_kernel)
: status_(status), size_(size), offset_(offset), index_(index), type_(type), used_kernel_(used_kernel) {}
~Membuf() = default;
// Memory block status flags
Status status_ = kUnused;
size_t size_{0};
size_t offset_{0};
// Index of the tensor stored in this memory block at a certain moment
int index_{0};
MemType type_{kNew};
KernelDefPtr used_kernel_;
};
using MembufPtr = std::shared_ptr<Membuf>;
class BestFitMemReuse {
public:
BestFitMemReuse() = default;
~BestFitMemReuse() { membuf_ptr_list_.clear(); }
/**
* Init all information needed by memory reuse
* @param mem_reuse_util_ptr, initialize in the memreuse.cc
*/
void InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr);
void CheckMembufIndx(size_t check_idx) const;
void AssignNodeWorkspaceOffset();
void ReleasePreNodeWorkspace(const KernelDef *kernel_def_ptr);
/**
* Assign output tensor memory offset of current kernel
*/
void AssignNodeOutputOffset();
/**
* Assign output tensor memory offset of common kernel
*/
void AssignCommonNodeOutputOffset();
/**
* Assign output tensor memory offset of communication kernel
*/
void AssignCommunicationNodeOutputOffset();
/**
* Update input tensor's status of current kernel, and the status of membuf used by current kernel
*/
void UpdateNodeInputAndMembuf();
/**
* Check whether to release the kernel output tensors whose refcount is equal to zero
*/
void ReleaseNodeUnusedOutput();
/**
* Reuse an existing membuf if possible
* @param tensor_desc, the output tensor of current kernel
* @param membuf_index, the index of membuf to be reused
* @param flag, distinguish dynamic memory and workspace
*/
void ReuseExistMembuf(KernelRefCount *tensor_desc, size_t membuf_index, int flag);
/**
* Get the membuf that can be reused
* @param tensor_size, the size of the tensor ready to assign memory offset
* @return membuf map, key: the membuf size, value: the membuf index
*/
std::map<size_t, size_t> GetReusableMembufMap(size_t tensor_size);
/**
* Update the status of the reused memory block
* @param tensor_desc, the tensor ready to assign memory
* @param membuf, the membuf to be reused
* @param flag, distinguish dynamic memory and workspace
*/
void UpdateMembufInfo(KernelRefCount *tensor_desc, Membuf *membuf, int flag);
// If the size of the memory block is greater than the size of the tensor, split the extra memory
void SplitMembuf(const KernelRefCount *tensor_desc, size_t membuf_index);
// Determine if the memory block needs to be split
bool IsSplit(size_t tensor_size, size_t membuf_size) const;
// If there is no memory block that can be reused, add a new memory block at the end
void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag);
// Release a membuf and merge adjacent unused membufs
void ReleaseMembuf(size_t tensor_index, int flag);
// Memory address alignment for common memory
size_t AlignCommonMemorySize(size_t size) const;
// Memory address alignment for communication used memory
size_t AlignCommunicationMemorySize(size_t size) const;
int GetRealIndex(size_t index, int flag = kDynamicMem) const;
size_t GetTensorIndex(int index) const;
size_t GetWorkspaceIndex(int index) const;
// Memory reuse main program entry
void Reuse(const MemReuseUtil *mem_reuse_util_ptr);
// Get the total memory size that eventually needs to be allocated
size_t GetAllocatedSize();
// return false, when the node output cannot be released
bool IsRelease();
/**
* determine if the kernel_curr can reuse the output tensor address of kernel_prev
* @param kernel_curr, current kernel
* @param mem_buf, the membuf
* @return bool
*/
bool IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr &mem_buf);
/**
* init the dependence of all kernels in the graph
*/
void InitKernelDependence();
// set tensor_def and op_def
void set_tensor_ptr_list(const std::vector<KernelRefCountPtr> &tensor_ptr_list) {
tensor_ptr_list_ = tensor_ptr_list;
}
void set_workspace_ptr_list(const std::vector<KernelRefCountPtr> &workspace_ptr_list) {
wk_tensor_list_ = workspace_ptr_list;
}
void set_op_ptr_list(const std::vector<KernelDefPtr> &op_ptr_list) { op_ptr_list_ = op_ptr_list; }
private:
KernelDefPtr current_kernel_;
// Save all tensor information
std::vector<KernelRefCountPtr> tensor_ptr_list_;
std::vector<KernelRefCountPtr> wk_tensor_list_;
// Save all op information, including input and output tensor index
std::vector<KernelDefPtr> op_ptr_list_;
// Memory block information sequence, temporary variables
std::vector<MembufPtr> membuf_ptr_list_;
// kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def
std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_;
std::vector<std::vector<uint32_t>> stream_groups_;
size_t total_refinput_size{0};
size_t total_refoutput_size{0};
size_t total_comm_reuse_size{0};
size_t total_comm_output_reuse_size{0};
size_t total_comm_not_reuse_size{0};
};
} // namespace memreuse
} // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_REUSE_ALLOCATOR_H_
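
The two alignment helpers declared above (and defined in the deleted .cc) are plain 512-byte rounding, with an extra left/right border added for communication memory. A worked sketch of the arithmetic: kDefaultMemAlignSize is taken as 512 (matching kMemAlignSize in memory_manager.h later in this commit), while kAttAlignSize = 512 is only an assumption, since its definition is not part of this diff — the deleted unit test's expectation AlignCommonMemorySize(510) == 1024 is consistent with it:

#include <cassert>
#include <cstddef>

constexpr size_t kDefaultMemAlignSize = 512;
constexpr size_t kAttAlignSize = 512;  // assumed value, not shown in this diff

size_t AlignCommonMemorySize(size_t size) {
  // round up to a 512-byte boundary with extra padding
  return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}

size_t AlignCommunicationMemorySize(size_t size) {
  // left align border + 512-aligned data + right align border
  return kDefaultMemAlignSize +
         (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
         kDefaultMemAlignSize;
}

int main() {
  assert(AlignCommonMemorySize(510) == 1024);          // matches the deleted UT expectation
  assert(AlignCommunicationMemorySize(510) == 1536);   // 512 border + 512 data + 512 border
  assert(AlignCommunicationMemorySize(1000) == 2048);  // 512 border + 1024 data + 512 border
  return 0;
}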

View File

@ -26,7 +26,6 @@
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"
#include "backend/kernel_compiler/common_utils.h"
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
namespace mindspore {
namespace memreuse {
constexpr auto kSplitC = '/';

View File

@ -47,6 +47,9 @@ void AscendMemoryManager::MallocDeviceMemory() {
} else {
MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]";
}
} else {
MS_LOG(INFO) << "Call rtMalloc to allocate device memory Success, size : " << device_mem_size_
<< " bytes , address : " << reinterpret_cast<void *>(device_mem_base_);
}
AscendMemoryPool::GetInstance().Init(device_mem_base_, device_mem_size_, dynamic_mem_offset_);
}
@ -107,6 +110,12 @@ uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_me
} else {
align_size = GetCommonAlignSize(size);
}
auto device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset();
MS_LOG(INFO) << "Malloc Memory for Static: size[" << align_size << "], Memory statistics: total[" << device_mem_size_
<< "] dynamic [" << total_dynamic_size_ << "] static [" << device_mem_size_ - device_mem_pool_offset
<< "], Pool statistics: pool total size [" << AscendMemoryPool::GetInstance().total_mem_statistics()
<< "] used [" << AscendMemoryPool::GetInstance().used_mem_statistics()
<< "] communication_mem:" << communication_mem;
if (MemoryProfiling::GetInstance().IsMemoryProfilingEnable() && graph_id != kInvalidGraphId) {
auto node = MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id);
@ -136,9 +145,9 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m
}
auto device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset();
MS_LOG(INFO) << "Malloc Memory: Dynamic, total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_
<< "] memory pool[" << device_mem_size_ - device_mem_pool_offset << "])"
<< " malloc [" << align_size << "] communication_mem: " << communication_mem;
MS_LOG(INFO) << "Malloc Memory for Dynamic: size[" << align_size << "], Memory statistics: total[" << device_mem_size_
<< "] dynamic[" << total_dynamic_size_ << "] static[" << device_mem_size_ - device_mem_pool_offset
<< "] communication_mem: " << communication_mem;
auto offset = dynamic_mem_offset_;
auto new_offset = dynamic_mem_offset_ + align_size;
if (new_offset >= device_mem_pool_offset) {

View File

@ -22,6 +22,7 @@
#include <string>
#include <unordered_map>
#include <vector>
#include <fstream>
#include "runtime/device/kernel_runtime.h"
#include "ir/anf.h"
#include "backend/kernel_compiler/ascend_kernel_mod.h"

View File

@ -29,11 +29,13 @@
#include "runtime/device/kernel_runtime.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"
namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapManagerPtr;
using mindspore::memreuse::MemReuseUtilPtr;
class GPUKernelRuntime : public KernelRuntime {
public:
GPUKernelRuntime() = default;

View File

@ -276,7 +276,7 @@ void KernelRuntime::RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value
void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(mem_manager_);
MS_LOG(INFO) << "AssignStaticMemoryInput start";
MS_LOG(INFO) << "AssignStaticMemoryInput start for graph " << graph->graph_id();
auto graph_inputs = graph->inputs();
auto graph_valid_input = graph->valid_inputs();
graph_inputs.insert(graph_inputs.end(), graph->child_graph_result().begin(), graph->child_graph_result().end());
@ -342,8 +342,8 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
#endif
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
MS_LOG(INFO) << "Malloc Input for graph " << graph->graph_id() << ", node: " << item->fullname_with_scope()
<< " index: " << index << " size: " << tensor_size;
MS_LOG(INFO) << "Assign Static Memory for Input node, size:" << tensor_size
<< " node:" << item->fullname_with_scope() << " index: " << index;
if (mem_manager_->MallocMem(kStaticMem, tensor_size, device_address, graph->graph_id()) == nullptr) {
MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << tensor_size;
}
@ -355,7 +355,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
void KernelRuntime::AssignStaticMemoryOutput(const session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "AssignStaticMemoryOutput start";
MS_LOG(INFO) << "AssignStaticMemoryOutput start for graph " << graph->graph_id();
auto nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
std::vector<session::KernelWithIndex> non_communication_op;
// Assign Communicate Op Memory firstly.
@ -500,12 +500,7 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(MemType type, const AnfNode
return;
}
if (type == kReuseDynamicMem) {
// reuse communication op's all outputs' memory
type = kReuseDynamicCommMem;
}
if (type == kReuseDynamicCommMem || type == kSomasReuseDynamicMem) {
if (type == kSomasReuseDynamicMem) {
bool not_reuse = KernelMemNotReuse(node);
if (not_reuse) {
type = kDynamicMem;
@ -588,7 +583,7 @@ void KernelRuntime::AssignCommunicationNodeInputMem(MemType type, const AnfNodeP
return;
}
if (type == kReuseDynamicMem || type == kSomasReuseDynamicMem) {
if (type == kSomasReuseDynamicMem) {
bool not_reuse = KernelMemNotReuse(node);
if (not_reuse) {
type = kDynamicMem;
@ -616,20 +611,8 @@ void KernelRuntime::AssignCommunicationNodeInputMem(MemType type, const AnfNodeP
void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, int index) {
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(mem_manager_);
if (AnfAlgo::IsGetNext(NOT_NULL(node)) && type == kReuseDynamicMem) {
MS_LOG(INFO) << "GetNext disable mem_reuse";
type = kDynamicMem;
}
if (node->isa<CNode>()) {
bool independent = AnfAlgo::IsIndependentNode(node->cast<CNodePtr>());
if (independent && (type == kReuseDynamicMem)) {
MS_LOG(INFO) << "Independent node " << node->fullname_with_scope() << " disable memory reuse";
type = kDynamicMem;
}
}
if (type == kReuseDynamicMem || type == kSomasReuseDynamicMem) {
if (type == kSomasReuseDynamicMem) {
bool not_reuse = KernelMemNotReuse(node);
if (not_reuse) {
type = kDynamicMem;
@ -652,6 +635,10 @@ void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, in
continue;
}
MS_LOG(DEBUG) << "Assign Node:" << node->fullname_with_scope() << " output memory size:" << output_sizes[i];
if (type == kStaticMem) {
MS_LOG(INFO) << "Assign Static Memory for Output node, size:" << output_sizes[i]
<< " node:" << node->fullname_with_scope();
}
std::string output_format = AnfAlgo::GetOutputFormat(node, i);
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
@ -699,8 +686,12 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER) &&
!mem_manager_->MallocMemFromMemPool(address, node_size)) {
MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << node_size;
} else if (mem_manager_->MallocMem(kStaticMem, node_size, address, graph_id) == nullptr) {
MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << node_size;
} else {
MS_LOG(INFO) << "Assign Static Memory for Value node, size:" << node_size
<< " node:" << value_node->fullname_with_scope();
if (mem_manager_->MallocMem(kStaticMem, node_size, address, graph_id) == nullptr) {
MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << node_size;
}
}
AnfAlgo::SetOutputAddr(address, output_idx, value_node.get());
if (!address->SyncHostToDevice(trans::GetRuntimePaddingShape(value_node, 0), tensor_size, tensor->data_type(),
@ -717,7 +708,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(mem_manager_);
MS_LOG(DEBUG) << "AssignStaticMemoryValueNode start";
MS_LOG(DEBUG) << "AssignStaticMemoryValueNode start for graph " << graph->graph_id();
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
// order the value nodes
@ -747,8 +738,13 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER) &&
!mem_manager_->MallocMemFromMemPool(address, tensor_size)) {
MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << tensor_size;
} else if (mem_manager_->MallocMem(kStaticMem, tensor_size, address, graph->graph_id()) == nullptr) {
MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << tensor_size;
} else {
MS_LOG(INFO) << "Assign Static Memory for Value node, size:" << tensor_size
<< " node:" << value_node->fullname_with_scope();
if (mem_manager_->MallocMem(kStaticMem, tensor_size, address, graph->graph_id()) == nullptr) {
MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem
<< ", tensor size is: " << tensor_size;
}
}
AnfAlgo::SetOutputAddr(address, 0, value_node.get());
ShapeVector shape = {1, SizeToLong(tensor_size)};
@ -776,13 +772,8 @@ void KernelRuntime::AssignDynamicMemory(session::KernelGraph *graph) {
if (is_enable_mem_reuse) {
MS_LOG(INFO) << "Memory Reuse is enable...";
#ifdef MEM_REUSE_DEBUG
mem_manager_->MallocReusedDynamicMem(graph);
mem_type = kReuseDynamicMem;
#else
mem_manager_->MallocSomasDynamicMem(graph);
mem_type = kSomasReuseDynamicMem;
#endif
} else {
MS_LOG(INFO) << "Memory Reuse is disable...";
}
@ -973,8 +964,8 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
MS_EXCEPTION_IF_NULL(kernel_mod);
// Skip transpose kernel with "nop_op" attr which is not hidden or removed in PyNative infer scenario. Transpose
// kernel, which is not supposed to be executed, is generated in TransDataSplit to support specific Transdata. And
// hard code here should be removed after new Transdata programme is implemented in the foreseeable future.
// kernel, which is not supposed to be executed, is generated in TransDataSplit to support specific Transdata.
// And hard code here should be removed after new Transdata programme is implemented in the foreseeable future.
if (AnfAlgo::HasNodeAttr("nop_op", kernel)) {
for (size_t idx = 0; idx < AnfAlgo::GetOutputTensorNum(kernel); idx += 1) {
auto real_input = AnfAlgo::GetRealInputIndex(kernel, idx);
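
With kReuseDynamicMem and kReuseDynamicCommMem removed, the memory-type selection left in this file reduces to: graph-level reuse now always means somas, and an individual kernel can still opt out via KernelMemNotReuse and fall back to plain dynamic memory. A self-contained sketch of that decision (the bool parameters stand in for the real context and node queries):

#include <cassert>

enum MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem };

MemType SelectDynamicMemType(bool mem_reuse_enabled, bool kernel_mem_not_reuse) {
  MemType type = mem_reuse_enabled ? kSomasReuseDynamicMem : kDynamicMem;
  if (type == kSomasReuseDynamicMem && kernel_mem_not_reuse) {
    type = kDynamicMem;  // per-node opt-out, as in AssignNodeOutputMem and friends
  }
  return type;
}

int main() {
  assert(SelectDynamicMemType(true, false) == kSomasReuseDynamicMem);
  assert(SelectDynamicMemType(true, true) == kDynamicMem);
  assert(SelectDynamicMemType(false, false) == kDynamicMem);
  return 0;
}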

View File

@ -22,9 +22,6 @@
#endif
#include "utils/ms_context.h"
using mindspore::memreuse::BestFitMemReuse;
using mindspore::memreuse::MemReuseUtilPtr;
namespace mindspore {
namespace device {
constexpr size_t kAlignBytes = 32;
@ -37,24 +34,6 @@ size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) const {
return (input_size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize + 2 * kMemAlignSize;
}
void MemoryManager::MallocReusedDynamicMem(const session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
// set all infos
mem_reuse_util_ptr->SetAllInfo(graph);
auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>();
MS_EXCEPTION_IF_NULL(bestfit_mem_reuse);
bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get());
size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize();
MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]";
mem_reuse_util_ptr_ = mem_reuse_util_ptr;
auto base_ptr = MallocDynamicMem(total_allocated_size, false);
MS_LOG(INFO) << "Reuse Memory from [" << reinterpret_cast<void *>(base_ptr) << "] to ["
<< reinterpret_cast<void *>(base_ptr + total_allocated_size) << "]";
mem_reuse_util_ptr_->set_mem_base(base_ptr);
}
void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
SomasPtr somas_reuse_util_ptr = std::make_shared<somas::Somas>();
@ -117,9 +96,6 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
if (communication_mem) {
address->communication_ptr_ = ptr - kMemAlignSize;
}
} else if (type == kReuseDynamicCommMem) {
MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_);
ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index);
} else if (type == kSomasReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
@ -135,9 +111,6 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
address->from_mem_pool_ = true;
} else if (type == kDynamicMem) {
ptr = MallocDynamicMem(size, false);
} else if (type == kReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_);
ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index);
} else if (type == kSomasReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
@ -147,10 +120,7 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
}
uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size) {
if (type == kReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_);
return mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index);
} else if (type == kSomasReuseDynamicMem) {
if (type == kSomasReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
return somas_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index);
}

View File

@ -20,14 +20,12 @@
#include <utility>
#include <vector>
#include "backend/optimizer/mem_reuse/mem_reuse.h"
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
#include "backend/optimizer/somas/somas.h"
namespace mindspore {
namespace device {
enum MemType { kStaticMem, kDynamicMem, kReuseDynamicMem, kSomasReuseDynamicMem, kReuseDynamicCommMem };
enum MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem };
const int kGetAllOuts = -1;
const uint64_t kMemAlignSize = 512;
using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr;
using SomasPtr = mindspore::somas::SomasPtr;
class MemoryManager {
@ -43,7 +41,6 @@ class MemoryManager {
}
virtual void ClearGlobalIdleMem() {}
void MallocReusedDynamicMem(const session::KernelGraph *graph);
virtual void MallocSomasDynamicMem(const session::KernelGraph *graph);
uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, MemType type, size_t size,
const DeviceAddressPtr &address, bool comm_mem);
@ -72,7 +69,6 @@ class MemoryManager {
uint64_t static_mem_offset_{0};
size_t total_static_size_ = 0;
size_t total_dynamic_size_ = 0;
MemReuseUtilPtr mem_reuse_util_ptr_{nullptr};
SomasPtr somas_reuse_util_ptr_{nullptr};
};
} // namespace device
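
After this cleanup the MemType enum in memory_manager.h is down to three values, and MallocOutputMem/MallocWorkSpaceMem only dispatch between the static region, the plain dynamic region, and a somas-planned offset. A tiny self-contained sketch of that reduced dispatch (the string labels stand in for the real MemoryManager and Somas calls):

#include <cassert>
#include <cstring>

enum MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem };

// Which allocator a node output is served from, mirroring the branches left in
// MemoryManager::MallocOutputMem after this commit.
const char *OutputMemorySource(MemType type) {
  switch (type) {
    case kStaticMem:
      return "static region (MallocStaticMem)";
    case kSomasReuseDynamicMem:
      return "somas-planned offset (Somas::GetNodeOutputPtr)";
    case kDynamicMem:
    default:
      return "dynamic region (MallocDynamicMem)";
  }
}

int main() {
  assert(std::strcmp(OutputMemorySource(kSomasReuseDynamicMem),
                     "somas-planned offset (Somas::GetNodeOutputPtr)") == 0);
  assert(std::strcmp(OutputMemorySource(kDynamicMem), "dynamic region (MallocDynamicMem)") == 0);
  return 0;
}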

View File

@ -1,153 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <memory>
#include <vector>
#include <string>
#include "frontend/operator/ops.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
#include "common/common_test.h"
#include "common/py_func_graph_fetcher.h"
using mindspore::memreuse::BestFitMemReuse;
using mindspore::memreuse::KernelDef;
using mindspore::memreuse::KernelDefPtr;
using mindspore::memreuse::KernelRefCount;
using mindspore::memreuse::KernelRefCountPtr;
using mindspore::memreuse::MemReuseUtil;
using mindspore::memreuse::MemReuseUtilPtr;
using mindspore::memreuse::RefCountType;
using MembufPtr = std::shared_ptr<mindspore::memreuse::Membuf>;
namespace mindspore {
namespace memreuse {
class TestMemReuseAllocator : public UT::Common {
public:
TestMemReuseAllocator() : getPyFun_("gtest_input.mem_reuse.TestMemReuseAllocator", true) {}
void SetUp() {}
public:
UT::PyFuncGraphFetcher getPyFun_;
};
KernelDefPtr GetNewKernelDef(const std::vector<KernelRefCountPtr> &inputs,
const std::vector<KernelRefCountPtr> &outputs, uint32_t stream_id) {
auto kernel_def = std::make_shared<KernelDef>();
kernel_def->set_input_refs(inputs);
kernel_def->set_output_refs(outputs);
kernel_def->set_stream_id(stream_id);
return kernel_def;
}
void InitMemReuseUtils(MemReuseUtil *mem_reuse_util_ptr) {
// tensor params: ref_count, offset, size, index,
auto tensor_0 = std::make_shared<KernelRefCount>();
tensor_0->index_ = 0;
tensor_0->size_ = 512;
tensor_0->ref_count_ = 999;
ASSERT_NE(tensor_0, nullptr);
auto tensor_1 = std::make_shared<KernelRefCount>();
tensor_1->index_ = 1;
tensor_1->size_ = 1024;
tensor_1->ref_count_ = 1;
auto tensor_2 = std::make_shared<KernelRefCount>();
tensor_2->index_ = 2;
tensor_2->size_ = 1024;
tensor_2->ref_count_ = 2;
auto tensor_3 = std::make_shared<KernelRefCount>();
tensor_3->index_ = 3;
tensor_3->size_ = 32;
tensor_3->ref_count_ = 1;
auto tensor_4 = std::make_shared<KernelRefCount>();
tensor_4->index_ = 4;
tensor_4->size_ = 2048;
tensor_4->ref_count_ = 1;
auto tensor_5 = std::make_shared<KernelRefCount>();
tensor_5->index_ = 5;
tensor_5->size_ = 256;
tensor_5->ref_count_ = 1;
MS_LOG(INFO) << "init all tensor info success.";
std::vector<KernelRefCountPtr> inputs;
std::vector<KernelRefCountPtr> outputs;
inputs = {tensor_0};
outputs = {tensor_1};
auto kernel0 = GetNewKernelDef(inputs, outputs, 0);
inputs = {tensor_1};
outputs = {tensor_2};
auto kernel1 = GetNewKernelDef(inputs, outputs, 0);
inputs = {tensor_2};
outputs = {tensor_3};
auto kernel2 = GetNewKernelDef(inputs, outputs, 0);
inputs = {tensor_2, tensor_3};
outputs = {tensor_4};
auto kernel3 = GetNewKernelDef(inputs, outputs, 0);
inputs = {tensor_4};
outputs = {tensor_5};
auto kernel4 = GetNewKernelDef(inputs, outputs, 1);
MS_LOG(INFO) << "init all op info success.";
std::vector<KernelRefCountPtr> tensor_ptr_list{tensor_0, tensor_1, tensor_2, tensor_3, tensor_4, tensor_5};
std::vector<KernelDefPtr> op_ptr_list{kernel0, kernel1, kernel2, kernel3, kernel4};
mem_reuse_util_ptr->set_total_refs_list(tensor_ptr_list);
mem_reuse_util_ptr->set_kernel_def_ptr_list(op_ptr_list);
}
TEST_F(TestMemReuseAllocator, mem_reuse_allocator) {
MS_LOG(INFO) << "mem_reuse_allocator UT";
auto mem_reuse_util_ptr = std::make_shared<MemReuseUtil>();
InitMemReuseUtils(mem_reuse_util_ptr.get());
auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
best_fit_mem_reuse->Reuse(mem_reuse_util_ptr.get());
MS_LOG(INFO) << "run mem reuse success";
size_t total_allocated_size = best_fit_mem_reuse->GetAllocatedSize();
ASSERT_NE(total_allocated_size, 0);
}
TEST_F(TestMemReuseAllocator, mem_reuse_allocator_add_membuf) {
auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
auto tensor_desc = std::make_shared<KernelRefCount>();
tensor_desc->SetKernelRefCountInfo(0, 1024, kDynamicRefCount);
best_fit_mem_reuse->AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
auto allocated_size = best_fit_mem_reuse->GetAllocatedSize();
ASSERT_EQ(allocated_size, 1024);
}
TEST_F(TestMemReuseAllocator, mem_reuse_allocator_split_membuf) {
auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
auto tensor_0 = std::make_shared<KernelRefCount>();
tensor_0->SetKernelRefCountInfo(0, 2048, kDynamicRefCount);
best_fit_mem_reuse->AddNewMembufPtr(tensor_0.get(), kDynamicMem);
auto tensor_1 = std::make_shared<KernelRefCount>();
tensor_1->SetKernelRefCountInfo(1, 800, kDynamicRefCount);
auto is_split = best_fit_mem_reuse->IsSplit(tensor_1->size_, tensor_0->size_);
ASSERT_EQ(is_split, true);
best_fit_mem_reuse->SplitMembuf(tensor_1.get(), 0);
auto allocated_size = best_fit_mem_reuse->GetAllocatedSize();
ASSERT_EQ(allocated_size, 2048);
}
TEST_F(TestMemReuseAllocator, mem_reuse_allocator_align) {
auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
auto size = best_fit_mem_reuse->AlignCommonMemorySize(510);
ASSERT_EQ(size, 1024);
}
} // namespace memreuse
} // namespace mindspore

View File

@ -20,7 +20,6 @@
#include "backend/session/session_basic.h"
#include "backend/session/ascend_session.h"
#include "backend/optimizer/mem_reuse/kernel_refcount.h"
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
#include "runtime/device/kernel_info.h"
#include "backend/kernel_compiler/tbe/tbe_kernel_mod.h"
#include "frontend/operator/ops.h"
@ -229,19 +228,6 @@ TEST_F(TestMemReuseWithPy, KernelRef) {
ASSERT_NE(membuf_ptr, nullptr);
}
TEST_F(TestMemReuseWithPy, ReuseAssignDynamicMemory) {
MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<MemReuseUtil>();
ASSERT_NE(mem_reuse_util_ptr, nullptr);
auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>();
ASSERT_NE(bestfit_mem_reuse, nullptr);
bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get());
auto total_size = bestfit_mem_reuse->GetAllocatedSize();
ASSERT_EQ(total_size, 0);
KernelGraphPtr kernel_graph = std::make_shared<KernelGraph>();
bool ret = mem_reuse_util_ptr->InitDynamicKernelRef(kernel_graph.get());
ASSERT_EQ(ret, true);
}
TEST_F(TestMemReuseWithPy, TestSetInfo) {
KernelGraphPtr g = CreateKernelGraph();
ASSERT_NE(g, nullptr);