memory reuse code clean
parent 8301489439
commit d4d6fb940d

@@ -55,6 +55,7 @@ endif()
if(DEBUG_MODE)
    set(CMAKE_BUILD_TYPE "Debug")
    add_compile_definitions(MEM_REUSE_DEBUG)
else()
    set(CMAKE_BUILD_TYPE "Release")
endif()
@@ -111,6 +111,26 @@ class MemReuseUtil {
  std::unordered_map<AnfNodePtr, session::KernelWithIndex> visit_kernel_with_return_type_in0pos_skip_nop_cache_;
};
using MemReuseUtilPtr = std::shared_ptr<MemReuseUtil>;

enum Status { kUnused, kReused };
enum MemType { kNew, kInStreamReuse, kBetweenStreamReuse, kKernelDependenceReuse };
class Membuf {
 public:
  Membuf() = default;
  Membuf(Status status, size_t size, size_t offset, int index, MemType type, const KernelDefPtr &used_kernel)
      : status_(status), size_(size), offset_(offset), index_(index), type_(type), used_kernel_(used_kernel) {}
  ~Membuf() = default;
  // Memory block status flags
  Status status_ = kUnused;
  size_t size_{0};
  size_t offset_{0};
  // Store the tensor index stored in this memory block at a certain moment
  int index_{0};
  MemType type_{kNew};
  KernelDefPtr used_kernel_;
};
using MembufPtr = std::shared_ptr<Membuf>;

} // namespace memreuse
} // namespace mindspore
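For orientation, each Membuf describes one contiguous slice of the planned reuse region; the sketch below is illustrative only (the values and the `kernel` handle are assumed, not taken from this diff):

// Hypothetical record: a 1024-byte block at offset 0, currently holding tensor #3,
// reused in-stream by `kernel` (a KernelDefPtr assumed to exist in the caller).
auto block = std::make_shared<Membuf>(kReused, 1024, 0, 3, kInStreamReuse, kernel);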
@@ -1,536 +0,0 @@
/**
 * Copyright 2019-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"
#include "backend/optimizer/mem_reuse/mem_reuse_checker.h"
#ifdef ENABLE_D
#include "runtime/device/ascend/ascend_stream_assign.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#include "debug/debugger/debugger.h"
#endif

namespace mindspore {
namespace memreuse {
void BestFitMemReuse::InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr) {
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  set_tensor_ptr_list(mem_reuse_util_ptr->total_refs_list());
  set_workspace_ptr_list(mem_reuse_util_ptr->total_wk_ref_list());
  set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list());
  // check info Correctness
  for (auto &tensor : tensor_ptr_list_) {
    tensor->size_ = AlignCommonMemorySize(tensor->size_);
  }
  // align wk size to 512 && refcount == 1
  for (auto &wk : wk_tensor_list_) {
    wk->size_ = AlignCommonMemorySize(wk->size_);
    wk->ref_count_ = 1;
  }
#ifdef ENABLE_D
  stream_groups_ = device::ascend::AscendStreamAssign::GetInstance().get_stream_group();
#endif
}

void BestFitMemReuse::InitKernelDependence() {
  for (const auto &kernel : op_ptr_list_) {
    std::set<KernelDefPtr> front;
    std::queue<KernelDefPtr> to_visit;
    to_visit.push(kernel);
    // find all kernels before current kernel
    while (!to_visit.empty()) {
      auto curr = to_visit.front();
      to_visit.pop();
      if (front.count(curr)) {
        continue;
      }
      front.insert(curr);
      auto iter = kernel_front_map_.find(curr);
      if (iter != kernel_front_map_.end()) {
        auto visited_front = iter->second;
        front.insert(visited_front.begin(), visited_front.end());
        continue;
      }
      for (const auto &input : curr->input_kernels()) {
        to_visit.push(input);
      }
    }
    kernel_front_map_[kernel] = front;
  }
}

bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr &mem_buf) {
  // determine whether the kernel_curr can reuse kernel_prev's output tensor membuf
  MS_EXCEPTION_IF_NULL(kernel_curr);
  MS_EXCEPTION_IF_NULL(mem_buf);
  auto kernel_prev = mem_buf->used_kernel_;
  MS_EXCEPTION_IF_NULL(kernel_prev);
#ifdef ENABLE_DEBUGGER
  auto debugger_ = mindspore::Debugger::GetInstance();
  if (debugger_->DebuggerBackendEnabled()) {
    std::string current_kernel_name = kernel_curr->scope_full_name();
    if (debugger_->DebugServicesIsWatchPoint(current_kernel_name)) {
      return false;
    }
  }
#endif
  auto curr_stream_id = kernel_curr->stream_id();
  auto prev_stream_id = kernel_prev->stream_id();
  if (curr_stream_id == prev_stream_id) {
    mem_buf->type_ = kInStreamReuse;
    return true;
  }

  bool reuse_between_streams = true;
  for (auto &stream_group : stream_groups_) {
    size_t cur_index = UINT32_MAX;
    size_t prev_index = UINT32_MAX;
    for (size_t index = 0; index < stream_group.size(); index++) {
      if (curr_stream_id == stream_group[index]) {
        cur_index = index;
        continue;
      }
      if (prev_stream_id == stream_group[index]) {
        prev_index = index;
        continue;
      }
    }
    if ((prev_index != UINT32_MAX) && (cur_index == UINT32_MAX || (prev_index > cur_index))) {
      // if the previous stream and the current stream are not in the same group, the membuf can't be reused
      // if the previous stream is behind the current stream, the membuf can't be reused
      reuse_between_streams = false;
      break;
    }
  }

  if (reuse_between_streams) {
    mem_buf->type_ = kBetweenStreamReuse;
    return true;
  }

  auto iter = kernel_front_map_.find(kernel_curr);
  if (iter == kernel_front_map_.end()) {
    MS_LOG(EXCEPTION) << kernel_curr->scope_full_name() << " is not init.";
  }
  auto kernel_curr_front = iter->second;
  auto depend_count = kernel_curr_front.count(kernel_prev);
  if (depend_count) {
    mem_buf->type_ = kKernelDependenceReuse;
    return true;
  }

  return false;
}

void BestFitMemReuse::AssignCommonNodeOutputOffset() {
  MS_EXCEPTION_IF_NULL(current_kernel_);
  for (const auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
    size_t index = GetTensorIndex(tensor_idx);
    auto tensor_desc = tensor_ptr_list_[index];
    MS_EXCEPTION_IF_NULL(tensor_desc);
    if (tensor_desc->type_ == kRefNodeInput) {
      total_refinput_size += tensor_desc->size_;
    } else if (tensor_desc->type_ == kRefNodeOutput) {
      total_refoutput_size += tensor_desc->size_;
      // no need to alloc refnode output's memory
      continue;
    } else if (tensor_desc->type_ == kCommNotReuse) {
      total_comm_not_reuse_size += tensor_desc->size_;
    } else if (tensor_desc->type_ == kCommReuse) {
      // get align size for communication op's single input
      tensor_desc->size_ = AlignCommunicationMemorySize(tensor_desc->size_);
      total_comm_reuse_size += tensor_desc->size_;
    }

    auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_);
    if (!reusable_membuf_map.empty()) {
      auto membuf_index = reusable_membuf_map.begin()->second;
      // find the best suitable membuf in membuf list, and reuse it
      ReuseExistMembuf(tensor_desc.get(), membuf_index, kDynamicMem);
    } else {
      // no membuf can reuse, add new membuf after the membuf_ptr_list
      AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
#ifdef MEM_REUSE_DEBUG
      MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
    }
    // skip left align border for communication op single input to used
    if (tensor_desc->type_ == kCommReuse) {
      tensor_desc->offset_ += kDefaultMemAlignSize;
    }
  }
}

void BestFitMemReuse::AssignCommunicationNodeOutputOffset() {
  size_t total_kernel_output_size = 0;
  // get all output size
  MS_EXCEPTION_IF_NULL(current_kernel_);
  for (const auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
    size_t index = GetTensorIndex(tensor_idx);
    auto tensor_desc = tensor_ptr_list_[index];
    MS_EXCEPTION_IF_NULL(tensor_desc);
    if (tensor_desc->type_ == kCommReuse) {
      total_comm_reuse_size += tensor_desc->size_;
      total_comm_output_reuse_size += tensor_desc->size_;
      total_kernel_output_size += tensor_desc->size_;
    } else {
      MS_LOG(ERROR) << "All communication op's outputs should be memory reuse, Kernel:"
                    << current_kernel_->scope_full_name() << " output index:" << tensor_idx
                    << " tensor_type:" << tensor_desc->type_;
      continue;
    }
  }
  total_kernel_output_size = AlignCommunicationMemorySize(total_kernel_output_size);

  // add left align border for the first output and right align border for the last output to alloc align border memory
  size_t output_index = 0;
  auto output_ref_indexes = current_kernel_->GetOutputRefIndexs();
  for (const auto &out_index : output_ref_indexes) {
    size_t index = GetTensorIndex(out_index);
    auto descption = tensor_ptr_list_[index];
    MS_EXCEPTION_IF_NULL(descption);
    if (output_index == 0) {
      descption->size_ += kDefaultMemAlignSize;
    }

    if ((output_index == 0) && (output_ref_indexes.size() == 1)) {
      // add right align border for single output
      descption->size_ += kDefaultMemAlignSize;
    }

    output_index++;
  }

  auto reusable_membuf_map = GetReusableMembufMap(total_kernel_output_size);
  if (!reusable_membuf_map.empty()) {
    auto membuf_index = reusable_membuf_map.begin()->second;
    output_index = 0;
    for (const auto &idx : current_kernel_->GetOutputRefIndexs()) {
      size_t index = GetTensorIndex(idx);
      auto desc = tensor_ptr_list_[index];
      MS_EXCEPTION_IF_NULL(desc);
      ReuseExistMembuf(desc.get(), membuf_index + output_index, kDynamicMem);
      // skip left align border for communication op's first output
      if (output_index == 0) {
        desc->offset_ += kDefaultMemAlignSize;
      }
      output_index++;
    }
  } else {
    // no membuf can reuse, add new membuf after the membuf_ptr_list
    output_index = 0;
    for (const auto &tensor_index : current_kernel_->GetOutputRefIndexs()) {
      size_t index = GetTensorIndex(tensor_index);
      auto desc = tensor_ptr_list_[index];
      MS_EXCEPTION_IF_NULL(desc);
      AddNewMembufPtr(desc.get(), kDynamicMem);
      // skip align size offset for first output to used
      if (output_index == 0) {
        desc->offset_ += kDefaultMemAlignSize;
      }
      output_index++;
#ifdef MEM_REUSE_DEBUG
      MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
    }
  }
}

void BestFitMemReuse::AssignNodeOutputOffset() {
  if (current_kernel_->type_ == kCommunicationNode) {
    AssignCommunicationNodeOutputOffset();
  } else {
    AssignCommonNodeOutputOffset();
  }
}

void BestFitMemReuse::AssignNodeWorkspaceOffset() {
  for (auto &wk_idx : current_kernel_->GetWorkspaceRefIndexs()) {
    size_t index = GetWorkspaceIndex(wk_idx);
    auto wk_ref = wk_tensor_list_[index];
    MS_EXCEPTION_IF_NULL(wk_ref);
    auto re_wk_membuf_map = GetReusableMembufMap(wk_ref->size_);
    if (!re_wk_membuf_map.empty()) {
      auto membuf_index = re_wk_membuf_map.begin()->second;
      ReuseExistMembuf(wk_ref.get(), membuf_index, kWorkspaceMem);
    } else {
      AddNewMembufPtr(wk_ref.get(), kWorkspaceMem);
    }
  }
}

void BestFitMemReuse::ReuseExistMembuf(KernelRefCount *tensor_desc, size_t membuf_index, int flag) {
  MS_EXCEPTION_IF_NULL(tensor_desc);
  CheckMembufIndx(membuf_index);
  auto membuf = membuf_ptr_list_[membuf_index];
  MS_EXCEPTION_IF_NULL(membuf);
  // first to split && then update membuf_info
  if (IsSplit(tensor_desc->size_, membuf->size_)) {
    // split the membuf, and insert a new membuf after this membuf
    SplitMembuf(tensor_desc, membuf_index);
  }
  // update membuf status, and set tensor offset
  UpdateMembufInfo(tensor_desc, membuf.get(), flag);
}

std::map<size_t, size_t> BestFitMemReuse::GetReusableMembufMap(size_t tensor_size) {
  std::map<size_t, size_t> size_map;
  for (size_t i = 0; i < membuf_ptr_list_.size(); ++i) {
    auto membuf = membuf_ptr_list_[i];
    auto index = i;
    bool is_membuf_ok = membuf->status_ == kUnused && membuf->size_ >= tensor_size;
    if (is_membuf_ok && IsUsable(current_kernel_, membuf)) {
      (void)size_map.insert(std::make_pair(membuf->size_, index));
      break;
    }
  }
  return size_map;
}

void BestFitMemReuse::UpdateMembufInfo(KernelRefCount *tensor_desc, Membuf *membuf, int flag) {
  MS_EXCEPTION_IF_NULL(tensor_desc);
  MS_EXCEPTION_IF_NULL(membuf);
  auto real_index = GetRealIndex(IntToSize(tensor_desc->index_), flag);
  membuf->status_ = kReused;
  membuf->index_ = real_index;
  membuf->used_kernel_ = current_kernel_;
  tensor_desc->offset_ = membuf->offset_;
}

bool BestFitMemReuse::IsSplit(size_t tensor_size, size_t membuf_size) const { return tensor_size < membuf_size; }

void BestFitMemReuse::SplitMembuf(const KernelRefCount *tensor_desc, size_t membuf_index) {
  MS_EXCEPTION_IF_NULL(tensor_desc);
  CheckMembufIndx(membuf_index);
  auto membuf = membuf_ptr_list_[membuf_index];
  MS_EXCEPTION_IF_NULL(membuf);
  auto bias = membuf->size_ - tensor_desc->size_;
  membuf->size_ = tensor_desc->size_;
  // to check if the split membuf can be merged
  auto new_membuf = std::make_shared<Membuf>(kUnused, bias, membuf->offset_ + membuf->size_, kInvalidIndex,
                                             membuf->type_, current_kernel_);
  (void)membuf_ptr_list_.insert(membuf_ptr_list_.begin() + SizeToInt(membuf_index + 1), new_membuf);
}

void BestFitMemReuse::AddNewMembufPtr(KernelRefCount *tensor_desc, int flag) {
  MS_EXCEPTION_IF_NULL(tensor_desc);
  size_t membuf_offset = 0;
  if (!membuf_ptr_list_.empty()) {
    membuf_offset = membuf_ptr_list_.back()->offset_ + membuf_ptr_list_.back()->size_;
  }
  auto membuf_size = tensor_desc->size_;
  auto real_index = GetRealIndex(IntToSize(tensor_desc->index_), flag);
  auto membuf = std::make_shared<Membuf>(kReused, membuf_size, membuf_offset, real_index, kNew, current_kernel_);
  membuf_ptr_list_.push_back(membuf);
  tensor_desc->offset_ = membuf_offset;
}

void BestFitMemReuse::UpdateNodeInputAndMembuf() {
  // process node input tensor
  for (const auto &tensor_idx : current_kernel_->GetInputRefIndexs()) {
    size_t tensor_index = GetTensorIndex(tensor_idx);
    auto tensor_desc = tensor_ptr_list_[tensor_index];
    MS_EXCEPTION_IF_NULL(tensor_desc);
    tensor_desc->ref_count_--;
    if (tensor_desc->ref_count_ == 0) {
      ReleaseMembuf(tensor_index, kDynamicMem);
    } else if (tensor_desc->ref_count_ < 0) {
      MS_LOG(EXCEPTION) << "tensor: " << tensor_desc->index_ << " refcount: " << tensor_desc->ref_count_
                        << " check error";
    }
  }
}

void BestFitMemReuse::ReleaseNodeUnusedOutput() {
  for (const auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
    size_t tensor_index = GetTensorIndex(tensor_idx);
    auto tensor_desc = tensor_ptr_list_[tensor_index];
    MS_EXCEPTION_IF_NULL(tensor_desc);
    if (tensor_desc->ref_count_ == 0) {
      ReleaseMembuf(tensor_index, kDynamicMem);
    } else if (tensor_desc->ref_count_ < 0) {
      MS_LOG(EXCEPTION) << "tensor: " << tensor_desc->index_ << " refcount: " << tensor_desc->ref_count_
                        << " check error";
    }
  }
}

void BestFitMemReuse::ReleasePreNodeWorkspace(const KernelDef *kernel_def_ptr) {
  for (auto &workspace_index : kernel_def_ptr->GetWorkspaceRefIndexs()) {
    size_t index = GetWorkspaceIndex(workspace_index);
    auto wk_tensor = wk_tensor_list_[index];
    wk_tensor->ref_count_--;
    if (wk_tensor->ref_count_ == 0) {
      ReleaseMembuf(index, kWorkspaceMem);
    } else if (wk_tensor->ref_count_ < 0) {
      MS_LOG(EXCEPTION) << "tensor: " << wk_tensor->index_ << " refcount: " << wk_tensor->ref_count_ << " check error";
    }
  }
}

void BestFitMemReuse::ReleaseMembuf(size_t tensor_index, int flag) {
  if (membuf_ptr_list_.empty()) {
    return;
  }
  auto real_index = GetRealIndex(tensor_index, flag);
  auto membuf_iter = std::find_if(membuf_ptr_list_.begin(), membuf_ptr_list_.end(),
                                  [real_index](const MembufPtr &membuf) { return membuf->index_ == real_index; });
  if (membuf_iter == membuf_ptr_list_.end()) {
    return;
  }
  auto membuf = (*membuf_iter);
  MS_EXCEPTION_IF_NULL(membuf);
  membuf->status_ = kUnused;
  if (membuf_iter != membuf_ptr_list_.end() - 1) {
    auto next_iter = membuf_iter + 1;
    auto membuf_next = (*next_iter);
    MS_EXCEPTION_IF_NULL(membuf_next);
    if (membuf_next->status_ == kUnused) {
      bool is_merge = IsUsable(current_kernel_, membuf_next);
      if (is_merge) {
        membuf->size_ += membuf_next->size_;
        (void)membuf_ptr_list_.erase(next_iter);
      }
    }
  }
  if (membuf_iter != membuf_ptr_list_.begin()) {
    auto prev_iter = membuf_iter - 1;
    auto membuf_prev = (*prev_iter);
    MS_EXCEPTION_IF_NULL(membuf_prev);
    if (membuf_prev->status_ == kUnused) {
      bool is_merge = IsUsable(current_kernel_, membuf_prev);
      if (is_merge) {
        membuf->size_ += membuf_prev->size_;
        membuf->offset_ = membuf_prev->offset_;
        (void)membuf_ptr_list_.erase(prev_iter);
      }
    }
  }
}

size_t BestFitMemReuse::AlignCommonMemorySize(size_t size) const {
  // memory size 512 align
  return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}

size_t BestFitMemReuse::AlignCommunicationMemorySize(size_t size) const {
  // memory size 512 align and add communication memory: left align border memory - data - right align border memory
  return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
         kDefaultMemAlignSize;
}
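A quick numeric check of the two alignment helpers above, assuming kDefaultMemAlignSize is 512 (the value implied by the comments here and by the align unit test near the end of this diff); `allocator` is a hypothetical BestFitMemReuse instance:

// Illustrative only, not part of the original file.
size_t a = allocator.AlignCommonMemorySize(510);          // 1024, as asserted by the UT below
size_t b = allocator.AlignCommunicationMemorySize(1000);  // 512 + 1024 + 512 = 2048 (left border + aligned data + right border)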

size_t BestFitMemReuse::GetAllocatedSize() {
  size_t AllocatedSize = kTotalSize;
  if (membuf_ptr_list_.empty()) {
    return AllocatedSize;
  }
  AllocatedSize = membuf_ptr_list_.back()->offset_ + membuf_ptr_list_.back()->size_;
  MS_LOG(INFO) << "MemReuse Allocated Dynamic Size: " << AllocatedSize;
  return AllocatedSize;
}

bool BestFitMemReuse::IsRelease() {
  // unable_used_node lists the node types whose output tensors cannot be released,
  // even if their refcount is equal to zero.
  std::unordered_set<std::string> unable_used_node = {
    prim::kPrimBatchNorm->name(),
    prim::kPrimBatchNormGrad->name(),
  };
  return unable_used_node.find(current_kernel_->kernel_name()) == unable_used_node.end();
}

size_t BestFitMemReuse::GetTensorIndex(int index) const {
  if (index < 0 || IntToSize(index) >= tensor_ptr_list_.size()) {
    MS_LOG(WARNING) << "current cnode: " << current_kernel_->scope_full_name();
    MS_LOG(EXCEPTION) << "invalid tensor index";
  }
  return IntToSize(index);
}

size_t BestFitMemReuse::GetWorkspaceIndex(int index) const {
  if (index < 0 || IntToSize(index) >= wk_tensor_list_.size()) {
    MS_LOG(WARNING) << "current cnode: " << current_kernel_->scope_full_name();
    MS_LOG(EXCEPTION) << "invalid tensor index";
  }
  return IntToSize(index);
}

int BestFitMemReuse::GetRealIndex(size_t index, int flag) const {
  if (flag == kDynamicMem) {
    return SizeToInt(index);
  } else if (flag == kWorkspaceMem) {
    return kWorkspaceIndexFactor * SizeToInt(index + 1);
  } else {
    MS_LOG(EXCEPTION) << "flag " << flag << " is invalid";
  }
}
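The negative kWorkspaceIndexFactor (declared as -1000 in the header below) keeps workspace entries from colliding with tensor indices inside the shared membuf list; a small, assumed example with the same hypothetical `allocator`:

// Illustrative only, not part of the original file.
int t = allocator.GetRealIndex(2, kDynamicMem);    // 2: dynamic tensors keep their index
int w = allocator.GetRealIndex(2, kWorkspaceMem);  // -1000 * (2 + 1) == -3000: workspace slots are encoded as negative indices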

void BestFitMemReuse::CheckMembufIndx(size_t membuf_index) const {
  if (membuf_index >= membuf_ptr_list_.size()) {
    MS_LOG(WARNING) << "current cnode: " << current_kernel_->scope_full_name();
    MS_LOG(EXCEPTION) << "invalid membuf index: " << membuf_index << ", real size: " << membuf_ptr_list_.size();
  }
}

void BestFitMemReuse::Reuse(const MemReuseUtil *mem_reuse_util_ptr) {
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  InitMemReuseInfo(mem_reuse_util_ptr);
  InitKernelDependence();
  KernelDefPtr pre_op = nullptr;
#ifdef MEM_REUSE_DEBUG
  size_t op_num = 0;
#endif
  for (const auto &op_def_ptr : op_ptr_list_) {
    current_kernel_ = op_def_ptr;
    // release pre_op_def
    if (pre_op != nullptr) {
      ReleasePreNodeWorkspace(pre_op.get());
    }
    MemReuseChecker::GetInstance().IsAddNewMembuf_ = false;
    // process node output tensor
    AssignNodeOutputOffset();
#ifdef MEM_REUSE_DEBUG
    if (MemReuseChecker::GetInstance().IsAddNewMembuf_) {
      MemReuseChecker::GetInstance().SetAddNewMembuInfos(op_def_ptr.get(), membuf_ptr_list_, op_num);
    }
#endif
    // deal with current op's workspace
    AssignNodeWorkspaceOffset();
    pre_op = op_def_ptr;
    // update node input tensor refcount, and membuf list status
    UpdateNodeInputAndMembuf();
    // check node output tensor which refcount is equal to zero
    if (IsRelease()) {
      ReleaseNodeUnusedOutput();
    }
#ifdef MEM_REUSE_DEBUG
    MemReuseChecker::GetInstance().SetMembuInfos(op_def_ptr.get(), membuf_ptr_list_);
    ++op_num;
#endif
  }
  MS_LOG(INFO) << "Special Tensor total size: RefInput: " << total_refinput_size
               << " RefOutput: " << total_refoutput_size << " CommReuse: " << total_comm_reuse_size
               << " CommOutputReuse: " << total_comm_output_reuse_size
               << " CommNotReuse: " << total_comm_not_reuse_size;
#ifdef MEM_REUSE_DEBUG
  MemReuseChecker::GetInstance().ExportMembufInfoIR();
  MemReuseChecker::GetInstance().ExportAddNewMmebufIR();
  MemReuseChecker::GetInstance().set_kernel_front_map(kernel_front_map_);
  MemReuseChecker::GetInstance().ExportKernelDependence();
#endif
}
} // namespace memreuse
} // namespace mindspore
@@ -1,174 +0,0 @@
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_REUSE_ALLOCATOR_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_REUSE_ALLOCATOR_H_
#include <cmath>
#include <map>
#include <list>
#include <memory>
#include <vector>
#include <numeric>
#include <algorithm>
#include <utility>
#include <fstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <set>
#include <queue>
#include "backend/optimizer/mem_reuse/kernel_refcount.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"

namespace mindspore {
namespace memreuse {
static constexpr int kWorkspaceIndexFactor = -1000;
static constexpr int kDynamicMem = -1;
static constexpr int kWorkspaceMem = 1;
static constexpr size_t kTotalSize = 0;
enum Status { kUnused, kReused };
enum MemType { kNew, kInStreamReuse, kBetweenStreamReuse, kKernelDependenceReuse };
class Membuf {
 public:
  Membuf() = default;
  Membuf(Status status, size_t size, size_t offset, int index, MemType type, const KernelDefPtr &used_kernel)
      : status_(status), size_(size), offset_(offset), index_(index), type_(type), used_kernel_(used_kernel) {}
  ~Membuf() = default;
  // Memory block status flags
  Status status_ = kUnused;
  size_t size_{0};
  size_t offset_{0};
  // Store the tensor index stored in this memory block at a certain moment
  int index_{0};
  MemType type_{kNew};
  KernelDefPtr used_kernel_;
};
using MembufPtr = std::shared_ptr<Membuf>;

class BestFitMemReuse {
 public:
  BestFitMemReuse() = default;
  ~BestFitMemReuse() { membuf_ptr_list_.clear(); }
  /**
   * Init all information need by memory reuse
   * @param mem_reuse_util_ptr, initialize in the memreuse.cc
   */
  void InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr);
  void CheckMembufIndx(size_t check_idx) const;
  void AssignNodeWorkspaceOffset();
  void ReleasePreNodeWorkspace(const KernelDef *kernel_def_ptr);
  /**
   * Assign output tensor memory offset of current kernel
   */
  void AssignNodeOutputOffset();
  /**
   * Assign output tensor memory offset of common kernel
   */
  void AssignCommonNodeOutputOffset();
  /**
   * Assign output tensor memory offset of communication kernel
   */
  void AssignCommunicationNodeOutputOffset();
  /**
   * Update input tensor's status of current kernel, and the status of membuf used by current kernel
   */
  void UpdateNodeInputAndMembuf();
  /**
   * Check whether to release the kernel output tensor which refcount is equal to zero
   */
  void ReleaseNodeUnusedOutput();
  /**
   * Reuse the exist membuf if possible
   * @param tensor_desc, the output tensor of current kernel
   * @param membuf_index, the index of membuf to be reused
   * @param flag
   */
  void ReuseExistMembuf(KernelRefCount *tensor_desc, size_t membuf_index, int flag);
  /**
   * Get the membuf that can be reused
   * @param tensor_size, the size of the tensor ready to assign memory offset
   * @return membuf map, key: the membuf size, value: the membuf index
   */
  std::map<size_t, size_t> GetReusableMembufMap(size_t tensor_size);
  /**
   * Update the status of the reused memory block
   * @param tensor_desc, the tensor ready to assign memory
   * @param membuf, the membuf to be reused
   * @param flag, distinguish dynamic memory and workspace
   */
  void UpdateMembufInfo(KernelRefCount *tensor_desc, Membuf *membuf, int flag);
  // If the size of the memory block is greater than the size of the tensor, split the extra memory
  void SplitMembuf(const KernelRefCount *tensor_desc, size_t membuf_index);
  // Determine if the memory block needs to be split
  bool IsSplit(size_t tensor_size, size_t membuf_size) const;
  // If there is no memory block that can be reused, add a new memory block at the end
  void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag);
  // Merge unused membuf
  void ReleaseMembuf(size_t tensor_index, int flag);
  // Memory address alignment for common memory
  size_t AlignCommonMemorySize(size_t size) const;
  // Memory address alignment for communication used memory
  size_t AlignCommunicationMemorySize(size_t size) const;
  int GetRealIndex(size_t index, int flag = kDynamicMem) const;
  size_t GetTensorIndex(int index) const;
  size_t GetWorkspaceIndex(int index) const;
  // Memory reuse main program entry
  void Reuse(const MemReuseUtil *mem_reuse_util_ptr);
  // Get the total memory that needs to be applied eventually
  size_t GetAllocatedSize();
  // return false, when the node output cannot be released
  bool IsRelease();
  /**
   * determine if the kernel_curr can reuse the output tensor add of kernel_prev
   * @param kernel_curr, current kernel
   * @param mem_buf, the membuf
   * @return bool
   */
  bool IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr &mem_buf);
  /**
   * init the dependence of all kernels in the graph
   */
  void InitKernelDependence();
  // set tensor_def and op_def
  void set_tensor_ptr_list(const std::vector<KernelRefCountPtr> &tensor_ptr_list) {
    tensor_ptr_list_ = tensor_ptr_list;
  }
  void set_workspace_ptr_list(const std::vector<KernelRefCountPtr> &workspace_ptr_list) {
    wk_tensor_list_ = workspace_ptr_list;
  }
  void set_op_ptr_list(const std::vector<KernelDefPtr> &op_ptr_list) { op_ptr_list_ = op_ptr_list; }

 private:
  KernelDefPtr current_kernel_;
  // Save all tensor information
  std::vector<KernelRefCountPtr> tensor_ptr_list_;
  std::vector<KernelRefCountPtr> wk_tensor_list_;
  // Save all op information, including input and output tensor index
  std::vector<KernelDefPtr> op_ptr_list_;
  // Memory block information sequence, temporary variables
  std::vector<MembufPtr> membuf_ptr_list_;
  // kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def
  std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_;
  std::vector<std::vector<uint32_t>> stream_groups_;
  size_t total_refinput_size{0};
  size_t total_refoutput_size{0};
  size_t total_comm_reuse_size{0};
  size_t total_comm_output_reuse_size{0};
  size_t total_comm_not_reuse_size{0};
};
} // namespace memreuse
} // namespace mindspore
#endif // #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_REUSE_ALLOCATOR_H_
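For context, the deleted allocator was driven by MemoryManager::MallocReusedDynamicMem (removed further down in this diff); a condensed sketch of that call sequence, with `graph` assumed to be a prepared session::KernelGraph*:

// Condensed from the deleted MemoryManager::MallocReusedDynamicMem; error checks omitted.
auto mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
mem_reuse_util_ptr->SetAllInfo(graph);                                 // collect tensor/workspace/op info
auto bestfit_mem_reuse = std::make_shared<memreuse::BestFitMemReuse>();
bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get());                    // plan best-fit offsets
size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize();   // single contiguous block to malloc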
@@ -26,7 +26,6 @@
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"
#include "backend/kernel_compiler/common_utils.h"
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
namespace mindspore {
namespace memreuse {
constexpr auto kSplitC = '/';
@@ -47,6 +47,9 @@ void AscendMemoryManager::MallocDeviceMemory() {
    } else {
      MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]";
    }
  } else {
    MS_LOG(INFO) << "Call rtMalloc to allocate device memory Success, size : " << device_mem_size_
                 << " bytes , address : " << reinterpret_cast<void *>(device_mem_base_);
  }
  AscendMemoryPool::GetInstance().Init(device_mem_base_, device_mem_size_, dynamic_mem_offset_);
}
@@ -107,6 +110,12 @@ uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_me
  } else {
    align_size = GetCommonAlignSize(size);
  }
  auto device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset();
  MS_LOG(INFO) << "Malloc Memory for Static: size[" << align_size << "], Memory statistics: total[" << device_mem_size_
               << "] dynamic [" << total_dynamic_size_ << "] static [" << device_mem_size_ - device_mem_pool_offset
               << "], Pool statistics: pool total size [" << AscendMemoryPool::GetInstance().total_mem_statistics()
               << "] used [" << AscendMemoryPool::GetInstance().used_mem_statistics()
               << "] communication_mem:" << communication_mem;

  if (MemoryProfiling::GetInstance().IsMemoryProfilingEnable() && graph_id != kInvalidGraphId) {
    auto node = MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id);
@@ -136,9 +145,9 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m
  }

  auto device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset();
  MS_LOG(INFO) << "Malloc Memory: Dynamic, total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_
               << "] memory pool[" << device_mem_size_ - device_mem_pool_offset << "])"
               << " malloc [" << align_size << "] communication_mem: " << communication_mem;
  MS_LOG(INFO) << "Malloc Memory for Dynamic: size[" << align_size << "], Memory statistics: total[" << device_mem_size_
               << "] dynamic[" << total_dynamic_size_ << "] static[" << device_mem_size_ - device_mem_pool_offset
               << "] communication_mem: " << communication_mem;
  auto offset = dynamic_mem_offset_;
  auto new_offset = dynamic_mem_offset_ + align_size;
  if (new_offset >= device_mem_pool_offset) {
@@ -22,6 +22,7 @@
#include <string>
#include <unordered_map>
#include <vector>
#include <fstream>
#include "runtime/device/kernel_runtime.h"
#include "ir/anf.h"
#include "backend/kernel_compiler/ascend_kernel_mod.h"
@@ -29,11 +29,13 @@
#include "runtime/device/kernel_runtime.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "backend/optimizer/mem_reuse/mem_swap_manager.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"

namespace mindspore {
namespace device {
namespace gpu {
using mindspore::device::memswap::MemSwapManagerPtr;
using mindspore::memreuse::MemReuseUtilPtr;
class GPUKernelRuntime : public KernelRuntime {
 public:
  GPUKernelRuntime() = default;
@@ -276,7 +276,7 @@ void KernelRuntime::RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value
void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_LOG(INFO) << "AssignStaticMemoryInput start";
  MS_LOG(INFO) << "AssignStaticMemoryInput start for graph " << graph->graph_id();
  auto graph_inputs = graph->inputs();
  auto graph_valid_input = graph->valid_inputs();
  graph_inputs.insert(graph_inputs.end(), graph->child_graph_result().begin(), graph->child_graph_result().end());
@@ -342,8 +342,8 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
#endif
    auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
    device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
    MS_LOG(INFO) << "Malloc Input for graph " << graph->graph_id() << ", node: " << item->fullname_with_scope()
                 << " index: " << index << " size: " << tensor_size;
    MS_LOG(INFO) << "Assign Static Memory for Input node, size:" << tensor_size
                 << " node:" << item->fullname_with_scope() << " index: " << index;
    if (mem_manager_->MallocMem(kStaticMem, tensor_size, device_address, graph->graph_id()) == nullptr) {
      MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << tensor_size;
    }
@@ -355,7 +355,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {

void KernelRuntime::AssignStaticMemoryOutput(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_LOG(INFO) << "AssignStaticMemoryOutput start";
  MS_LOG(INFO) << "AssignStaticMemoryOutput start for graph " << graph->graph_id();
  auto nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem});
  std::vector<session::KernelWithIndex> non_communication_op;
  // Assign Communicate Op Memory firstly.
@@ -500,12 +500,7 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(MemType type, const AnfNode
    return;
  }

  if (type == kReuseDynamicMem) {
    // reuse communication op's all outputs' memory
    type = kReuseDynamicCommMem;
  }

  if (type == kReuseDynamicCommMem || type == kSomasReuseDynamicMem) {
    if (type == kSomasReuseDynamicMem) {
      bool not_reuse = KernelMemNotReuse(node);
      if (not_reuse) {
        type = kDynamicMem;
@@ -588,7 +583,7 @@ void KernelRuntime::AssignCommunicationNodeInputMem(MemType type, const AnfNodeP
    return;
  }

  if (type == kReuseDynamicMem || type == kSomasReuseDynamicMem) {
    if (type == kSomasReuseDynamicMem) {
      bool not_reuse = KernelMemNotReuse(node);
      if (not_reuse) {
        type = kDynamicMem;
@@ -616,20 +611,8 @@
void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, int index) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  if (AnfAlgo::IsGetNext(NOT_NULL(node)) && type == kReuseDynamicMem) {
    MS_LOG(INFO) << "GetNext disable mem_reuse";
    type = kDynamicMem;
  }

  if (node->isa<CNode>()) {
    bool independent = AnfAlgo::IsIndependentNode(node->cast<CNodePtr>());
    if (independent && (type == kReuseDynamicMem)) {
      MS_LOG(INFO) << "Independent node " << node->fullname_with_scope() << " disable memory reuse";
      type = kDynamicMem;
    }
  }

  if (type == kReuseDynamicMem || type == kSomasReuseDynamicMem) {
    if (type == kSomasReuseDynamicMem) {
      bool not_reuse = KernelMemNotReuse(node);
      if (not_reuse) {
        type = kDynamicMem;
@@ -652,6 +635,10 @@ void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, in
      continue;
    }
    MS_LOG(DEBUG) << "Assign Node:" << node->fullname_with_scope() << " output memory size:" << output_sizes[i];
    if (type == kStaticMem) {
      MS_LOG(INFO) << "Assign Static Memory for Output node, size:" << output_sizes[i]
                   << " node:" << node->fullname_with_scope();
    }
    std::string output_format = AnfAlgo::GetOutputFormat(node, i);
    auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
    auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
@@ -699,8 +686,12 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
    if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER) &&
        !mem_manager_->MallocMemFromMemPool(address, node_size)) {
      MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << node_size;
    } else if (mem_manager_->MallocMem(kStaticMem, node_size, address, graph_id) == nullptr) {
      MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << node_size;
    } else {
      MS_LOG(INFO) << "Assign Static Memory for Value node, size:" << node_size
                   << " node:" << value_node->fullname_with_scope();
      if (mem_manager_->MallocMem(kStaticMem, node_size, address, graph_id) == nullptr) {
        MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << node_size;
      }
    }
    AnfAlgo::SetOutputAddr(address, output_idx, value_node.get());
    if (!address->SyncHostToDevice(trans::GetRuntimePaddingShape(value_node, 0), tensor_size, tensor->data_type(),
@@ -717,7 +708,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  MS_LOG(DEBUG) << "AssignStaticMemoryValueNode start";
  MS_LOG(DEBUG) << "AssignStaticMemoryValueNode start for graph " << graph->graph_id();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  // order the value nodes
@@ -747,8 +738,13 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
    if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER) &&
        !mem_manager_->MallocMemFromMemPool(address, tensor_size)) {
      MS_LOG(EXCEPTION) << "Device memory isn't enough and alloc failed, alloc size:" << tensor_size;
    } else if (mem_manager_->MallocMem(kStaticMem, tensor_size, address, graph->graph_id()) == nullptr) {
      MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << tensor_size;
    } else {
      MS_LOG(INFO) << "Assign Static Memory for Value node, size:" << tensor_size
                   << " node:" << value_node->fullname_with_scope();
      if (mem_manager_->MallocMem(kStaticMem, tensor_size, address, graph->graph_id()) == nullptr) {
        MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem
                          << ", tensor size is: " << tensor_size;
      }
    }
    AnfAlgo::SetOutputAddr(address, 0, value_node.get());
    ShapeVector shape = {1, SizeToLong(tensor_size)};
@@ -776,13 +772,8 @@ void KernelRuntime::AssignDynamicMemory(session::KernelGraph *graph) {

  if (is_enable_mem_reuse) {
    MS_LOG(INFO) << "Memory Reuse is enable...";
#ifdef MEM_REUSE_DEBUG
    mem_manager_->MallocReusedDynamicMem(graph);
    mem_type = kReuseDynamicMem;
#else
    mem_manager_->MallocSomasDynamicMem(graph);
    mem_type = kSomasReuseDynamicMem;
#endif
  } else {
    MS_LOG(INFO) << "Memory Reuse is disable...";
  }
@@ -973,8 +964,8 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
    MS_EXCEPTION_IF_NULL(kernel_mod);

    // Skip transpose kernel with "nop_op" attr which is not hidden or removed in PyNative infer scenario. Transpose
    // kernel, which is not supposed to be executed, is generated in TransDataSplit to support specific Transdata. And
    // hard code here should be removed after new Transdata programme is implemented in the foreseeable future.
    // kernel, which is not supposed to be executed, is generated in TransDataSplit to support specific Transdata.
    // And hard code here should be removed after new Transdata programme is implemented in the foreseeable future.
    if (AnfAlgo::HasNodeAttr("nop_op", kernel)) {
      for (size_t idx = 0; idx < AnfAlgo::GetOutputTensorNum(kernel); idx += 1) {
        auto real_input = AnfAlgo::GetRealInputIndex(kernel, idx);
@@ -22,9 +22,6 @@
#endif
#include "utils/ms_context.h"

using mindspore::memreuse::BestFitMemReuse;
using mindspore::memreuse::MemReuseUtilPtr;

namespace mindspore {
namespace device {
constexpr size_t kAlignBytes = 32;
@@ -37,24 +34,6 @@ size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) const {
  return (input_size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize + 2 * kMemAlignSize;
}

void MemoryManager::MallocReusedDynamicMem(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
  // set all infos
  mem_reuse_util_ptr->SetAllInfo(graph);
  auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>();
  MS_EXCEPTION_IF_NULL(bestfit_mem_reuse);
  bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get());
  size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize();
  MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]";
  mem_reuse_util_ptr_ = mem_reuse_util_ptr;
  auto base_ptr = MallocDynamicMem(total_allocated_size, false);
  MS_LOG(INFO) << "Reuse Memory from [" << reinterpret_cast<void *>(base_ptr) << "] to ["
               << reinterpret_cast<void *>(base_ptr + total_allocated_size) << "]";
  mem_reuse_util_ptr_->set_mem_base(base_ptr);
}

void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  SomasPtr somas_reuse_util_ptr = std::make_shared<somas::Somas>();
@@ -117,9 +96,6 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
    if (communication_mem) {
      address->communication_ptr_ = ptr - kMemAlignSize;
    }
  } else if (type == kReuseDynamicCommMem) {
    MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_);
    ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index);
  } else if (type == kSomasReuseDynamicMem) {
    MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
    ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
@@ -135,9 +111,6 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
    address->from_mem_pool_ = true;
  } else if (type == kDynamicMem) {
    ptr = MallocDynamicMem(size, false);
  } else if (type == kReuseDynamicMem) {
    MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_);
    ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index);
  } else if (type == kSomasReuseDynamicMem) {
    MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
    ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
@@ -147,10 +120,7 @@
}

uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size) {
  if (type == kReuseDynamicMem) {
    MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_);
    return mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index);
  } else if (type == kSomasReuseDynamicMem) {
  if (type == kSomasReuseDynamicMem) {
    MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
    return somas_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index);
  }
@@ -20,14 +20,12 @@
#include <utility>
#include <vector>
#include "backend/optimizer/mem_reuse/mem_reuse.h"
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
#include "backend/optimizer/somas/somas.h"
namespace mindspore {
namespace device {
enum MemType { kStaticMem, kDynamicMem, kReuseDynamicMem, kSomasReuseDynamicMem, kReuseDynamicCommMem };
enum MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem };
const int kGetAllOuts = -1;
const uint64_t kMemAlignSize = 512;
using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr;
using SomasPtr = mindspore::somas::SomasPtr;

class MemoryManager {
@@ -43,7 +41,6 @@ class MemoryManager {
  }
  virtual void ClearGlobalIdleMem() {}

  void MallocReusedDynamicMem(const session::KernelGraph *graph);
  virtual void MallocSomasDynamicMem(const session::KernelGraph *graph);
  uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, MemType type, size_t size,
                           const DeviceAddressPtr &address, bool comm_mem);
@@ -72,7 +69,6 @@ class MemoryManager {
  uint64_t static_mem_offset_{0};
  size_t total_static_size_ = 0;
  size_t total_dynamic_size_ = 0;
  MemReuseUtilPtr mem_reuse_util_ptr_{nullptr};
  SomasPtr somas_reuse_util_ptr_{nullptr};
};
} // namespace device
@@ -1,153 +0,0 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <memory>
#include <vector>
#include <string>
#include "frontend/operator/ops.h"
#include "backend/optimizer/mem_reuse/mem_reuse.h"
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"

#include "common/common_test.h"
#include "common/py_func_graph_fetcher.h"

using mindspore::memreuse::BestFitMemReuse;
using mindspore::memreuse::KernelDef;
using mindspore::memreuse::KernelDefPtr;
using mindspore::memreuse::KernelRefCount;
using mindspore::memreuse::KernelRefCountPtr;
using mindspore::memreuse::MemReuseUtil;
using mindspore::memreuse::MemReuseUtilPtr;
using mindspore::memreuse::RefCountType;
using MembufPtr = std::shared_ptr<mindspore::memreuse::Membuf>;

namespace mindspore {
namespace memreuse {
class TestMemReuseAllocator : public UT::Common {
 public:
  TestMemReuseAllocator() : getPyFun_("gtest_input.mem_reuse.TestMemReuseAllocator", true) {}
  void SetUp() {}

 public:
  UT::PyFuncGraphFetcher getPyFun_;
};

KernelDefPtr GetNewKernelDef(const std::vector<KernelRefCountPtr> &inputs,
                             const std::vector<KernelRefCountPtr> &outputs, uint32_t stream_id) {
  auto kernel_def = std::make_shared<KernelDef>();
  kernel_def->set_input_refs(inputs);
  kernel_def->set_output_refs(outputs);
  kernel_def->set_stream_id(stream_id);
  return kernel_def;
}

void InitMemReuseUtils(MemReuseUtil *mem_reuse_util_ptr) {
  // tensor params: ref_count, offset, size, index,
  auto tensor_0 = std::make_shared<KernelRefCount>();
  tensor_0->index_ = 0;
  tensor_0->size_ = 512;
  tensor_0->ref_count_ = 999;
  ASSERT_NE(tensor_0, nullptr);
  auto tensor_1 = std::make_shared<KernelRefCount>();
  tensor_1->index_ = 1;
  tensor_1->size_ = 1024;
  tensor_1->ref_count_ = 1;
  auto tensor_2 = std::make_shared<KernelRefCount>();
  tensor_2->index_ = 2;
  tensor_2->size_ = 1024;
  tensor_2->ref_count_ = 2;
  auto tensor_3 = std::make_shared<KernelRefCount>();
  tensor_3->index_ = 3;
  tensor_3->size_ = 32;
  tensor_3->ref_count_ = 1;
  auto tensor_4 = std::make_shared<KernelRefCount>();
  tensor_4->index_ = 4;
  tensor_4->size_ = 2048;
  tensor_4->ref_count_ = 1;
  auto tensor_5 = std::make_shared<KernelRefCount>();
  tensor_5->index_ = 5;
  tensor_5->size_ = 256;
  tensor_5->ref_count_ = 1;
  MS_LOG(INFO) << "init all tensor info success.";

  std::vector<KernelRefCountPtr> inputs;
  std::vector<KernelRefCountPtr> outputs;
  inputs = {tensor_0};
  outputs = {tensor_1};
  auto kernel0 = GetNewKernelDef(inputs, outputs, 0);
  inputs = {tensor_1};
  outputs = {tensor_2};
  auto kernel1 = GetNewKernelDef(inputs, outputs, 0);
  inputs = {tensor_2};
  outputs = {tensor_3};
  auto kernel2 = GetNewKernelDef(inputs, outputs, 0);
  inputs = {tensor_2, tensor_3};
  outputs = {tensor_4};
  auto kernel3 = GetNewKernelDef(inputs, outputs, 0);
  inputs = {tensor_4};
  outputs = {tensor_5};
  auto kernel4 = GetNewKernelDef(inputs, outputs, 1);
  MS_LOG(INFO) << "init all op info success.";
  std::vector<KernelRefCountPtr> tensor_ptr_list{tensor_0, tensor_1, tensor_2, tensor_3, tensor_4, tensor_5};
  std::vector<KernelDefPtr> op_ptr_list{kernel0, kernel1, kernel2, kernel3, kernel4};

  mem_reuse_util_ptr->set_total_refs_list(tensor_ptr_list);
  mem_reuse_util_ptr->set_kernel_def_ptr_list(op_ptr_list);
}

TEST_F(TestMemReuseAllocator, mem_reuse_allocator) {
  MS_LOG(INFO) << "mem_resue_allocator UT";
  auto mem_reuse_util_ptr = std::make_shared<MemReuseUtil>();
  InitMemReuseUtils(mem_reuse_util_ptr.get());
  auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
  best_fit_mem_reuse->Reuse(mem_reuse_util_ptr.get());
  MS_LOG(INFO) << "run mem reuse success";
  size_t total_allocated_size = best_fit_mem_reuse->GetAllocatedSize();
  ASSERT_NE(total_allocated_size, 0);
}

TEST_F(TestMemReuseAllocator, mem_reuse_allocator_add_membuf) {
  auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
  auto tensor_desc = std::make_shared<KernelRefCount>();
  tensor_desc->SetKernelRefCountInfo(0, 1024, kDynamicRefCount);
  best_fit_mem_reuse->AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
  auto allocated_size = best_fit_mem_reuse->GetAllocatedSize();
  ASSERT_EQ(allocated_size, 1024);
}

TEST_F(TestMemReuseAllocator, mem_reuse_allocator_split_membuf) {
  auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
  auto tensor_0 = std::make_shared<KernelRefCount>();
  tensor_0->SetKernelRefCountInfo(0, 2048, kDynamicRefCount);
  best_fit_mem_reuse->AddNewMembufPtr(tensor_0.get(), kDynamicMem);

  auto tensor_1 = std::make_shared<KernelRefCount>();
  tensor_1->SetKernelRefCountInfo(1, 800, kDynamicRefCount);
  auto is_split = best_fit_mem_reuse->IsSplit(tensor_1->size_, tensor_0->size_);
  ASSERT_EQ(is_split, true);

  best_fit_mem_reuse->SplitMembuf(tensor_1.get(), 0);
  auto allocated_size = best_fit_mem_reuse->GetAllocatedSize();
  ASSERT_EQ(allocated_size, 2048);
}

TEST_F(TestMemReuseAllocator, mem_reuse_allocator_align) {
  auto best_fit_mem_reuse = std::make_shared<BestFitMemReuse>();
  auto size = best_fit_mem_reuse->AlignCommonMemorySize(510);
  ASSERT_EQ(size, 1024);
}
} // namespace memreuse
} // namespace mindspore
@@ -20,7 +20,6 @@
#include "backend/session/session_basic.h"
#include "backend/session/ascend_session.h"
#include "backend/optimizer/mem_reuse/kernel_refcount.h"
#include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
#include "runtime/device/kernel_info.h"
#include "backend/kernel_compiler/tbe/tbe_kernel_mod.h"
#include "frontend/operator/ops.h"
@@ -229,19 +228,6 @@ TEST_F(TestMemReuseWithPy, KernelRef) {
  ASSERT_NE(membuf_ptr, nullptr);
}

TEST_F(TestMemReuseWithPy, ReuseAssignDynamicMemory) {
  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<MemReuseUtil>();
  ASSERT_NE(mem_reuse_util_ptr, nullptr);
  auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>();
  ASSERT_NE(bestfit_mem_reuse, nullptr);
  bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get());
  auto total_size = bestfit_mem_reuse->GetAllocatedSize();
  ASSERT_EQ(total_size, 0);
  KernelGraphPtr kernel_graph = std::make_shared<KernelGraph>();
  bool ret = mem_reuse_util_ptr->InitDynamicKernelRef(kernel_graph.get());
  ASSERT_EQ(ret, true);
}

TEST_F(TestMemReuseWithPy, TestSetInfo) {
  KernelGraphPtr g = CreateKernelGraph();
  ASSERT_NE(g, nullptr);