From 5452e711b500c563e69fbd8d1f04dc7a91f2dfbe Mon Sep 17 00:00:00 2001 From: laiyongqiang Date: Thu, 22 Oct 2020 19:46:50 +0800 Subject: [PATCH] [MemOpt]Safe Optimized Memory Allocation Solver --- cmake/options.cmake | 1 - .../ccsrc/backend/optimizer/CMakeLists.txt | 1 + .../ccsrc/backend/optimizer/somas/somas.cc | 1360 +++++++++++++++++ .../ccsrc/backend/optimizer/somas/somas.h | 132 ++ .../backend/optimizer/somas/somas_node.cc | 46 + .../backend/optimizer/somas/somas_node.h | 78 + .../optimizer/somas/somas_solver_alg.cc | 260 ++++ .../optimizer/somas/somas_solver_alg.h | 177 +++ .../optimizer/somas/somas_solver_core.cc | 405 +++++ .../optimizer/somas/somas_solver_core.h | 101 ++ .../optimizer/somas/somas_solver_pre.cc | 209 +++ .../optimizer/somas/somas_solver_pre.h | 159 ++ .../backend/optimizer/somas/somas_stream.cc | 53 + .../backend/optimizer/somas/somas_stream.h | 61 + .../backend/optimizer/somas/somas_tensor.cc | 64 + .../backend/optimizer/somas/somas_tensor.h | 129 ++ .../ccsrc/runtime/device/kernel_runtime.cc | 37 +- .../ccsrc/runtime/device/memory_manager.cc | 44 + .../ccsrc/runtime/device/memory_manager.h | 8 +- 19 files changed, 3308 insertions(+), 17 deletions(-) create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas.cc create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas.h create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_node.cc create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_node.h create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_solver_alg.cc create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_solver_alg.h create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_solver_core.cc create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_solver_core.h create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.cc create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.h create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_stream.cc create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_stream.h create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_tensor.cc create mode 100644 mindspore/ccsrc/backend/optimizer/somas/somas_tensor.h diff --git a/cmake/options.cmake b/cmake/options.cmake index 25eb42d940..286f432460 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -47,7 +47,6 @@ endif() if (DEBUG_MODE) set(CMAKE_BUILD_TYPE "Debug") - add_compile_definitions(MEM_REUSE_DEBUG) else() set(CMAKE_BUILD_TYPE "Release") endif() diff --git a/mindspore/ccsrc/backend/optimizer/CMakeLists.txt b/mindspore/ccsrc/backend/optimizer/CMakeLists.txt index 4574e2952f..115c8a50fe 100644 --- a/mindspore/ccsrc/backend/optimizer/CMakeLists.txt +++ b/mindspore/ccsrc/backend/optimizer/CMakeLists.txt @@ -2,6 +2,7 @@ file(GLOB_RECURSE _PREACTIVATE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" "mem_reuse/*.cc" "pass/*.cc" + "somas/*.cc" ) if (ENABLE_D) diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas.cc b/mindspore/ccsrc/backend/optimizer/somas/somas.cc new file mode 100644 index 0000000000..2c7add77d6 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas.cc @@ -0,0 +1,1360 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include "backend/optimizer/somas/somas.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "backend/optimizer/somas/somas_node.h" +#include "backend/optimizer/somas/somas_solver_pre.h" +#include "backend/optimizer/somas/somas_stream.h" +#include "backend/optimizer/somas/somas_tensor.h" +#ifdef ENABLE_D +#include "runtime/device/ascend/ascend_stream_assign.h" +#endif +#include "backend/optimizer/common/helper.h" +#include "utils/ms_context.h" + +namespace mindspore { +namespace somas { +std::map tensor_type_name_map = {{kCommon, "Common"}, + {kOutputOnly, "OutputOnly"}, + {kWorkspace, "Workspace"}, + {kGetNextOutput, "GetNextOutput"}, + {kSummaryInput, "SummaryInput"}, + {kRefNodeInput, "RefNodeInput"}, + {kRefNodeOutput, "RefNodeOutput"}, + {kGap, "Gap"}, + {kUnknown, "Unknown"}}; + +bool Somas::Allocate(const session::KernelGraph *graph) { + auto ret = InitSomasTensors(graph); + if (!ret) { + MS_LOG(EXCEPTION) << "Somas Initialize Failed."; + } + + GenStatisticInfo(); + + // Computing Conflict pairs + MS_LOG(INFO) << "Start Computing Conflict Pairs"; + ComputeConflictPairs(); + MS_LOG(INFO) << "End Computing Conflict Pairs"; + + ret = Assign(graph); + if (!ret) { + MS_LOG(EXCEPTION) << "Somas Assign Failed."; + } + + return ret; +} + +bool Somas::InitSomasTensors(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + InitBasicInfo(graph); + GetNextOutputProcess(graph); + IndependentNodeOutputProcess(graph); + SummaryInputProcess(graph); + RefNodeProcess(graph); + UnReuseNodeProcess(graph); + GenContiguousList(graph); + + if (tensors_list_.empty()) { + MS_LOG(INFO) << "No Tensor from graph " << graph->graph_id(); + return true; + } + + MS_LOG(INFO) << "Created " << streams_list_.size() << " streams (" << streams_groups_.size() << " groups), " + << nodes_list_.size() << " nodes, " << tensors_list_.size() << " tensors, and " + << contiguous_tensors_list_.size() << " contiguous lists"; + + if (save_graphs_) { + std::string offline_file_path = + save_graphs_path_ + "/" + "somas_offline_log_" + std::to_string(graph->graph_id()) + ".ir"; + DumpOfflineIR(offline_file_path); + } + + return true; +} + +void Somas::InitSomasStreamAndNode(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + streams_list_ = {}; + nodes_list_ = {}; + size_t node_index = 0; + auto kernel_cnodes = graph->execution_order(); + for (const auto &kernel : kernel_cnodes) { + SomasStreamPtr stream; + auto stream_id = AnfAlgo::GetStreamId(kernel); + auto it = find_if(streams_list_.begin(), streams_list_.end(), + [stream_id](const SomasStreamPtr &s) { return s->GetId() == stream_id; }); + if (it == streams_list_.end()) { + stream = std::make_shared(stream_id); + streams_list_.push_back(stream); + } else { + stream = *it; + } + + // Node + NodeType type = kCommonNode; + if (AnfAlgo::IsCommunicationOp(kernel)) { + type = kCommunicationNode; + } + auto node = std::make_shared(node_index, type, stream); + MS_EXCEPTION_IF_NULL(node); + node->scope_full_name_ = kernel->fullname_with_scope(); + nodes_list_.push_back(node); + auto key = kernel.get(); + nodes_map_[key] = node; + node_index++; + } +} + +void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + tensors_list_ = {}; + size_t tensor_index = 0; + auto kernel_cnodes = graph->execution_order(); + for (const auto &kernel : kernel_cnodes) { + auto node = nodes_map_[kernel.get()]; + MS_EXCEPTION_IF_NULL(node); + auto stream = node->GetStream(); + MS_EXCEPTION_IF_NULL(stream); + + // Output Tensor + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + MS_EXCEPTION_IF_NULL(kernel_mod); + auto output_sizes = kernel_mod->GetOutputSizeList(); + for (const auto &size : output_sizes) { + auto output_tensor_index = tensor_index; + tensor_index++; + // Set all output tensor lifelong to true. + auto tensor = std::make_shared(output_tensor_index, node, stream, size, kLifeLongNone); + tensor->lifetime_.start_ = node->GetId(); + tensor->lifetime_.end_ = node->GetId(); + tensor->type_ = kOutputOnly; + tensors_list_.push_back(tensor); + tensors_map_[output_tensor_index] = tensor; + stream->tensors_.push_back(tensor); + node->tensors_.insert(tensor); + node->output_tensors_.push_back(tensor); + } + + // WorkSpace Tensor + auto workspace_sizes = kernel_mod->GetWorkspaceSizeList(); + for (const auto &size : workspace_sizes) { + auto workspace_tensor_index = tensor_index; + tensor_index++; + SomasTensorPtr tensor = std::make_shared(workspace_tensor_index, node, stream, size, kLifeLongNone); + tensor->type_ = kWorkspace; + tensor->lifetime_.start_ = node->GetId(); + tensor->lifetime_.end_ = node->GetId(); + tensors_list_.push_back(tensor); + tensors_map_[workspace_tensor_index] = tensor; + stream->tensors_.push_back(tensor); + node->tensors_.insert(tensor); + node->workspace_tensors_.push_back(tensor); + } + } +} + +void Somas::InitSomasInputTensors(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + bool is_all_nop_node = opt::IsAllNopNode(graph); + auto kernel_cnodes = graph->execution_order(); + for (const auto &kernel : kernel_cnodes) { + auto node = nodes_map_[kernel.get()]; + MS_EXCEPTION_IF_NULL(node); + auto stream = node->GetStream(); + MS_EXCEPTION_IF_NULL(stream); + + // Input Tensor + auto input_tensor_num = AnfAlgo::GetInputTensorNum(kernel); + for (size_t i = 0; i < input_tensor_num; i++) { + auto input_node = kernel->input(i + 1); + session::KernelWithIndex prenode_index; + if (is_all_nop_node) { + prenode_index = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false); + } else { + prenode_index = AnfAlgo::VisitKernelWithReturnType(input_node, 0, true); + } + if (AnfAlgo::CheckPrimitiveType(prenode_index.first, prim::kPrimMakeTuple)) { + MS_LOG(EXCEPTION) << "Input node [" << input_node->DebugString() << "]'s input " << i << " is MakeTuple"; + } + + if (!AnfAlgo::IsRealCNodeKernel(prenode_index.first)) { + MS_LOG(DEBUG) << "Input [" << prenode_index.first->fullname_with_scope() << "] is not a real cnode kernel."; + continue; + } + + auto iter = nodes_map_.find(prenode_index.first.get()); + if (iter == nodes_map_.end()) { + MS_LOG(EXCEPTION) << "Kernel[" << kernel->fullname_with_scope() << "]'s input " << i << " [" + << prenode_index.first->fullname_with_scope() << "] is not init."; + } + auto pre_somas_node = iter->second; + if (prenode_index.second > pre_somas_node->output_tensors_.size()) { + MS_LOG(EXCEPTION) << "Output index " << prenode_index.second << " exceed input node [" + << prenode_index.first->fullname_with_scope() << "]'s outputs size " + << pre_somas_node->output_tensors_.size(); + } + auto input_somas_tensor = pre_somas_node->output_tensors_[prenode_index.second]; + MS_EXCEPTION_IF_NULL(input_somas_tensor); + node->input_tensors_.push_back(input_somas_tensor); + if (input_somas_tensor->type_ == kOutputOnly) { + input_somas_tensor->type_ = kCommon; + } + input_somas_tensor->destinations_.insert(node); + input_somas_tensor->destinationStreams_.insert(stream); + if (input_somas_tensor->lifetime_.end_ < node->GetId()) { + input_somas_tensor->lifetime_.end_ = node->GetId(); + } + + if (node != pre_somas_node) { + node->ancestor_nodes_.insert(pre_somas_node); + } + auto input_tensor_stream = input_somas_tensor->GetSourceStream(); + if (input_tensor_stream != stream) { + stream->ancestor_streams_.insert(input_tensor_stream); + input_somas_tensor->between_streams_ = true; + } + } + } +} + +void Somas::InitBasicInfo(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); +#ifdef ENABLE_D + streams_groups_ = device::ascend::AscendStreamAssign::GetInstance().get_stream_group(); +#endif + InitSomasStreamAndNode(graph); + InitSomasOutputAndWorkspaceTensors(graph); + InitSomasInputTensors(graph); + + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + save_graphs_ = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + save_graphs_path_ = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); + if (save_graphs_path_.empty()) { + save_graphs_path_ = "."; + } + if (save_graphs_) { + std::string file_path = save_graphs_path_ + "/" + "somas_basic_info_" + std::to_string(graph->graph_id()) + ".ir"; + DumpSomasBasicIR(file_path); + } +} + +void Somas::GetNextOutputProcess(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + auto kernel_cnodes = graph->execution_order(); + size_t total_size = 0; + for (const auto &kernel : kernel_cnodes) { + if (AnfAlgo::GetCNodeName(kernel) != kGetNextOpName) { + continue; + } + auto iter = nodes_map_.find(kernel.get()); + if (iter != nodes_map_.end()) { + auto getnext_output_tensors = iter->second->output_tensors_; + for (auto &tensor : getnext_output_tensors) { + tensor->offset_ = total_size; + total_size += tensor->GetAlignedSize(); + tensor->lifelong_value_ = kLifeLongGraphAll; + tensor->type_ = kGetNextOutput; + } + } + } + + this->get_next_size_ = total_size; + + MS_LOG(INFO) << "Special Tensor total size: GetNext Output " << total_size; +} + +void Somas::IndependentNodeOutputProcess(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + auto kernel_cnodes = graph->execution_order(); + size_t total_size = 0; + for (const auto &kernel : kernel_cnodes) { + bool independent = AnfAlgo::IsIndependentNode(kernel); + if (!independent) { + continue; + } + auto iter = nodes_map_.find(kernel.get()); + if (iter != nodes_map_.end()) { + auto semi_reuse_output_tensors = iter->second->output_tensors_; + for (auto &tensor : semi_reuse_output_tensors) { + total_size += tensor->GetAlignedSize(); + tensor->aligned_size_ = 0; + } + } + } + + MS_LOG(INFO) << "Special Tensor total size: Independent Node output " << total_size; + + if (save_graphs_ && total_size) { + std::string file_path = + save_graphs_path_ + "/" + "Independent_node_process_" + std::to_string(graph->graph_id()) + ".ir"; + DumpSomasBasicIR(file_path); + } +} + +void Somas::SummaryInputProcess(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + bool summary_exist = graph->summary_node_exist(); + if (!summary_exist) { + return; + } + + auto summary_nodes = graph->summary_nodes(); + if (summary_nodes.empty()) { + return; + } + + size_t total_summary_size = 0; + for (auto &node_item : summary_nodes) { + auto node = node_item.second.first; + size_t index = IntToSize(node_item.second.second); + auto iter = nodes_map_.find(node.get()); + if (iter != nodes_map_.end()) { + auto input_node = iter->second; + if (index < input_node->output_tensors_.size()) { + auto tensor = iter->second->output_tensors_[index]; + tensor->lifelong_value_ = kLifeLongGraphEnd; + tensor->type_ = kSummaryInput; + total_summary_size += tensor->GetAlignedSize(); + MS_LOG(INFO) << "Set summary node input tensor's lifelong, node: " << node->fullname_with_scope() + << " index: " << index; + } else { + MS_LOG(WARNING) << "Index exceed size, node " << node->fullname_with_scope() << " index: " << index + << " size: " << input_node->output_tensors_.size(); + } + } else { + MS_LOG(WARNING) << "Can't find summary input node " << node->fullname_with_scope() << " index: " << index; + } + } + + MS_LOG(INFO) << "Special Tensor total size: SummaryNodes: " << total_summary_size; + + if (save_graphs_) { + std::string file_path = + save_graphs_path_ + "/" + "somas_summary_process_" + std::to_string(graph->graph_id()) + ".ir"; + DumpSomasBasicIR(file_path); + } +} + +void Somas::RefNodeProcess(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + auto kernel_cnodes = graph->execution_order(); + size_t total_output_size = 0; + size_t total_input_size = 0; + for (const auto &kernel : kernel_cnodes) { + auto kernel_mod = AnfAlgo::GetKernelMod(kernel); + if (kernel_mod == nullptr) { + MS_LOG(WARNING) << "Kernel mode is NULL Of " << kernel->fullname_with_scope(); + } + auto output_sizes = kernel_mod->GetOutputSizeList(); + size_t output_index = 0; + for (const auto &size : output_sizes) { + auto out_index = output_index; + output_index++; + session::AnfWithOutIndex out_pair(kernel, out_index); + if (graph->IsInRefOutputMap(out_pair)) { + auto origin_pair = graph->GetRefCorrespondOutput(out_pair); + MS_EXCEPTION_IF_NULL(origin_pair.first); + auto output_tensor = nodes_map_[kernel.get()]->output_tensors_[out_index]; + MS_EXCEPTION_IF_NULL(output_tensor); + output_tensor->type_ = kRefNodeOutput; + total_output_size += size; + + if (AnfAlgo::IsRealCNodeKernel(origin_pair.first)) { + auto ori_node = origin_pair.first->cast(); + auto ori_index = origin_pair.second; + auto input_tensor = nodes_map_[ori_node.get()]->output_tensors_[ori_index]; + MS_EXCEPTION_IF_NULL(input_tensor); + input_tensor->type_ = kRefNodeInput; + total_input_size += input_tensor->aligned_size_; + std::vector refnode_input_output; + refnode_input_output.push_back(input_tensor->GetId()); + refnode_input_output.push_back(output_tensor->GetId()); + ref_node_constraints_.push_back(refnode_input_output); + MS_LOG(INFO) << "RefNode: input " << input_tensor->GetId() << " output " << output_tensor->GetId(); + } + } + } + } + + MS_LOG(INFO) << "Special Tensor total size: RefNode: input " << total_input_size << " output " << total_output_size; + + if (save_graphs_ && (total_input_size || total_output_size)) { + std::string file_path = + save_graphs_path_ + "/" + "somas_refnode_process_" + std::to_string(graph->graph_id()) + ".ir"; + DumpSomasBasicIR(file_path); + } +} + +void Somas::UnReuseNodeProcess(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + vector full_name_list = {}; + if (full_name_list.size() == 0) { + return; + } + + auto kernel_cnodes = graph->execution_order(); + for (const auto &kernel : kernel_cnodes) { + auto full_name = kernel->fullname_with_scope(); + auto iter = std::find(full_name_list.begin(), full_name_list.end(), full_name); + if (iter != full_name_list.end()) { + MS_LOG(INFO) << "Set UnReuse Node in somas, Node:" << full_name; + auto key = kernel.get(); + auto somas_node = nodes_map_[key]; + // input + auto inputs = somas_node->input_tensors_; + for (auto &input : inputs) { + input->lifelong_value_ = kLifeLongGraphAll; + } + + // output + auto outputs = somas_node->output_tensors_; + MS_LOG(INFO) << "Output size of " << kernel->fullname_with_scope() << " is " << outputs.size(); + for (auto &output : outputs) { + output->lifelong_value_ = kLifeLongGraphAll; + } + + // workspace + auto workspaces = somas_node->workspace_tensors_; + for (auto &workspace : workspaces) { + workspace->lifelong_value_ = kLifeLongGraphAll; + } + } + } + + if (save_graphs_) { + std::string file_path = + save_graphs_path_ + "/" + "somas_unreuse_node_process_" + std::to_string(graph->graph_id()) + ".ir"; + DumpSomasBasicIR(file_path); + } +} + +SomasTensorPtr Somas::CreateGapTensor(size_t gap_tensor_id) { + // real size 512 and lifelong_ + const size_t gap_size = 512; + auto gap_tensor = std::make_shared(gap_tensor_id++, nullptr, nullptr, gap_size, kLifeLongGraphAll); + gap_tensor->type_ = kGap; + gap_tensor->lifetime_.start_ = 0; + gap_tensor->lifetime_.end_ = 0xffff; + tensors_map_[gap_tensor->GetId()] = gap_tensor; + tensors_list_.push_back(gap_tensor); + return gap_tensor; +} + +void Somas::GenContiguousList(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + size_t gap_tensor_id = tensors_list_.size(); + for (const auto &node : nodes_list_) { + MS_EXCEPTION_IF_NULL(node); + if (node->GetType() != kCommunicationNode) { + continue; + } + std::vector inputs; + auto input_before_gap = CreateGapTensor(gap_tensor_id); + gap_tensor_id++; + inputs.push_back(input_before_gap->GetId()); + + for (const auto &input_tensor : node->input_tensors_) { + comm_input_total_size_ += input_tensor->aligned_size_; + inputs.push_back(input_tensor->GetId()); + } + + auto input_after_gap = CreateGapTensor(gap_tensor_id); + gap_tensor_id++; + inputs.push_back(input_after_gap->GetId()); + contiguous_tensors_list_.push_back(inputs); + + std::vector outputs; + auto output_before_gap = CreateGapTensor(gap_tensor_id); + gap_tensor_id++; + outputs.push_back(output_before_gap->GetId()); + + for (const auto &output_tensor : node->output_tensors_) { + output_tensor->lifelong_value_ = kLifeLongGraphStart; + comm_output_total_size_ += output_tensor->aligned_size_; + outputs.push_back(output_tensor->GetId()); + } + + auto output_after_gap = CreateGapTensor(gap_tensor_id); + gap_tensor_id++; + outputs.push_back(output_after_gap->GetId()); + contiguous_tensors_list_.push_back(outputs); + } +} + +void Somas::PreprocessingConflicts() { + // Compute ancestor streams + for (auto stream : streams_list_) { + stream->ComputeAncestorStreams(); + } + + // Preset ancestor streams for node + for (auto node : nodes_list_) { + node->PresetAncestorStreams(streams_list_); + } + + // Compute ancestor nodes : needs to be executed in topological order + for (auto node : nodes_list_) { + node->ComputeAncestorNodes(); + } + + // Compute MaxDestinationId for between-stream tensors + for (auto tensor : tensors_list_) { + if (tensor->IsBetweenStreams()) { + tensor->ComputeMaxDestinationId(); + } + } + + // Preprocessing for stream groups + for (auto group : streams_groups_) { + vector previous_streams; + for (int64_t stream_id : group) { + auto it = std::find_if(streams_list_.begin(), streams_list_.end(), + [stream_id](const SomasStreamPtr &stream) { return stream->GetId() == stream_id; }); + if (it != streams_list_.end()) { + for (auto stream : previous_streams) { + (*it)->ancestor_streams_group_.insert(stream); + } + previous_streams.push_back(*it); + } + } + } + + // Atomic: fix any issues on saved lifetimes of tensors + for (auto tensor : tensors_list_) { + MS_EXCEPTION_IF_NULL(tensor); + if (tensor->IsGap()) { + continue; + } + for (auto node : tensor->destinations_) { + MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(tensor->GetSourceNode()); + if (tensor->GetSourceNode()->GetId() > node->GetId()) { + tensor->lifetime_.start_ = node->GetId(); + } + } + MS_EXCEPTION_IF_NULL(tensor->GetSourceNode()); + if (tensor->GetSourceNode()->GetId() > tensor->lifetime_.end_) { + tensor->lifetime_.end_ = tensor->GetSourceNode()->GetId(); + } + } +} + +static bool LifetimeOverlap(lifetime_t lifetime1, lifetime_t lifetime2) { + size_t start1 = std::min(lifetime1.start_, lifetime1.end_); + size_t end1 = std::max(lifetime1.start_, lifetime1.end_); + size_t start2 = std::min(lifetime2.start_, lifetime2.end_); + size_t end2 = std::max(lifetime2.start_, lifetime2.end_); + return (std::max(end1, end2) - std::min(start1, start2) <= end2 - start2 + end1 - start1); +} + +static bool Subset(std::set streamSet1, std::set streamSet2) { + for (auto stream : streamSet1) { + if (streamSet2.count(stream) == 0) { + return false; + } + } + return true; +} + +static void EraseSet(std::set *streamSet, std::set removeStreamsSet) { + for (auto stream : removeStreamsSet) { + streamSet->erase(stream); + } +} + +static bool ValidSubset(std::set destStreams, std::set ancestorsAndSelf, + SomasTensorPtr ancestorTensor, SomasTensorPtr tensor) { + MS_EXCEPTION_IF_NULL(ancestorTensor); + MS_EXCEPTION_IF_NULL(tensor); + for (auto stream : destStreams) { + if (ancestorsAndSelf.count(stream) == 0) { + return false; + } + + if (stream != tensor->GetSourceStream()) { + MS_EXCEPTION_IF_NULL(tensor->GetSourceStream()); + if (tensor->GetSourceStream()->ancestor_streams_group_.count(stream) == 0 && + ancestorTensor->max_destination_id_[stream] > + tensor->GetSourceNode()->anc_stream_max_order_[stream->GetId()]) { + return false; + } + } else { // stream == tensor->GetSourceStream() + if (ancestorTensor->max_destination_id_[stream] >= tensor->lifetime_.start_) { + return false; + } + } + } + return true; +} + +void Somas::ComputeConflictPairs() { + if (tensors_list_.empty()) { + MS_LOG(INFO) << "No Tensor for Conflict computing"; + return; + } + + MS_LOG(INFO) << "Start Preprocessing Conflicts"; + PreprocessingConflicts(); + MS_LOG(INFO) << "End Preprocessing Conflicts"; + + MS_LOG(INFO) << "Start Array Initialization"; + cannot_reuse_ = + std::make_shared(tensors_list_.back()->GetId() + 1, + tensors_list_.back()->GetId() + 1); // array size is (max_id + 1) x (max_id + 1) + MS_LOG(INFO) << "End Array Initialization"; + + MS_LOG(INFO) << "Start Conflict Computing"; + + size_t count_reuse = 0; + + // Loop for ancestor stream groups reuse + for (auto stream : streams_list_) { + std::set ancestors = stream->ancestor_streams_group_; + + std::set ancestors_and_self = ancestors; + ancestors_and_self.insert(stream); + + for (auto ancestor_stream : ancestors) { + for (auto ancestor_tensor : ancestor_stream->tensors_) { + if (ancestor_tensor->GetAlignedSize() == 0) continue; + if (ancestor_tensor->IsLifelong()) continue; + if (ancestor_tensor->IsSemiLifelongEnd()) continue; + if (ancestor_tensor->IsRefOverlap()) continue; + + if (!ancestor_tensor->IsBetweenStreams() || Subset(ancestor_tensor->destinationStreams_, ancestors)) { + for (auto tensor : stream->tensors_) { + if (tensor->IsGap()) continue; + if (tensor->GetAlignedSize() == 0) continue; + if (tensor->IsLifelong()) continue; + if (tensor->IsSemiLifelongStart()) continue; + if (tensor->IsRefOverlap()) continue; + + (*cannot_reuse_)(ancestor_tensor->GetId(), tensor->GetId()) = 0; + (*cannot_reuse_)(tensor->GetId(), ancestor_tensor->GetId()) = 0; + count_reuse++; + } + } else { + for (auto tensor : stream->tensors_) { + if (Subset(ancestor_tensor->destinationStreams_, ancestors_and_self) && + ancestor_tensor->max_destination_id_[tensor->GetSourceStream()] < tensor->lifetime_.start_) { + if (tensor->IsGap()) continue; + if (tensor->GetAlignedSize() == 0) continue; + if (tensor->IsLifelong()) continue; + if (tensor->IsSemiLifelongStart()) continue; + if (tensor->IsRefOverlap()) continue; + + (*cannot_reuse_)(ancestor_tensor->GetId(), tensor->GetId()) = 0; + (*cannot_reuse_)(tensor->GetId(), ancestor_tensor->GetId()) = 0; + count_reuse++; + } + } + } + } + } + } + + // Loop for ancestor streams (no groups) + for (auto stream : streams_list_) { + auto ancestors_no_groups = stream->ancestor_streams_; + EraseSet(&ancestors_no_groups, stream->ancestor_streams_group_); + + for (auto ancestor_stream : ancestors_no_groups) { + for (auto ancestor_tensor : ancestor_stream->tensors_) { + if (ancestor_tensor->GetAlignedSize() == 0) continue; + if (ancestor_tensor->IsLifelong()) continue; + if (ancestor_tensor->IsSemiLifelongEnd()) continue; + if (ancestor_tensor->IsRefOverlap()) continue; + + if (!ancestor_tensor->IsBetweenStreams()) { + for (auto tensor : stream->tensors_) { + if (tensor->IsGap()) continue; + if (tensor->GetAlignedSize() == 0) continue; + if (tensor->IsLifelong()) continue; + if (tensor->IsSemiLifelongStart()) continue; + if (tensor->IsRefOverlap()) continue; + + size_t max_ancestor_order = tensor->GetSourceNode()->anc_stream_max_order_[ancestor_stream->GetId()]; + + if (ancestor_tensor->lifetime_.end_ <= max_ancestor_order) { + (*cannot_reuse_)(ancestor_tensor->GetId(), tensor->GetId()) = 0; + (*cannot_reuse_)(tensor->GetId(), ancestor_tensor->GetId()) = 0; + count_reuse++; + } + } + } else { // ancestor tensor goes to another stream (might go to same stream also) + std::set dest_streams = ancestor_tensor->destinationStreams_; + std::set ancestors = stream->ancestor_streams_; + + std::set ancestors_and_self = ancestors; + ancestors_and_self.insert(stream); + + for (auto tensor : stream->tensors_) { + if (tensor->IsGap()) continue; + if (tensor->GetAlignedSize() == 0) continue; + if (tensor->IsLifelong()) continue; + if (tensor->IsSemiLifelongStart()) continue; + if (tensor->IsRefOverlap()) continue; + + if (ValidSubset(dest_streams, ancestors_and_self, ancestor_tensor, tensor)) { + (*cannot_reuse_)(ancestor_tensor->GetId(), tensor->GetId()) = 0; + (*cannot_reuse_)(tensor->GetId(), ancestor_tensor->GetId()) = 0; + count_reuse++; + } + } + } + } + } + } + + // Loop for same stream + for (auto stream : streams_list_) { + MS_EXCEPTION_IF_NULL(stream); + for (auto tensor1 : stream->tensors_) { + if (tensor1->GetAlignedSize() == 0) continue; + if (tensor1->IsLifelong()) continue; + if (tensor1->IsRefOverlap()) continue; + + for (auto tensor2 : stream->tensors_) { + if (tensor2->GetId() >= tensor1->GetId()) + break; // keep only when tensors kept sorted in tensors-vector of each stream, otherwise remove + + if (tensor2->GetAlignedSize() == 0) continue; + if (tensor2->IsLifelong()) continue; + if (tensor2->IsRefOverlap()) continue; + + // Between streams extra safety + if (tensor1->IsBetweenStreams() && tensor2->IsBetweenStreams()) continue; + + // Check lifetime overlap + lifetime_t lifetime1 = tensor1->lifetime_; + lifetime_t lifetime2 = tensor2->lifetime_; + + if (!LifetimeOverlap(lifetime1, lifetime2)) { + // Between-streams extra safety + if (tensor1->IsBetweenStreams() && lifetime1.end_ < lifetime2.start_) continue; + if (tensor2->IsBetweenStreams() && lifetime2.end_ < lifetime1.start_) continue; + + // Semi-lifelong extra safety + if (lifetime1.end_ < lifetime2.start_ && (tensor2->IsSemiLifelongStart() || tensor1->IsSemiLifelongEnd())) + continue; + if (lifetime2.end_ < lifetime1.start_ && (tensor1->IsSemiLifelongStart() || tensor2->IsSemiLifelongEnd())) + continue; + + // If arrived here, allow reuse + (*cannot_reuse_)(tensor2->GetId(), tensor1->GetId()) = 0; + (*cannot_reuse_)(tensor1->GetId(), tensor2->GetId()) = 0; + count_reuse++; + } + } + } + } + + MS_LOG(INFO) << "End Conflict Computing"; + MS_LOG(INFO) << "Found " << count_reuse << " tensor pairs of allowed reusability"; +} + +bool Somas::Assign(const session::KernelGraph *graph) { + if (tensors_list_.empty()) { + MS_LOG(INFO) << "No Tensor for Assigner"; + return true; + } + + // Ref Node Preprocessing + MS_LOG(INFO) << "Start Solving Preprocessing for Ref Node"; + for (auto ref_node_list : ref_node_constraints_) { + // Keep all constraints for first tensor in list + size_t tid_0 = ref_node_list[0]; + for (auto tensor : tensors_list_) { + if ((*cannot_reuse_)(tid_0, tensor->GetId()) == 1) continue; + for (size_t tid : ref_node_list) { + if ((*cannot_reuse_)(tid, tensor->GetId()) == 1) { + (*cannot_reuse_)(tid_0, tensor->GetId()) = 1; + (*cannot_reuse_)(tensor->GetId(), tid_0) = 1; + break; + } + } + } + // Set rest to size 0, so that solver ignores them + for (size_t i = 1; i < ref_node_list.size(); ++i) { + tensors_map_[ref_node_list[i]]->aligned_size_ = 0; + } + } + MS_LOG(INFO) << "End Solving Preprocessing for Ref Node"; + + // Ref Overlap Preprocessing + MS_LOG(INFO) << "Start Solving Preprocessing for Ref Overlap"; + // In ConflictComputing(), by use of ref_overlap_ flag, each tensor in a ref_overlap_list has all entries 1 in + // cannot_reuse_ array Here, we allow reuse only among tensors in same list + for (auto ref_overlap_list : ref_overlap_constraints_) { + for (size_t tid_1 : ref_overlap_list) { + for (size_t tid_2 : ref_overlap_list) { + (*cannot_reuse_)(tid_1, tid_2) = 0; + (*cannot_reuse_)(tid_2, tid_1) = 0; + } + } + } + MS_LOG(INFO) << "End Solving Preprocessing for Ref Overlap"; + + // Compute number of constraints for each tensor + for (auto tensor1 : tensors_list_) { + size_t count_constraints = 0; + for (auto tensor2 : tensors_list_) { + if ((*cannot_reuse_)(tensor1->GetId(), tensor2->GetId()) == 1) { + count_constraints++; + } + } + tensor1->num_constraints_ = count_constraints; + } + + // Solver info -- moved here because we set sizes to zero in ref node preprocessing (was before in GetSomasTensors()) + MS_LOG(INFO) << "Start Loop to create solver info"; + for (auto tensor : tensors_list_) { + if (tensor->GetSolverTensorDesc() != nullptr && tensor->type_ != kGetNextOutput) { + SomasSolverTensorDescPtr pSolverTensor = tensor->GetSolverTensorDesc(); + solver_tensor_desc_list_.insert( + std::pair(pSolverTensor->index_, pSolverTensor)); + } + } + MS_LOG(INFO) << "End Loop to create solver info"; + + MS_LOG(INFO) << "Start Solving"; + if (solver_tensor_desc_list_.empty()) { + MS_LOG(INFO) << "solver_tensor_desc_list is empty."; + return true; + } + + somas_solver_ = std::make_shared(); + somas_solver_->Solving(graph, &solver_tensor_desc_list_, cannot_reuse_, contiguous_tensors_list_, false); + MS_LOG(INFO) << "End Solving"; + + // Update solver_tensor_desc offset to tensors list + for (const auto &tensor : tensors_list_) { + tensor->SetOffset(get_next_size_); + } + + // Ref Node Postprocessing + MS_LOG(INFO) << "\nStart Solving Postprocessing for Ref Node"; + // Set offset for rest of ref node list (ignored by solver due to ref node preprocessing) + for (auto ref_node_list : ref_node_constraints_) { + for (size_t i = 1; i < ref_node_list.size(); ++i) { + tensors_map_[ref_node_list[i]]->offset_ = tensors_map_[ref_node_list[0]]->offset_; + } + } + MS_LOG(INFO) << "\nEnd Solving Postprocessing for Ref Node"; + + // Set mem_offset_ value by solver result + mem_offset_ = static_cast(somas_solver_->GetMaxOffset()) + get_next_size_; + + if (save_graphs_) { + std::string mem_pool_file_path = + save_graphs_path_ + "/" + "somas_mem_pool_info_" + std::to_string(graph->graph_id()) + ".ir"; + DumpSomasMemoryPoolInfoIR(mem_pool_file_path); + } + return true; +} + +std::string Somas::GetSplitName(const std::string &scope_name) const { + auto indx = scope_name.rfind('/'); + if (indx == std::string::npos) { + return scope_name; + } else { + if (indx < scope_name.size() - 1) { + auto split_name = scope_name.substr(indx + 1); + return split_name; + } + return scope_name; + } +} + +void Somas::DumpSomasBasicIR(const string filename) { + if (filename.size() > PATH_MAX) { + MS_LOG(ERROR) << "File path " << filename << " is too long."; + return; + } + char real_path[PATH_MAX] = {0}; +#if defined(_WIN32) || defined(_WIN64) + if (_fullpath(real_path, filename.c_str(), PATH_MAX) == nullptr) { + MS_LOG(DEBUG) << "dir " << filename << " does not exit."; + } +#else + if (realpath(filename.c_str(), real_path) == nullptr) { + MS_LOG(DEBUG) << "Dir " << filename << " does not exit."; + } +#endif + + std::string path_string = real_path; + ChangeFileMode(path_string, S_IRWXU); + std::ofstream ofs(real_path); + + if (!ofs.is_open()) { + MS_LOG(ERROR) << "Open dump file '" << real_path << "' failed!"; + return; + } + ofs << "All Tensors:\n\n"; + ofs << "index:" + << "\tsize:" + << "\treal_size:" + << "\toffset:" + << "\taddr:" + << "\ttype:" + << "\tlifelong:\n"; + + for (const auto &tensor : tensors_list_) { + ofs << "%" << tensor->GetId() << "T" + << "\t" + << "#" << tensor->GetAlignedSize() << "S" + << "\t" + << "#" << tensor->GetOriginalSize() << "S" + << "\t" + << "&" << tensor->GetOffset() << "" + << "\t" + << "&" << static_cast(tensor->GetOffset() + mem_base_addr_) << "\t" + << tensor_type_name_map[tensor->type_] << "\t" << tensor->IsLifelong() << "\n"; + } + + ofs << "\n\nAll Nodes:\n\n"; + for (const auto &node : nodes_list_) { + auto scope_name = node->scope_full_name_; + std::string split_name = GetSplitName(scope_name); + ofs << "$" << node->GetId() << "\t" << split_name << "\t" << static_cast(node->GetType()) << "\t"; + ofs << "inputs["; + for (const auto &in : node->input_tensors_) { + ofs << "%" << in->GetId() << "T" + << ", "; + } + ofs << "]"; + ofs << "\toutputs["; + for (const auto &out : node->output_tensors_) { + ofs << "%" << out->GetId() << "T" + << ", "; + } + ofs << "]"; + ofs << "\tworkspace["; + for (const auto &wk : node->workspace_tensors_) { + ofs << "%" << wk->GetId() << "T" + << ", "; + } + ofs << "]"; + ofs << "\tstreamID[" + << "@" << node->GetStream()->GetId() << "]\n"; + } + + ofs << "\n\nAll Stream Groups:\n\n"; + for (const auto &stream_group : streams_groups_) { + for (const auto &stream : stream_group) { + ofs << "stm" << stream << " "; + } + ofs << "\n"; + } + + ofs << "\n\nAll Ref Node Info:\n\n"; + for (const auto &ref_in_out : ref_node_constraints_) { + ofs << "refnode input-output:"; + for (const auto &item : ref_in_out) { + ofs << "%" << item << "T "; + } + ofs << "\n"; + } +} + +void Somas::DumpOfflineIR(const string filename) { + MS_LOG(INFO) << "Printing somas-log-from-graph log: " << filename; + if (filename.size() > PATH_MAX) { + MS_LOG(ERROR) << "File path " << filename << " is too long."; + return; + } + char real_path[PATH_MAX] = {0}; +#if defined(_WIN32) || defined(_WIN64) + if (_fullpath(real_path, filename.c_str(), PATH_MAX) == nullptr) { + MS_LOG(DEBUG) << "dir " << filename << " does not exit."; + } +#else + if (realpath(filename.c_str(), real_path) == nullptr) { + MS_LOG(DEBUG) << "Dir " << filename << " does not exit."; + } +#endif + + std::string path_string = real_path; + ChangeFileMode(path_string, S_IRWXU); + std::ofstream ofs(real_path); + + if (!ofs.is_open()) { + MS_LOG(ERROR) << "Open dump file '" << real_path << "' failed!"; + return; + } + + for (auto tensor : tensors_list_) { + if (tensor->IsGap()) continue; + if (tensor->IsOutputOnly()) { + ofs << "Somas EDGE ERROR src=n" << tensor->GetSourceNode()->GetId() + << ", srcstm=" << tensor->GetSourceStream()->GetId() << ", dst=nc" + << ", dststm=nc" + << ", workspace=0, size=" << tensor->GetAlignedSize() + << ", lifelong=" << static_cast(tensor->lifelong_value_) << ", tid=" << tensor->GetId() + << ", start=" << tensor->lifetime_.start_ << ", end=" << tensor->lifetime_.end_ << std::endl; + } else { + std::map dest_infos; + for (SomasNodePtr dest_node : tensor->destinations_) { + dest_infos.insert(std::make_pair(dest_node->GetId(), dest_node->GetStream()->GetId())); + } + + for (auto dest_info : dest_infos) { + ofs << "Somas EDGE src=n" << tensor->GetSourceNode()->GetId() + << ", srcstm=" << tensor->GetSourceStream()->GetId() << ", dst=n" << dest_info.first + << ", dststm=" << dest_info.second << ", workspace=" << static_cast(tensor->type_ == kWorkspace) + << ", size=" << tensor->GetAlignedSize() << ", lifelong=" << static_cast(tensor->lifelong_value_) + << ", tid=" << tensor->GetId() << ", start=" << tensor->lifetime_.start_ + << ", end=" << tensor->lifetime_.end_ << std::endl; + } + } + } + for (vector tList : contiguous_tensors_list_) { + ofs << "Somas CONTIGUOUS "; + // ignore front and back gaps + for (size_t i = 1; i < tList.size() - 1; ++i) { + if (tensors_map_[tList[i]]->IsGap()) { + ofs << "INPUT"; + break; + } + if (i == tList.size() - 2) ofs << "OUTPUT"; + } + for (size_t tid : tList) { + if (tensors_map_[tid]->IsGap()) continue; + ofs << " " << tid; + } + ofs << std::endl; + } + for (const auto &group : streams_groups_) { + ofs << "Somas GROUP"; + for (int64_t sid : group) { + ofs << " " << sid; + } + ofs << std::endl; + } + ofs.close(); +} + +void Somas::DumpSomasMemoryIR(const string filename) { + if (filename.size() > PATH_MAX) { + MS_LOG(ERROR) << "File path " << filename << " is too long."; + return; + } + char real_path[PATH_MAX] = {0}; +#if defined(_WIN32) || defined(_WIN64) + if (_fullpath(real_path, filename.c_str(), PATH_MAX) == nullptr) { + MS_LOG(DEBUG) << "dir " << filename << " does not exit."; + } +#else + if (realpath(filename.c_str(), real_path) == nullptr) { + MS_LOG(DEBUG) << "Dir " << filename << " does not exit."; + } +#endif + + std::string path_string = real_path; + ChangeFileMode(path_string, S_IRWXU); + std::ofstream ofs(real_path); + + if (!ofs.is_open()) { + MS_LOG(ERROR) << "Open dump file '" << real_path << "' failed!"; + return; + } + + std::map mem_map; + for (auto tensor : tensors_list_) { + mem_map[tensor->GetOffset()] = 0; + } + + size_t num = 0; + for (auto iter = mem_map.begin(); iter != mem_map.end(); ++iter, ++num) { + iter->second = num; + } + + std::map> mem_list; + + for (const auto &tensor : tensors_list_) { + size_t key = tensor->offset_; + auto iter = mem_list.find(key); + if (iter == mem_list.end()) { + std::map id_tensor_map; + id_tensor_map[tensor->GetId()] = tensor; + mem_list[key] = id_tensor_map; + } else { + iter->second[tensor->GetId()] = tensor; + } + } + + ofs << "mem_id:" + << "\tstart_offset:" + << "\tend_offset:" + << "\ttensor_id:" + << "\torigin_size:" + << "\talign_size:" + << "\tstart_addr:" + << "\tend_addr:" + << "\ttype:" + << "\tsrc_node:" + << "\tsrc_stm_id:" + << "lifetime_start\t" + << "lifetime_end\n"; + + for (const auto &mem : mem_list) { + auto id_tensor_map = mem.second; + for (const auto &id_tensor : id_tensor_map) { + auto tensor = id_tensor.second; + std::string scope_name; + size_t src_stm_id = 0xffff; + if (tensor->GetSourceNode() != nullptr) { + scope_name = tensor->GetSourceNode()->scope_full_name_; + src_stm_id = tensor->GetSourceNode()->GetStream()->GetId(); + } else { + scope_name = "Somas Tensor"; + } + + std::string split_name = GetSplitName(scope_name); + ofs << "#" << mem_map[tensor->GetOffset()] << "\t" << tensor->GetOffset() << "\t" + << tensor->GetOffset() + tensor->GetAlignedSize() << "\t%" << tensor->GetId() << "T\t" + << tensor->GetOriginalSize() << "\t" << tensor->GetAlignedSize() << "\t&" + << static_cast(tensor->GetOffset() + mem_base_addr_) << "\t&" + << static_cast(tensor->GetOffset() + mem_base_addr_ + tensor->GetAlignedSize()) << "\t" + << tensor_type_name_map[tensor->type_] << "\t" << split_name << "\tstm" << src_stm_id << "\t" + << tensor->lifetime_.start_ << "\t" << tensor->lifetime_.end_ << "\n"; + } + } +} + +size_t Somas::CalcLowerBound() const { + size_t max_node_id = std::accumulate(tensors_list_.begin(), tensors_list_.end(), 0, [](size_t max_id, auto tensor) { + return std::max(max_id, tensor->lifetime_.end_); + }); + + std::map lifetime_lb; + for (size_t time = 0; time <= max_node_id; time++) { + lifetime_lb[time] = 0; + } + + size_t lower, upper; + for (auto tensor : tensors_list_) { + if (tensor->lifelong_value_ == kLifeLongGraphAll) { + lower = 0; + upper = max_node_id; + } else { + lower = tensor->lifetime_.start_; + upper = tensor->lifetime_.end_; + } + + for (size_t time = lower; time <= upper; time++) { + lifetime_lb[time] += tensor->GetAlignedSize(); + } + } + + size_t max_lifetime = 0; + for (size_t time = 0; time <= max_node_id; time++) { + if (max_lifetime < lifetime_lb[time]) { + max_lifetime = lifetime_lb[time]; + } + } + return max_lifetime; +} + +void Somas::DumpSomasMemoryPoolInfoIR(const string filename) { + if (filename.size() > PATH_MAX) { + MS_LOG(ERROR) << "File path " << filename << " is too long."; + return; + } + char real_path[PATH_MAX] = {0}; +#if defined(_WIN32) || defined(_WIN64) + if (_fullpath(real_path, filename.c_str(), PATH_MAX) == nullptr) { + MS_LOG(DEBUG) << "dir " << filename << " does not exit."; + } +#else + if (realpath(filename.c_str(), real_path) == nullptr) { + MS_LOG(DEBUG) << "Dir " << filename << " does not exit."; + } +#endif + + std::string path_string = real_path; + ChangeFileMode(path_string, S_IRWXU); + std::ofstream ofs(real_path); + + if (!ofs.is_open()) { + MS_LOG(ERROR) << "Open dump file '" << real_path << "' failed!"; + return; + } + + ofs << "Total Dynamic Size (Upper Bound):\t" << upper_bound_ << "\n" + << "Theoretical Optimal Size (Lower Bound):\t" << lower_bound_ << "\n" + << "Total Workspace Size:\t" << workspace_total_size_ << "\n" + << "Total Communication Input Tensor Size:\t" << comm_input_total_size_ << "\n" + << "Total Communication Output Tensor Size:\t" << comm_output_total_size_ << "\n" + << "Total LifeLong All Tensor Size:\t" << lifelong_all_total_size_ << "\n" + << "Total LifeLong Start Tensor Size:\t" << lifelong_start_total_size_ << "\n" + << "Total LifeLong End Tensor Size:\t" << lifelong_end_total_size_ << "\n" + << "Reused Size(Allocate Size):\t" << GetTotalMemSize() << "\n\n\n"; + + std::map mem_map; + for (auto tensor : tensors_list_) { + mem_map[tensor->GetOffset()] = 0; + } + + size_t num = 0; + for (auto iter = mem_map.begin(); iter != mem_map.end(); ++iter, ++num) { + iter->second = num; + } + + std::map tensor_mask; + for (size_t i = 0; i < tensors_list_.size(); ++i) { + tensor_mask[i] = false; + } + + std::vector order_tensors_list = tensors_list_; + std::sort(order_tensors_list.begin(), order_tensors_list.end(), + [](const SomasTensorPtr tensor1, const SomasTensorPtr tensor2) { + return tensor1->GetOffset() < tensor2->GetOffset(); + }); + + size_t cur_total_tensor_size = 0; + for (const auto &node : nodes_list_) { + if (node == nullptr) { + MS_LOG(WARNING) << "Node is NULL, No ir information output"; + continue; + } + ofs << "node_name: " << GetSplitName(node->scope_full_name_) << "\tnode_id: " << node->GetId() << "\n"; + ofs << "mem_id\t" + << "mem_head\t" + << "mem_tail\t" + << "node_id\t" + << "stream_id\t" + << "tensor_id\t" + << "tensor_type\t" + << "lifelong\t" + << "origin_size\t" + << "align_size\t" + << "source_node\t" + << "lifetime_start\t" + << "lifetime_end\t\n"; + + size_t cur_alive_tensor_size = 0; + size_t curr_runtime = node->GetId(); + for (size_t i = 0; i < order_tensors_list.size(); ++i) { + auto tensor = order_tensors_list[i]; + if (tensor->lifetime_.start_ <= curr_runtime && tensor->lifetime_.end_ >= curr_runtime) { + cur_alive_tensor_size += tensor->aligned_size_; + if (!tensor_mask[i]) { + cur_total_tensor_size += tensor->aligned_size_; + tensor_mask[i] = true; + } + std::string scope_name; + size_t src_node_id = 0xffff; + size_t tensor_stream_id = 0xffff; + if (tensor->GetSourceNode() != nullptr) { + scope_name = tensor->GetSourceNode()->scope_full_name_; + src_node_id = tensor->GetSourceNode()->GetId(); + tensor_stream_id = tensor->GetSourceNode()->GetId(); + } else { + scope_name = "Somas Tensor"; + } + std::string split_name = GetSplitName(scope_name); + + ofs << "&" << mem_map[tensor->GetOffset()] << "\t" << tensor->GetOffset() << "\t" + << tensor->GetOffset() + tensor->GetAlignedSize() << "\t" + << "\t#" << src_node_id << "\t@" << tensor_stream_id << "\t%" << tensor->GetId() << "T\t" + << tensor_type_name_map[tensor->type_] << "\t" << static_cast(tensor->lifelong_value_) << "\t" + << tensor->GetOriginalSize() << "\t" << tensor->GetAlignedSize() << "\t" + << "\t" << split_name << "\t" << tensor->lifetime_.start_ << "\t" << tensor->lifetime_.end_ << "\n"; + } + } + ofs << "Current Alive Tensor Size(Lower Bound):\t" << cur_alive_tensor_size << "\n" + << "Current Total Tensor Size(Upper Bound):\t" << cur_total_tensor_size << "\n\n"; + } + ofs.close(); +} + +void Somas::GenStatisticInfo() { + lower_bound_ = CalcLowerBound(); + for (const auto &tensor : tensors_list_) { + upper_bound_ += tensor->aligned_size_; + if (tensor->type_ == kWorkspace) { + workspace_total_size_ += tensor->aligned_size_; + } + if (tensor->lifelong_value_ == kLifeLongGraphAll) { + lifelong_all_total_size_ += tensor->aligned_size_; + } else if (tensor->lifelong_value_ == kLifeLongGraphStart) { + lifelong_start_total_size_ += tensor->aligned_size_; + } else if (tensor->lifelong_value_ == kLifeLongGraphEnd) { + lifelong_end_total_size_ += tensor->aligned_size_; + } + } + + const double giga = 1024. * 1024. * 1024.; + MS_LOG(INFO) << "Lower Bound: " << lower_bound_ << " (" << lower_bound_ / giga + << " GB), Upper Bound: " << upper_bound_ << " (" << upper_bound_ / giga << " GB)"; + + MS_LOG(INFO) << "\nTotal Dynamic Size (Upper Bound):\t" << upper_bound_ << "\n" + << "Theoretical Optimal Size (Lower Bound):\t" << lower_bound_ << "\n" + << "Total Workspace Size:\t" << workspace_total_size_ << "\n" + << "Total Communication Input Tensor Size:\t" << comm_input_total_size_ << "\n" + << "Total Communication Output Tensor Size:\t" << comm_output_total_size_ << "\n" + << "Total LifeLong All Tensor Size:\t" << lifelong_all_total_size_ << "\n" + << "Total LifeLong Start Tensor Size:\t" << lifelong_start_total_size_ << "\n" + << "Total LifeLong End Tensor Size:\t" << lifelong_end_total_size_ << "\n" + << "Reused Size(Allocate Size):\t" << GetTotalMemSize() << "\n\n\n"; +} + +uint8_t *Somas::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const { + auto key = node.get(); + auto iter = nodes_map_.find(key); + uint8_t *ptr = nullptr; + if (iter != nodes_map_.end()) { + if (index >= iter->second->output_tensors_.size()) { + MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's workspace size:[" + << iter->second->output_tensors_.size() << "]"; + } + auto output_tensor = iter->second->output_tensors_[index]; + ptr = mem_base_addr_ + output_tensor->offset_; + } else { + MS_LOG(EXCEPTION) << "node [" << AnfAlgo::GetCNodeName(node) << "] don't exist in nodes_map"; + } + return ptr; +} + +uint8_t *Somas::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const { + auto key = node.get(); + auto iter = nodes_map_.find(key); + uint8_t *ptr = nullptr; + if (iter != nodes_map_.end()) { + if (index >= iter->second->workspace_tensors_.size()) { + MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's workspace size:[" + << iter->second->workspace_tensors_.size() << "]"; + } + auto workspace_tensor = iter->second->workspace_tensors_[index]; + ptr = mem_base_addr_ + workspace_tensor->offset_; + } + return ptr; +} + +} // namespace somas +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas.h b/mindspore/ccsrc/backend/optimizer/somas/somas.h new file mode 100644 index 0000000000..14fdf2a84f --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas.h @@ -0,0 +1,132 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "backend/kernel_compiler/tbe/tbe_utils.h" +#include "backend/optimizer/somas/somas_node.h" +#include "backend/optimizer/somas/somas_solver_pre.h" +#include "backend/optimizer/somas/somas_stream.h" +#include "backend/session/anf_runtime_algorithm.h" +#include "backend/session/kernel_graph.h" + +namespace mindspore { +namespace somas { +class Somas { + public: + // Constructors/Destructors + Somas() = default; + Somas(const Somas &) = delete; + Somas &operator=(const Somas &) = delete; + ~Somas() = default; + + bool Allocate(const session::KernelGraph *graph); + size_t GetTotalMemSize() { return mem_offset_; } + void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; } + uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const; + uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const; + + void DumpSomasBasicIR(const string filename); + void DumpSomasMemoryIR(const string filename); + + private: + // Maps + std::unordered_map tensors_map_; + std::map nodes_map_; + + // Vectors + std::vector nodes_list_; + std::vector streams_list_; + std::vector tensors_list_; + + // Stream groups + std::vector> streams_groups_; + + // Solver + std::unordered_map solver_tensor_desc_list_; + SomasSolverPrePtr somas_solver_; + + // Constraints + std::shared_ptr cannot_reuse_; + + // Contiguous list + std::vector> contiguous_tensors_list_; + + // Ref lists + std::vector> ref_node_constraints_; + std::vector> ref_overlap_constraints_; + + // total Offset + size_t mem_offset_; + + // getnext op output size + size_t get_next_size_; + + // Memory base addr + uint8_t *mem_base_addr_{nullptr}; + + // Save debug info + bool save_graphs_{false}; + std::string save_graphs_path_; + + // statistic info + size_t upper_bound_{0}; + size_t lower_bound_{0}; + size_t workspace_total_size_{0}; + size_t comm_input_total_size_{0}; + size_t comm_output_total_size_{0}; + size_t lifelong_all_total_size_{0}; + size_t lifelong_start_total_size_{0}; + size_t lifelong_end_total_size_{0}; + + bool InitSomasTensors(const session::KernelGraph *graph); + void InitBasicInfo(const session::KernelGraph *graph); + void InitSomasStreamAndNode(const session::KernelGraph *graph); + void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph); + void InitSomasInputTensors(const session::KernelGraph *graph); + void GetNextOutputProcess(const session::KernelGraph *graph); + void IndependentNodeOutputProcess(const session::KernelGraph *graph); + void SummaryInputProcess(const session::KernelGraph *graph); + void RefNodeProcess(const session::KernelGraph *graph); + void UnReuseNodeProcess(const session::KernelGraph *graph); + SomasTensorPtr CreateGapTensor(size_t gap_tensor_id); + void GenContiguousList(const session::KernelGraph *graph); + + void PreprocessingConflicts(); + void ComputeConflictPairs(); + + bool Assign(const session::KernelGraph *graph); + + void DumpOfflineIR(const string filename); + void DumpSomasMemoryPoolInfoIR(const string filename); + std::string GetSplitName(const string &scope_name) const; + size_t CalcLowerBound() const; + void GenStatisticInfo(); +}; + +using SomasPtr = std::shared_ptr; +} // namespace somas +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_H_ diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_node.cc b/mindspore/ccsrc/backend/optimizer/somas/somas_node.cc new file mode 100644 index 0000000000..af1c9af910 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_node.cc @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include "backend/optimizer/somas/somas_node.h" +#include + +namespace mindspore { +namespace somas { +void SomasNode::ComputeAncestorNodes() { + // Fast algorithm: assuming nodes execute this function in the received topological order + int64_t thisId = this->GetStream()->GetId(); + + for (SomasNodePtr node : ancestor_nodes_) { + int64_t ancestorId = node->GetStream()->GetId(); + // Map Improvement for max_ancestor_order + if (thisId != ancestorId) { + this->anc_stream_max_order_[ancestorId] = std::max(this->anc_stream_max_order_[ancestorId], node->GetId()); + } + for (SomasStreamPtr stream : node->GetStream()->ancestor_streams_) { + int64_t streamId = stream->GetId(); + this->anc_stream_max_order_[streamId] = + std::max(this->anc_stream_max_order_[streamId], node->anc_stream_max_order_[streamId]); + } + } +} + +void SomasNode::PresetAncestorStreams(const std::vector stream_vector) { + for (SomasStreamPtr stream : stream_vector) { + anc_stream_max_order_[stream->GetId()] = 0; + } +} +} // namespace somas +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_node.h b/mindspore/ccsrc/backend/optimizer/somas/somas_node.h new file mode 100644 index 0000000000..49a2728c78 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_node.h @@ -0,0 +1,78 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_NODE_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_NODE_H_ + +#include "backend/optimizer/somas/somas_stream.h" +#include "backend/optimizer/somas/somas_tensor.h" + +#include +#include +#include +#include +#include + +namespace mindspore { +namespace somas { +class SomasStream; +class SomasTensor; + +enum NodeType { kCommonNode, kCommunicationNode }; + +using SomasStreamPtr = std::shared_ptr; +using SomasTensorPtr = std::shared_ptr; + +class SomasNode { + public: + using SomasNodePtr = std::shared_ptr; + // Public attributes (mutated in code) + std::string scope_full_name_; + + std::set + ancestor_nodes_; // keeping only distance *one* ancestor nodes; enough to ComputeAncestorNodes() + std::set tensors_; + + std::vector input_tensors_; + std::vector output_tensors_; + std::vector workspace_tensors_; + + std::unordered_map anc_stream_max_order_; + + // Constructors/Destructors + SomasNode(size_t id, NodeType type, SomasStreamPtr stream) : id_(id), stream_(stream), type_(type) {} + SomasNode(const SomasNode &) = delete; + SomasNode &operator=(const SomasNode &) = delete; + ~SomasNode() = default; + + // Accessors + const size_t &GetId() { return id_; } + SomasStreamPtr GetStream() { return stream_; } + const NodeType &GetType() { return type_; } + + // Computing ancestors + void PresetAncestorStreams(const std::vector stream_vector); + void ComputeAncestorNodes(); + + private: + const size_t id_{0}; + SomasStreamPtr const stream_; + const NodeType type_; +}; +} // namespace somas +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_NODE_H_ diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_solver_alg.cc b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_alg.cc new file mode 100644 index 0000000000..d60568fd1e --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_alg.cc @@ -0,0 +1,260 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include "backend/optimizer/somas/somas_solver_alg.h" + +#include +#include +#include + +namespace mindspore { +namespace somas { +// offset picking heuristics +bool SmallestFit(const pair &a, const pair &b) { + return a.first < b.first || (a.first == b.first && a.second < b.second); +} +bool LargestFit(const pair &a, const pair &b) { + return a.first > b.first || (a.first == b.first && a.second < b.second); +} +bool BestFit(const pair &a, const pair &b) { + return a.second < b.second || (a.second == b.second && a.first < b.first); +} +bool WorstFit(const pair &a, const pair &b) { + return a.second > b.second || (a.second == b.second && a.first < b.first); +} +size_t SharedObjects(FootPrint *p) { return p->Next()->getOffset(); } +size_t SingleObject(FootPrint *p) { return SIZE_MAX; } + +bool (*g_pBranching[kNumFittingTypes])(const pair &a, const pair &b) = { + BestFit, SmallestFit +#ifdef SOMAS_DEBUG + , + LargestFit, WorstFit +#endif +}; +size_t (*algorithm[kNumAlgorithmTypes])(FootPrint *p) = {SharedObjects, SingleObject}; + +size_t FootPrint::Result() { + std::shared_ptr foot_print = shared_from_this(); + size_t upperbound = 0; + uint32_t total_footprints = 0; + while (NULL != foot_print) { + foot_print->printStats(); + + upperbound = foot_print->getOffset(); + foot_print = foot_print->Next(); + total_footprints++; + } + + MS_LOG(DEBUG) << total_footprints << " footprints allocated"; + + return upperbound; +} +bool FootPrint::findFirst(stack *merged, const BlockTensor &block, size_t *offset) { + MS_EXCEPTION_IF_NULL(merged); + MS_EXCEPTION_IF_NULL(offset); + bool bfound = false; + std::set, bool (*)(const pair &a, const pair &b)> + offsetcandidates(g_pBranching[m_branching_strategy_]); + size_t gap = 0; + + Interval a; + Interval it; + + a.ub() = algorithm[m_algorithm_](this); + while (!(*merged).empty()) { + it = (*merged).top(); + (*merged).pop(); + a.lb() = it.ub(); + if (a.contains(block.m_size_)) { + gap = a.ub() - a.lb() - block.m_size_; + offsetcandidates.emplace(pair(a.lb(), gap)); + } + a.ub() = it.lb(); + } + a.lb() = m_offset_; + gap = a.ub() - a.lb() - block.m_size_; + if (a.contains(block.m_size_)) offsetcandidates.emplace(pair(a.lb(), gap)); + + if (offsetcandidates.size() > 0) { + *offset = (*offsetcandidates.begin()).first; + m_foot_print_next_->m_offset_ = std::max(m_foot_print_next_->m_offset_, *offset + block.m_size_); + offsetcandidates.erase(offsetcandidates.begin()); + bfound = true; + } + + return bfound; +} +void FootPrint::Merge(vector *interval_v, stack *s) { + MS_EXCEPTION_IF_NULL(s); + sort((*interval_v).begin(), (*interval_v).end(), + [](Interval &i1, Interval &i2) { return (i1.lb() < i2.lb()) || (i1.lb() == i2.lb() && i1.ub() < i2.ub()); }); + (*s).push((*interval_v)[0]); + + for (size_t i = 1; i < (*interval_v).size(); i++) { + Interval &top = (*s).top(); + Interval &b = (*interval_v)[i]; + if (top.ub() < b.lb()) + (*s).push(b); + + else if (top.ub() < b.ub()) + top.ub() = b.ub(); + } + + return; +} +void FootPrint::ConstrainedBLocks(const std::shared_ptr &constraints, const BlockTensor &b1, + const BlockTensor &b2, vector *oInterval) { + MS_EXCEPTION_IF_NULL(oInterval); + // propagate + size_t acum = m_offset_; + + for (SomasSolverTensorDescPtr p1 = b1.m_start_tensor_; NULL != p1; p1 = p1->right_) { + for (SomasSolverTensorDescPtr p2 = b2.m_start_tensor_; NULL != p2; p2 = p2->right_) { + if ((*constraints)(p1->index_, p2->index_) == 1) { + Interval a = Interval(acum, acum + p1->size_); + Interval b = Interval(p2); + if (a.lb() < b.ub()) { + (*oInterval).emplace_back(b); + } + } + } + acum += p1->size_; + } +} +bool FootPrint::findOffset(const std::shared_ptr &constraints, const BlockTensor &block, size_t *offset) { + MS_EXCEPTION_IF_NULL(offset); + bool bretval = true; + vector l_interval; + + const size_t intervals_estimation = 1000; + l_interval.reserve(intervals_estimation * sizeof(Interval)); + + *offset = m_offset_; + bretval = true; + + // transform constrained tensors in non eligible intervals + for (size_t i = 0; i < m_starts_.size(); i++) { + if (block.Alone() && m_starts_[i]->Alone() && + (*constraints)(block.m_start_tensor_->index_, m_starts_[i]->m_start_tensor_->index_)) { + if (m_algorithm_ != 1 && i == 0) return false; + Interval It = Interval(m_starts_[i]->m_start_tensor_); + l_interval.emplace_back(It); + } else { + ConstrainedBLocks(constraints, block, *m_starts_[i], &l_interval); // solve multiple tensor blocks + } + } + // merge non-eligible intervals and find a slot to allocate the tensor block + if (!l_interval.empty()) { + stack l_mergedIntervals; + Merge(&l_interval, &l_mergedIntervals); + bretval = findFirst(&l_mergedIntervals, block, offset); + } + + return bretval; +} +void FootPrint::addElem(BlockTensor *block, const size_t &offset) { + if (m_foot_print_next_ == NULL) { + m_foot_print_next_ = std::make_shared(); + size_t newoffset = m_offset_ + block->m_size_; + m_foot_print_next_->setOffset(newoffset); + m_foot_print_next_->setAlignment(m_alignment_); + m_foot_print_next_->m_solId_ = m_solId_; + m_starts_.clear(); + MS_LOG(DEBUG) << "Creating footprint at offset: " << m_offset_; + } + + addStart(block); + size_t offset1 = offset; + SomasSolverTensorDescPtr tensor = block->m_start_tensor_; + MS_LOG(DEBUG) << "Allocating block: " << tensor->index_ << " in offset: " << offset; + pair sol_offset; + sol_offset.first = block->m_current_sol_; + sol_offset.second = offset; + if (block->offsets_.count(sol_offset.first)) + MS_LOG(WARNING) << "Warning addElem: Offset overwritten at solution " << block->m_current_sol_ << " for block " + << block->m_start_tensor_->index_; + block->offsets_.insert(sol_offset); + while (tensor) { + tensor->offset_ = offset1; + offset1 += tensor->size_; + + MS_LOG(DEBUG) << tensor->index_ << " " << tensor->size_ << " " << tensor->offset_; + tensor = tensor->right_; + } +} +void FootPrint::printStats() { + MS_LOG(DEBUG) << "Footprint blocks: " << m_starts_.size() << " \toffset: " << m_offset_; +} +bool FastHeuristic::Eval( // unordered_map &tensors_m, + vector *block_tensors_v, std::shared_ptr foot_print, + const std::shared_ptr &pConstraints) { + MS_EXCEPTION_IF_NULL(foot_print); + auto start = std::chrono::system_clock::now(); + + std::shared_ptr p = foot_print; + bool bpushed = false; + uint32_t startscount = 0; + size_t offset = foot_print->getOffset(); + m_tensors_allocated_ = 0; + SomasSolverTensorDescPtr tensor = NULL; + + for (size_t i = 0; i < (*block_tensors_v).size(); i++) { + BlockTensor &block = (*block_tensors_v)[i]; + if (block.m_bre_allocate_ == false) { + offset = block.m_start_tensor_->offset_; + pair aux; + aux.first = foot_print->m_solId_; + aux.second = block.m_start_tensor_->offset_; + if (block.offsets_.count(aux.first)) { + MS_LOG(WARNING) << "Warning: Offset overwritten at solution " << aux.first << " for block " + << block.m_start_tensor_->index_; + } + block.offsets_.insert(aux); + continue; + } + bpushed = false; + p = foot_print; + block.m_current_sol_ = foot_print->m_solId_; + while (!bpushed) { + if (p->findOffset(pConstraints, block, &offset)) { + p->addElem(&block, offset); + startscount++; + tensor = block.m_start_tensor_; + while (tensor) { + m_tensors_allocated_++; + tensor = tensor->right_; + } + bpushed = true; + break; + } + // go to the next footprint slot + if (NULL != p->Next()) { + p = p->Next(); + } else if (bpushed == false) { // something went wrong + MS_LOG(WARNING) << "Could not allocate memory for tensor: " << tensor->index_; + return false; + } + } + } + + MS_LOG(DEBUG) + << "\nElapsed time of Fast Heuristic search: " + << std::chrono::duration_cast(std::chrono::system_clock::now() - start).count() << " ms"; + return true; +} +} // namespace somas +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_solver_alg.h b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_alg.h new file mode 100644 index 0000000000..2c0674f42a --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_alg.h @@ -0,0 +1,177 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_SOLVER_ALG_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_SOLVER_ALG_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "backend/optimizer/somas/somas_solver_pre.h" +#include "utils/ms_context.h" + +using std::pair; +using std::set; +using std::stack; +using std::unordered_map; +using std::vector; + +namespace mindspore { +namespace somas { +class Interval { + public: + Interval() { m_a_ = m_b_ = 0; } + explicit Interval(SomasSolverTensorDescPtr t) { + m_a_ = t->offset_; + m_b_ = m_a_ + t->size_; + } + Interval(const size_t &a, const size_t &b) { + m_a_ = a; + m_b_ = b; + } + bool intersect(const Interval &i) { return (in(i.m_a_) || in(i.m_b_)); } + bool in(const size_t &a) { return ((a > m_a_) && (a < m_b_)); } + Interval intersection(const Interval &i) { + if (m_a_ < i.m_a_) + return Interval(m_a_, i.m_b_); + else + return Interval(i.m_a_, m_b_); + } + void merge(const Interval &i) { + m_a_ = std::min(m_a_, i.m_a_); + m_b_ = std::max(m_b_, i.m_b_); + } + size_t &lb() { return m_a_; } + size_t &ub() { return m_b_; } + bool contains(size_t width) { return (m_b_ - m_a_) >= width; } + bool contains(const Interval &a) { return ((a.m_a_ >= m_a_) && (a.m_b_ <= m_b_)); } + Interval &operator=(const Interval &in) { + m_a_ = in.m_a_; + m_b_ = in.m_b_; + return *this; + } + + private: + size_t m_a_; + size_t m_b_; +}; + +class BlockTensor { + public: + SomasSolverTensorDescPtr m_start_tensor_; + unordered_map, bool (*)(const pair &, const pair &)>> + offsets_candidates_; + uint32_t m_current_sol_; + bool m_bre_allocate_; + unordered_map offsets_; + size_t m_size_; + BlockTensor() + : m_start_tensor_(NULL), + offsets_candidates_(), + m_current_sol_(0), + m_bre_allocate_(true), + offsets_(), + m_size_(0) {} + + BlockTensor &operator=(const BlockTensor &bt) { + m_bre_allocate_ = bt.m_bre_allocate_; + m_current_sol_ = 0; + m_start_tensor_ = bt.m_start_tensor_; + offsets_candidates_ = bt.offsets_candidates_; + offsets_ = bt.offsets_; + m_size_ = bt.m_size_; + return *this; + } + void log() { + SomasSolverTensorDescPtr p = m_start_tensor_; + MS_LOG(DEBUG) << "Block of Tensors [" << m_start_tensor_->index_ << "]\nsize: " << m_size_ << "Tensors:"; + while (p) { + MS_LOG(DEBUG) << "[" << p->index_ << "," << p->size_ << "]"; + p = p->right_; + } + } + bool Alone() const { return ((NULL == m_start_tensor_->right_) && (NULL == m_start_tensor_->left_)); } +}; + +class FootPrint : public std::enable_shared_from_this { + public: + uint32_t m_solId_; + + FootPrint() + : m_offset_(0), + m_starts_(), + m_foot_print_next_(NULL), + m_alignment_(0), + m_branching_strategy_(0), + m_algorithm_(0) {} + void setAlignment(const size_t a) { m_alignment_ = a; } + void setBranchingStrategy(uint32_t bs) { m_branching_strategy_ = bs; } + void setCurrentSol(uint32_t solId) { m_solId_ = solId; } + void setAlgorithm(uint32_t algorithm) { m_algorithm_ = algorithm; } + void addStart(BlockTensor *elemIndex) { m_starts_.push_back(elemIndex); } + void addElem(BlockTensor *block, const size_t &offset); + std::shared_ptr &Next() { return m_foot_print_next_; } + vector &getStarts() { return m_starts_; } + void Destroy(); + const size_t getOffset() { return m_offset_; } + void setOffset(const size_t &offset) { m_offset_ = offset; } + bool findOffset(const std::shared_ptr &constraints, const BlockTensor &block, size_t *offset); + void ConstrainedBLocks(const std::shared_ptr &constraints, const BlockTensor &b1, const BlockTensor &b2, + vector *oInterval_l); + void Merge(vector *l_interval, stack *l_merged); + bool findFirst(stack *merged, const BlockTensor &block, size_t *offset); + size_t Result(); + void printStats(); + + private: + size_t m_offset_; + vector m_starts_; + std::shared_ptr m_foot_print_next_; + size_t m_alignment_; + uint32_t m_branching_strategy_; + uint32_t m_algorithm_; +}; + +class FastHeuristic { + public: + FastHeuristic() : m_alignment_(512), m_tensors_allocated_(0) {} + + void setAlignment(const size_t &a) { m_alignment_ = a; } + void Destroy(); + bool Eval( // unordered_map &tensors_m, + vector *block_tensors_v, std::shared_ptr foot_print, + const std::shared_ptr &pConstraints); + + private: + size_t m_alignment_; + size_t m_tensors_allocated_; +}; +} // namespace somas +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_SOLVER_ALG_H_ diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_solver_core.cc b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_core.cc new file mode 100644 index 0000000000..56269622d7 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_core.cc @@ -0,0 +1,405 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "backend/optimizer/somas/somas_solver_alg.h" +#include "backend/optimizer/somas/somas_solver_core.h" +#include "backend/optimizer/somas/somas_solver_pre.h" + +using std::sort; +using std::unordered_map; +using std::vector; + +namespace mindspore { +namespace somas { +Status SomasSolverCore::MemoryAllocationSolver() { + auto start = std::chrono::system_clock::now(); + Status retval = SUCCESS; + size_t best = SIZE_MAX; + size_t best_timing = SIZE_MAX; + if (all_) { // loop over all heuristics + FittingType best_branching = kBest; + SortingType best_sorting = kGreaterSizeSmallerIndex; + AlgorithmType best_algorithm = kManyObjects; + uint32_t best_sol = 0; + size_t worst = 0; + BuildBlocks(); + Clean(); + MS_LOG(INFO) << "time\tSol#\tResult\t\t\t\tAlgorithm\tSorting Strategy\tOffset Strategy"; + for (size_t algorithm = 0; algorithm < kNumAlgorithmTypes; algorithm++) { + algorithm_ = static_cast(algorithm); + for (size_t sort_strategy = 0; sort_strategy < kNumSortingTypes; sort_strategy++) { + sort_strategy_ = static_cast(sort_strategy); + SortTensors(); + for (size_t branching_strategy = 0; branching_strategy < kNumFittingTypes; branching_strategy++) { + branching_strategy_ = static_cast(branching_strategy); + Clean(); + MS_LOG(DEBUG) << "Timing Start " << tensors_.size() << " Tensors"; + start = std::chrono::system_clock::now(); + upperbound_ = FindSolutions(); + MS_LOG(DEBUG) + << "\nElapsed time of upper bound testing: " + << std::chrono::duration_cast(std::chrono::system_clock::now() - start).count() + << " ms"; + start = std::chrono::system_clock::now(); + + if (upperbound_ > worst) { + worst = upperbound_; + } + if (upperbound_ < best || upperbound_ == best) { + best = upperbound_; + best_algorithm = algorithm_; + best_branching = branching_strategy_; + best_sorting = sort_strategy_; + best_sol = sol_count_; + best_timing = timing_; + } + Verify(); + sol_count_++; + } + } + } + upperbound_ = best; + auto end = std::chrono::system_clock::now(); + size_t total_time = std::chrono::duration_cast((end - start)).count(); + const double giga = 1024. * 1024. * 1024.; + const double cent = 100.; + MS_LOG(INFO) << "SOMAS SOLVER RESUME:"; + MS_LOG(INFO) << "Best Solution:[" << 1 + best_sol << "/" << sol_count_ << "] "; + MS_LOG(INFO) << "Best result:" << best << " Bytes " << (best) / (giga) << " GB (" + << (best - lifelongmemory_) / (giga) << " GB + " << lifelongmemory_ / (giga) + << " GB from lifelong tensors)"; + + MS_LOG(INFO) << "Best timing:" << best_timing << " ms"; + MS_LOG(INFO) << "Best algorithm: " << algorithm_type_[best_algorithm].c_str(); + MS_LOG(INFO) << "Best sorting strategy: " << sorting_[best_sorting].c_str(); + MS_LOG(INFO) << "Best offset strategy: " << branching_[best_branching].c_str(); + MS_LOG(INFO) << "Time elapsed: " << total_time << " ms"; + MS_LOG(INFO) << "Spread:" << static_cast((worst - best) / static_cast(best * cent)) << " %%"; + best_sol_ = best_sol; + SetBestSolution(); + } else { + MS_LOG(INFO) << "Algorithm strategy: " << algorithm_type_[algorithm_].c_str(); + MS_LOG(INFO) << "Sorting strategy: " << sorting_[sort_strategy_].c_str(); + MS_LOG(INFO) << "Offset strategy: " << branching_[branching_strategy_].c_str(); + BuildBlocks(); + SortTensors(); + upperbound_ = FindSolutions(); + Verify(); + } + return retval; +} + +Status SomasSolverCore::Verify() { + Status retval = SUCCESS; + if (verify_) { + MS_LOG(INFO) << "Verifying solution.."; + + if (!Verify(upperbound_)) { + MS_LOG(WARNING) << "Solver Allocation Memory Check FAILS"; + retval = FAILED; + } else { + const double giga = 1024. * 1024. * 1024.; + MS_LOG(INFO) << "Solver Allocation Memory Check SUCCESS !!"; + MS_LOG(INFO) << "Result: " << upperbound_ << " (" << (upperbound_) / (giga) << " GB)"; + retval = SUCCESS; + } + } + + return retval; +} + +Status SomasSolverCore::Verify(unordered_map *pTensor_map) { + Status retval = SUCCESS; + if (NULL == pTensor_map) return retval; + MS_LOG(INFO) << "Verifying HQ Solution.."; + MS_LOG(INFO) << "Checking tensors id, sizes.."; + + for (auto ptensor : *pTensor_map) { + if (tensors_.count(ptensor.first) == 0) { + MS_LOG(WARNING) << "HQ Tensor id " << ptensor.first << " does not exists"; + } else if (tensors_[ptensor.first]->size_ != ptensor.second->size_) { + size_t HQ_index = ptensor.first; + size_t HQ_size = ptensor.second->size_; + size_t index = ptensor.first; + size_t size = tensors_[ptensor.first]->size_; + MS_LOG(WARNING) << "HQ Tensor Id: " << HQ_index << " with size: " << HQ_size + << " is different from Tensor Id: " << index << " size: " << size; + } + } + + MS_LOG(INFO) << "Checking HQ Solution.."; + tensors_ = *pTensor_map; + retval = Verify(upperbound_) == 0 ? FAILED : SUCCESS; + return retval; +} +bool SomasSolverCore::Verify(const size_t &upperbound) { + auto start = std::chrono::system_clock::now(); + bool retval = true; + size_t result = 0; + SomasSolverTensorDescPtr t1; + SomasSolverTensorDescPtr t2; + + for (auto t1_ : tensors_) { + // check alignment + result = std::max(result, t1_.second->size_ + t1_.second->offset_); + for (auto t2_ : tensors_) { + t1 = t1_.second; + t2 = t2_.second; + if (t1->index_ == t2->index_) continue; + bool blifelong = (t1->lifelong_ || t2->lifelong_) && (t1->index_ != t2->index_); + const size_t continuous = 2; + const size_t conflict = 1; + if ((*constraints_)(t1->index_, t2->index_) == continuous) { // continuous constraint + // t1 must be continous to t2 + bool bcontinuous = t1->offset_ == (t2->offset_ + t2->size_); + if (!bcontinuous) { + MS_LOG(WARNING) << "Continuous constraint violation in tensors " << t1->index_ << " and" << t2->index_; + retval = false; + } + } else if (blifelong || (*constraints_)(t1->index_, t2->index_) == conflict) { // conflict constraint + size_t t1_ub = t1->offset_ + t1->size_; + size_t t2_ub = t2->offset_ + t2->size_; + bool b_overlap_lb = ((t2->offset_ >= t1->offset_) && (t2->offset_ < t1_ub)); + bool b_overlap_ub = ((t2_ub > t1->offset_) && (t2_ub < t1_ub)); + bool b_overlap = b_overlap_lb || b_overlap_ub; + bool biszerosized = t1->size_ == 0 || t2->size_ == 0; + if (b_overlap && !biszerosized) { + MS_LOG(WARNING) << "Non-overlap constraint violation in tensors " << t1->index_ << " and" << t2->index_; + retval = false; + } + } + } + } + if (upperbound != result) { + MS_LOG(WARNING) << "ERROR Invalid upperbound result --> Footprint Result: " << upperbound_ + << " Tensor Result: " << result + lifelongmemory_; + retval = false; + } + MS_LOG(DEBUG) + << "\nElapsed time of Fast Heuristic Check: " + << std::chrono::duration_cast(std::chrono::system_clock::now() - start).count() << " ms"; + return retval; +} + +void SomasSolverCore::BuildBlocks() { + MS_LOG(DEBUG) << "Building block of tensors"; + + lifelongmemory_ = 0; + uint64_t tensors_block_count = 0; + for (auto tensor : tensors_) { + SomasSolverTensorDescPtr pTensor = tensor.second; + if (pTensor->blocked_) continue; + if (pTensor->lifelong_) { + lifelongmemory_ += pTensor->size_; + continue; + } + // move to the left + while (pTensor->left_) pTensor = pTensor->left_; + + // set start tensor + BlockTensor bTensor; + bTensor.m_bre_allocate_ = true; + bTensor.m_start_tensor_ = pTensor; + // find size + bTensor.m_size_ = 0; + + do { + bTensor.m_size_ += pTensor->size_; + pTensor->blocked_ = true; + pTensor = pTensor->right_; + tensors_block_count++; + } while (NULL != pTensor); + + // add to the list + this->block_tensors_.emplace_back(bTensor); + } + + if (tensors_block_count != tensors_.size()) + MS_LOG(INFO) << static_cast(tensors_.size() - tensors_block_count) << " lifelong tensors found"; + + // for debug + for (auto &b : block_tensors_) b.log(); +} + +void SomasSolverCore::Clean() { + for (auto &block : block_tensors_) { + block.m_bre_allocate_ = true; + auto pTensor = block.m_start_tensor_; + while (pTensor) { + pTensor->offset_ = 0; + pTensor = pTensor->right_; + } + } + upperbound_ = SIZE_MAX; +} +void SomasSolverCore::SortTensors() { // need to sort the tensors for Fast Heuristic + MS_LOG(DEBUG) << "Sorting Blocks of tensor, strategy: " << sorting_[sort_strategy_].c_str(); + switch (sort_strategy_) { + case kGreaterSizeSmallerIndex: { // size(>), index(<) + sort(block_tensors_.begin(), block_tensors_.end(), [](const BlockTensor &t1, const BlockTensor &t2) { + return t1.m_size_ > t2.m_size_ || + (t1.m_size_ == t2.m_size_ && t1.m_start_tensor_->index_ < t2.m_start_tensor_->index_); + }); + break; + } +#ifdef SOMAS_DEBUG + case kGreaterSizeGreaterIndex: { // size(>), index(>) + sort(block_tensors_.begin(), block_tensors_.end(), [](const BlockTensor &t1, const BlockTensor &t2) { + return t1.m_size > t2.m_size || + (t1.m_size == t2.m_size && t1.m_pStartTensor->index_ > t2.m_pStartTensor->index_); + }); + break; + } + case kGreaterSizeSmallerConstraintsSmallerIndex: { // size(>), constraints(<), index(<) + sort(block_tensors_.begin(), block_tensors_.end(), [](const BlockTensor &t1, const BlockTensor &t2) { + return t1.m_size > t2.m_size || + (t1.m_size == t2.m_size && t1.m_pStartTensor->constraints_ < t2.m_pStartTensor->constraints_) || + (t1.m_size == t2.m_size && t1.m_pStartTensor->constraints_ == t2.m_pStartTensor->constraints_ && + t1.m_pStartTensor->index_ < t2.m_pStartTensor->index_); + }); + break; + } + case kGreaterSizeSmallerConstraintsGreaterIndex: { // size(>), constraints(<), index(>) + sort(block_tensors_.begin(), block_tensors_.end(), [](const BlockTensor &t1, const BlockTensor &t2) { + return t1.m_size > t2.m_size || + (t1.m_size == t2.m_size && t1.m_pStartTensor->constraints_ < t2.m_pStartTensor->constraints_) || + (t1.m_size == t2.m_size && t1.m_pStartTensor->constraints_ == t2.m_pStartTensor->constraints_ && + t1.m_pStartTensor->index_ > t2.m_pStartTensor->index_); + }); + break; + } + case kGreaterSizeGreaterConstraintsSmallerIndex: { // size(>), constraints(>), index(<) + sort(block_tensors_.begin(), block_tensors_.end(), [](const BlockTensor &t1, const BlockTensor &t2) { + return t1.m_size > t2.m_size || + (t1.m_size == t2.m_size && t1.m_pStartTensor->constraints_ > t2.m_pStartTensor->constraints_) || + (t1.m_size == t2.m_size && t1.m_pStartTensor->constraints_ == t2.m_pStartTensor->constraints_ && + t1.m_pStartTensor->index_ < t2.m_pStartTensor->index_); + }); + break; + } + case kGreaterSizeGreaterConstraintsGreaterIndex: { // // size(>), constraints(>), index(>) + sort(block_tensors_.begin(), block_tensors_.end(), [](const BlockTensor &t1, const BlockTensor &t2) { + return t1.m_size > t2.m_size || + (t1.m_size == t2.m_size && t1.m_pStartTensor->constraints_ > t2.m_pStartTensor->constraints_) || + (t1.m_size == t2.m_size && t1.m_pStartTensor->constraints_ == t2.m_pStartTensor->constraints_ && + t1.m_pStartTensor->index_ > t2.m_pStartTensor->index_); + }); + break; + } +#endif + case kNumSortingTypes: { // no sorting case + break; + } + } + // log for debug purposes + for (auto &block : block_tensors_) block.log(); +} + +void SomasSolverCore::RestoreSolution(uint32_t sol_id) { + for (auto block : block_tensors_) { + if (block.offsets_.count(sol_id) == 0) assert(0); + size_t bestOffset = block.offsets_[sol_id]; + size_t offset = bestOffset; + SomasSolverTensorDescPtr pTensor = block.m_start_tensor_; + + while (pTensor) { + pTensor->offset_ = offset; + offset += pTensor->size_; + pTensor = pTensor->right_; + } + } +} +size_t SomasSolverCore::Search(const std::shared_ptr &pFootprint) { + size_t result = 0; + FastHeuristic fh; + MS_LOG(INFO) << "Calling FastSolver Search for " << block_tensors_.size() << " tensors "; + auto start = std::chrono::system_clock::now(); + if (fh.Eval(&block_tensors_, pFootprint, constraints_)) { + result = pFootprint->Result(); + auto end = std::chrono::system_clock::now(); + timing_ = std::chrono::duration_cast((end - start)).count(); + if (all_) { + const double giga = 1073741824.; + MS_LOG(INFO) << timing_ << " ms\t" << sol_count_ + 1 << "/" + << kNumFittingTypes * kNumAlgorithmTypes * kNumSortingTypes << "\t" << result << " Bytes (" + << result / giga << " GB)\t" << algorithm_type_[algorithm_].c_str() << "\t" + << sorting_[sort_strategy_].c_str() << "\t" << branching_[branching_strategy_].c_str(); + } + } else { + MS_LOG(INFO) << "FastSolver could not find solution"; + } + + if (result < upperbound_) { + upperbound_ = result; + best_sol_ = pFootprint->m_solId_; + best_branching_ = branching_strategy_; + best_sort_ = sort_strategy_; + } + + return upperbound_; +} + +void SomasSolverCore::AppendLifelongTensors() { + MS_LOG(DEBUG) << "Appending lifelong tensors to solution"; + size_t offset = upperbound_; + for (auto t_ : tensors_) { + SomasSolverTensorDescPtr pTensor = t_.second; + if (pTensor->lifelong_) { + pTensor->offset_ = offset; + offset += pTensor->size_; + } + } + upperbound_ += lifelongmemory_; + MS_LOG(DEBUG) << lifelongmemory_ << " bytes from lifelong tensors added to solution"; +} + +size_t SomasSolverCore::FindSolutions() { + MS_LOG(DEBUG) << "Start allocating blocks,offset strategy: " << branching_[branching_strategy_].c_str(); + + std::shared_ptr pFootprint = std::make_shared(); + pFootprint->setBranchingStrategy(branching_strategy_); + pFootprint->setCurrentSol(sol_count_); + pFootprint->setAlgorithm(algorithm_); + Search(pFootprint); + AppendLifelongTensors(); + Destroy(pFootprint); + return upperbound_; +} + +void SomasSolverCore::Destroy(std::shared_ptr &pFootprint) { + while (NULL != pFootprint) { + if (NULL != pFootprint->Next()) { + std::shared_ptr &p = pFootprint; + pFootprint = pFootprint->Next(); + // delete p; + p = NULL; + } else { + // delete pFootprint; + pFootprint = NULL; + } + } +} +} // namespace somas +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_solver_core.h b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_core.h new file mode 100644 index 0000000000..b9901a1bd0 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_core.h @@ -0,0 +1,101 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_SOLVER_CORE_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_SOLVER_CORE_H_ + +#include +#include +#include +#include +#include +#include + +#include "backend/optimizer/somas/somas_solver_alg.h" +#include "backend/optimizer/somas/somas_solver_pre.h" + +namespace mindspore { +namespace somas { +class SomasSolverCore { + public: + /// Interface Function: receive parameters, creates the model to solve and then save the result + SomasSolverCore(const std::unordered_map &tensors, + const std::shared_ptr &constraints) + : tensors_(tensors), + constraints_(constraints), + upperbound_(SIZE_MAX), + timing_(0), + lifelongmemory_(0), + verify_(false), + all_(true), + best_sol_(0), + sort_strategy_(kGreaterSizeSmallerIndex), + branching_strategy_(kBest), + sol_count_(0), + algorithm_(kManyObjects) {} + ~SomasSolverCore() = default; + + Status MemoryAllocationSolver(); + Status Verify(); + bool Verify(const size_t &); + Status Verify(unordered_map *); + void VerifySolution(const bool verify) { verify_ = verify; } + void SortTensors(); + void BuildBlocks(); + void Clean(); + void SetBestSolution() { RestoreSolution(best_sol_); } + void RestoreSolution(uint32_t sol_id); + void SetSortingStrategy(SortingType sort_strategy) { sort_strategy_ = sort_strategy; } + void SetFittingStrategy(FittingType branching_strategy) { branching_strategy_ = branching_strategy; } + void SetAlgorithmStrategy(AlgorithmType algorithm_strategy) { algorithm_ = algorithm_strategy; } + void SetAllStrategies(bool all) { all_ = all; } + const size_t &GetUpperbound() const { return upperbound_; } + + private: + std::unordered_map tensors_; + vector block_tensors_; + std::shared_ptr constraints_; + size_t upperbound_{0}; + size_t timing_{0}; + size_t lifelongmemory_{0}; + bool verify_{false}; + bool all_{false}; + uint32_t best_sol_{0}; + SortingType best_sort_; + FittingType best_branching_; + SortingType sort_strategy_; + FittingType branching_strategy_; + uint32_t sol_count_{0}; + AlgorithmType algorithm_; + + size_t FindSolutions(); + size_t Search(const std::shared_ptr &pFootprint); + void AppendLifelongTensors(); + void Destroy(std::shared_ptr &); + + const std::string sorting_[6] = {"size(>), index(<)", + "size(>), index(>)", + "size(>), constraints(<), index(<)", + "size(>), constraints(<), index(>)", + "size(>), constraints(>), index(<)", + "size(>), constraints(>), index(>)"}; + const std::string branching_[4] = {"bestfit", "smallest", "largest", "worstfit"}; + const std::string algorithm_type_[2] = {"Shared Objects", "Single Object"}; +}; +} // namespace somas +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_SOLVER_CORE_H_ diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.cc b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.cc new file mode 100644 index 0000000000..3aa187dfd7 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.cc @@ -0,0 +1,209 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include +#include +#include +#include +#include + +#include "backend/optimizer/somas/somas_solver_core.h" +#include "backend/optimizer/somas/somas_solver_pre.h" + +namespace mindspore { +namespace somas { +Status SomasSolverPre::Solving(const session::KernelGraph *graph, + std::unordered_map *ptensors, + std::shared_ptr pConstraints, const vector> &continuous_v, + bool bVerifySolution, bool ball, SortingType sorting, FittingType fitting, + AlgorithmType algorithm) { + Status retval = SUCCESS; + + try { + size_t maxIndex = 0; + std::unordered_map &tensors = *ptensors; + std::unordered_map::iterator max = + std::max_element(tensors.begin(), tensors.end(), + [](const std::pair &a, + const std::pair &b) { return a.first < b.first; }); + maxIndex = max->first; + if (maxIndex > pConstraints->Rows() - 1) { + MS_LOG(WARNING) << "ERROR: MaxIndex invalid, MaxIndex " << maxIndex << ", Rows " << pConstraints->Rows(); + return FAILED; + } + MS_LOG(INFO) << "Filling in constraints matrix.."; + uint32_t continuous_cnt = 0; + + // creating S Lists + for (auto &aux : continuous_v) { + for (uint32_t i = 0; i < aux.size() - 1; i++) { + uint32_t index1 = aux[i]; + uint32_t index2 = aux[i + 1]; + if (NULL == tensors[index1]) { + MS_LOG(WARNING) << "NULL tensor received in continuous constraint (tensor index " << index1 << ")"; + return FAILED; + } + if (NULL == tensors[index2]) { + MS_LOG(WARNING) << "NULL tensor received in continuous constraint (tensor index " << index2 << ")"; + return FAILED; + } + + const size_t continuous = 2; + (*pConstraints)(index2, index1) = continuous; + if (tensors[index1]->right_) + MS_LOG(WARNING) << "Warning:tensor " << index1 + << " already has a right tensor (id: " << tensors[index1]->right_->index_; + if (tensors[index2]->left_) + MS_LOG(WARNING) << "Warning:tensor " << index2 + << " already has a left tensor (id: " << tensors[index2]->left_->index_; + + tensors[index1]->right_ = tensors[index2]; + tensors[index2]->left_ = tensors[index1]; + continuous_cnt++; + } + } + continuous_cnt++; + + std::shared_ptr pSolver = std::make_shared(tensors, pConstraints); + pSolver->SetAlgorithmStrategy(algorithm); + pSolver->SetSortingStrategy(sorting); + pSolver->SetFittingStrategy(fitting); + pSolver->SetAllStrategies(ball); + pSolver->VerifySolution(bVerifySolution); + + if (SUCCESS == (pSolver->MemoryAllocationSolver())) { + max_offset_ = pSolver->GetUpperbound(); + const double giga = 1024. * 1024. * 1024.; + MS_LOG(INFO) << "SomasSolver::Solving SUCCESS"; + MS_LOG(INFO) << "SomasSolver::Solving RESULT: " << max_offset_ << " (" << max_offset_ / (giga) << " GB)"; + } + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + if (save_graphs) { + Log(graph, tensors, pConstraints, continuous_v); + } + } catch (const std::exception &e) { + MS_LOG(EXCEPTION) << "SomasSolver::Solving FAILED: " << e.what(); + retval = FAILED; + } + return retval; +} + +void SomasSolverPre::Log(const session::KernelGraph *graph, + const unordered_map &tensors, + const std::shared_ptr &pConstraints, const vector> &continuous_v) { + MS_LOG(INFO) << "SomasSolver::Log Writing somas-input.txt.."; + + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); + std::string filename = save_graphs_path + "/" + "somas_solver_input_" + std::to_string(graph->graph_id()) + ".ir"; + if (filename.size() > PATH_MAX) { + MS_LOG(ERROR) << "File path " << filename << " is too long."; + return; + } + char real_path[PATH_MAX] = {0}; +#if defined(_WIN32) || defined(_WIN64) + if (_fullpath(real_path, filename.c_str(), PATH_MAX) == nullptr) { + MS_LOG(DEBUG) << "dir " << filename << " does not exit."; + } +#else + if (realpath(filename.c_str(), real_path) == nullptr) { + MS_LOG(DEBUG) << "Dir " << filename << " does not exit."; + } +#endif + + std::string path_string = real_path; + ChangeFileMode(path_string, S_IRWXU); + std::ofstream ofs_1(real_path); + + if (!ofs_1.is_open()) { + MS_LOG(ERROR) << "Open log file '" << real_path << "' failed!"; + return; + } + + for (auto &t : tensors) { + ofs_1 << "T " << t.second->index_ << " " << t.second->size_ << " " << t.second->lifelong_ << std::endl; + } + + for (auto &t1 : tensors) { + for (auto &t2 : tensors) { + size_t idx1 = t1.first; + size_t idx2 = t2.first; + if ((idx1 != idx2) && (*pConstraints)(idx1, idx2) == 1) { + ofs_1 << "C " << idx1 << " " << idx2 << std::endl; + } + } + } + for (auto &s : continuous_v) { + ofs_1 << "S"; + for (auto idx : s) { + ofs_1 << " " << idx; + } + ofs_1 << std::endl; + } + ofs_1.close(); + + MS_LOG(INFO) << "SomasSolver::Log Writing somas-output.txt.."; + std::string out_filename = + save_graphs_path + "/" + "somas_solver_output_" + std::to_string(graph->graph_id()) + ".ir"; + if (out_filename.size() > PATH_MAX) { + MS_LOG(ERROR) << "File path " << out_filename << " is too long."; + return; + } +#if defined(_WIN32) || defined(_WIN64) + if (_fullpath(real_path, out_filename.c_str(), PATH_MAX) == nullptr) { + MS_LOG(DEBUG) << "dir " << out_filename << " does not exit."; + } +#else + if (realpath(out_filename.c_str(), real_path) == nullptr) { + MS_LOG(DEBUG) << "Dir " << out_filename << " does not exit."; + } +#endif + path_string = real_path; + ChangeFileMode(path_string, S_IRWXU); + std::ofstream ofs_2(real_path); + + if (!ofs_2.is_open()) { + MS_LOG(ERROR) << "Open log file '" << real_path << "' failed!"; + return; + } + + for (auto &t : tensors) { + SomasSolverTensorDescPtr tensor = t.second; + int continuous = 0; + if (tensor->left_ == NULL && tensor->right_ != NULL) + continuous = 1; + else if (tensor->left_ != NULL && tensor->right_ != NULL) + continuous = 2; + else if (tensor->left_ != NULL && tensor->right_ == NULL) + continuous = 3; + const size_t alignment = 512; + bool size_aligned = tensor->size_ % alignment == 0; + bool offset_aligned = tensor->offset_ % alignment == 0; + + ofs_2 << std::endl + << "tensor_id=" << tensor->index_ << "\tsize=" << tensor->size_ << "\toffset=" << tensor->offset_ + << "\tcontinuous=" << continuous << "\tsize_aligned=" << size_aligned + << "\toffset_aligned=" << offset_aligned; + } + ofs_2.close(); + + MS_LOG(INFO) << "SomasSolver::Log done"; +} +} // namespace somas +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.h b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.h new file mode 100644 index 0000000000..d331b2d95e --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_solver_pre.h @@ -0,0 +1,159 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_SOLVER_PRE_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_SOLVER_PRE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "backend/session/kernel_graph.h" + +using std::unordered_map; +using std::vector; + +namespace mindspore { +namespace somas { +enum Status { FAILED, SUCCESS }; +enum AlgorithmType { kManyObjects = 0, kSingleObject, kNumAlgorithmTypes }; +enum SortingType { + kGreaterSizeSmallerIndex = 0, +#ifdef SOMAS_DEBUG + kGreaterSizeGreaterIndex, + kGreaterSizeSmallerConstraintsSmallerIndex, + kGreaterSizeSmallerConstraintsGreaterIndex, + kGreaterSizeGreaterConstraintsSmallerIndex, + kGreaterSizeGreaterConstraintsGreaterIndex, +#endif + kNumSortingTypes +}; +enum FittingType { + kBest = 0, + kSmallest, +#ifdef SOMAS_DEBUG + kLargest, + kWorst, +#endif + kNumFittingTypes +}; + +class Array { + public: + Array(const size_t &rows, const size_t &cols) : rows_(rows), cols_(cols) { + conflicts_array_ = std::make_unique(rows * cols); + for (uint32_t i = 0; i < rows * cols; i++) { + conflicts_array_[i] = 1; + } + } + + Array(const Array &array) : rows_(array.rows_), cols_(array.cols_) { + conflicts_array_ = std::make_unique(array.rows_ * array.cols_); + for (uint32_t i = 0; i < array.rows_ * array.cols_; i++) { + conflicts_array_[i] = array.conflicts_array_[i]; + } + } + + Array &operator=(const Array &array) { return *this; } + + int &operator()(const size_t &i, const size_t &j) { + assert((i * cols_ + j) < (rows_ * cols_)); + return conflicts_array_[i * cols_ + j]; + } + + const size_t &Rows() { return rows_; } + const size_t &Cols() { return cols_; } + + private: + const size_t rows_; + const size_t cols_; + std::unique_ptr conflicts_array_; +}; + +struct SomasSolverTensorDesc { + size_t index_; + size_t size_; + size_t offset_; + bool lifelong_; + size_t constraints_; + using SomasSolverTensorDescPtr = std::shared_ptr; + SomasSolverTensorDescPtr right_; + SomasSolverTensorDescPtr left_; + bool blocked_; + + SomasSolverTensorDesc() = default; + + SomasSolverTensorDesc(size_t index, size_t size, size_t offset, bool blifelong) + : index_(index), size_(size), offset_(offset), lifelong_(blifelong) { + constraints_ = 0; + right_ = NULL; + left_ = NULL; + blocked_ = false; + } + + void Update(size_t index, size_t size, size_t offset, bool blifelong, size_t constraints) { + index_ = index; + size_ = size; + offset_ = offset; + lifelong_ = blifelong; + constraints_ = constraints; + } + + friend std::ostream &operator<<(std::ostream &out, const SomasSolverTensorDescPtr n) { + out << n->index_ << " " << n->size_ << " " << n->offset_ << "\n"; + return out; + } + friend std::istream &operator>>(std::istream &in, SomasSolverTensorDescPtr n) { + in >> n->index_ >> n->size_ >> n->offset_; + return in; + } +}; +using SomasSolverTensorDescPtr = std::shared_ptr; + +class SomasSolverPre { + public: + SomasSolverPre() = default; + ~SomasSolverPre() = default; + + SomasSolverPre(const SomasSolverPre &) = delete; + SomasSolverPre &operator=(const SomasSolverPre &) = delete; + + size_t GetMaxOffset() { return max_offset_; } + + Status Solving(const session::KernelGraph *graph, std::unordered_map *tensors, + std::shared_ptr pConstraints, const vector> &continuous_v, + bool bVerifySolution, // true -> Check continuous and non overlapping constraints solution + bool ball = true, // true -> run full set of heuristics, false -> run single heuristic specified + SortingType sorting = kGreaterSizeSmallerIndex, FittingType fitting = kBest, + AlgorithmType algorithm = kManyObjects); + + void Log(const session::KernelGraph *graph, const unordered_map &tensors, + const std::shared_ptr &pConstraints_v, const vector> &continuous_v); + + private: + size_t max_offset_; +}; +using SomasSolverPrePtr = std::shared_ptr; +} // namespace somas +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_SOLVER_PRE_H_ diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_stream.cc b/mindspore/ccsrc/backend/optimizer/somas/somas_stream.cc new file mode 100644 index 0000000000..20cd8b1b3f --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_stream.cc @@ -0,0 +1,53 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include "backend/optimizer/somas/somas_stream.h" + +namespace mindspore { +namespace somas { +void SomasStream::ComputeAncestorStreams() { + // (Naive) algorithm: for a given stream, compute its ancestors assuming only distance 1 ancestors are known (handles + // cycles between streams) + std::set current_level, temp_level, already_visited; + auto thisPtr = std::make_shared(id_); + already_visited.insert(thisPtr); + // Initialize current level to distance 2 ancestors + for (auto stream1 : ancestor_streams_) { + already_visited.insert(stream1); + for (auto stream2 : stream1->ancestor_streams_) { + if (std::find(already_visited.begin(), already_visited.end(), stream2) == already_visited.end()) + current_level.insert(stream2); + } + } + + while (!current_level.empty()) { + // Push current level into ancestors + for (auto stream1 : current_level) { + ancestor_streams_.insert(stream1); + already_visited.insert(stream1); + // Keep next level of this ancestor + for (auto stream2 : stream1->ancestor_streams_) { + if (std::find(already_visited.begin(), already_visited.end(), stream2) == already_visited.end()) + temp_level.insert(stream2); + } + } + current_level.clear(); + current_level = temp_level; + temp_level.clear(); + } +} +} // namespace somas +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_stream.h b/mindspore/ccsrc/backend/optimizer/somas/somas_stream.h new file mode 100644 index 0000000000..9cd421334b --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_stream.h @@ -0,0 +1,61 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_STREAM_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_STREAM_H_ + +#include "backend/optimizer/somas/somas_node.h" +#include "backend/optimizer/somas/somas_tensor.h" + +#include +#include +#include + +namespace mindspore { +namespace somas { +class SomasNode; +class SomasTensor; + +using SomasTensorPtr = std::shared_ptr; + +class SomasStream { + public: + using SomasStreamPtr = std::shared_ptr; + + // Attributes mutated in code + std::vector tensors_; // vector needed for same-stream loop in ConflictComputing() + std::set ancestor_streams_; + std::set ancestor_streams_group_; + + // Constructors/Destructors + explicit SomasStream(int64_t id) : id_(id) {} + SomasStream(const SomasStream &) = delete; + SomasStream &operator=(const SomasStream &) = delete; + ~SomasStream() = default; + + // Accessors + const int64_t &GetId() const { return id_; } + + // Ancestor Computing + void ComputeAncestorStreams(); // Given "ancestors at distance one" information, compute "ancestors at any distance" + + private: + const int64_t id_{0}; +}; +} // namespace somas +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_STREAM_H_ diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_tensor.cc b/mindspore/ccsrc/backend/optimizer/somas/somas_tensor.cc new file mode 100644 index 0000000000..08a89da410 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_tensor.cc @@ -0,0 +1,64 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include "backend/optimizer/somas/somas_tensor.h" +#include "backend/optimizer/somas/somas_node.h" +#include "backend/optimizer/somas/somas_stream.h" +#include "backend/optimizer/somas/somas.h" + +namespace mindspore { +namespace somas { +SomasTensor::SomasTensor(size_t id, SomasNodePtr source_node, SomasStreamPtr source_stream, size_t real_size, + LifeLongType lifelong_value) + : lifelong_value_(lifelong_value), + type_(kUnknown), + offset_(0), + id_(id), + source_node_(source_node), + source_stream_(source_stream), + original_size_(real_size) { + const size_t alignment = 512; + const size_t alignment_complement = 31; + aligned_size_ = (real_size > 0) ? (real_size + alignment + alignment_complement) / alignment * alignment : 0; + + solver_tensor_desc_ = std::make_shared(id_, aligned_size_, offset_, false); + + ref_overlap_ = false; + between_streams_ = false; + num_constraints_ = 0; +} + +SomasSolverTensorDescPtr SomasTensor::GetSolverTensorDesc() { + if (type_ == kGap) { // ignore lifelong_ value for gaps given to solver, and pass with original_size_ + solver_tensor_desc_->Update(id_, original_size_, offset_, false, num_constraints_); + } else { + solver_tensor_desc_->Update(id_, aligned_size_, offset_, lifelong_value_ == kLifeLongGraphAll, num_constraints_); + } + if (aligned_size_ == 0) { // ignore zero-size tensors for solver + return nullptr; + } else { + return solver_tensor_desc_; + } +} + +void SomasTensor::ComputeMaxDestinationId() { + for (SomasStreamPtr stream : destinationStreams_) max_destination_id_[stream] = 0; + + for (SomasNodePtr node : destinations_) + if (node->GetId() > max_destination_id_[node->GetStream()]) max_destination_id_[node->GetStream()] = node->GetId(); +} +} // namespace somas +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/somas/somas_tensor.h b/mindspore/ccsrc/backend/optimizer/somas/somas_tensor.h new file mode 100644 index 0000000000..708a9942e1 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/somas/somas_tensor.h @@ -0,0 +1,129 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_TENSOR_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_TENSOR_H_ + +#include +#include +#include +#include + +#include "backend/optimizer/somas/somas_node.h" +#include "backend/optimizer/somas/somas_solver_pre.h" +#include "backend/optimizer/somas/somas_stream.h" + +namespace mindspore { +namespace somas { +class SomasNode; +class SomasStream; + +// Lifetime type +struct Lifetime { + size_t start_; + size_t end_; + + explicit Lifetime(size_t start = 0, size_t end = 0) : start_(start), end_(end) {} +}; +using lifetime_t = struct Lifetime; + +// Tensor type +enum TensorType { + kCommon, + kOutputOnly, + kWorkspace, + kGetNextOutput, + kSummaryInput, + kRefNodeInput, + kRefNodeOutput, + kGap, + kUnknown +}; + +enum LifeLongType { + kLifeLongNone, // life time is from tensor start to tensor end + kLifeLongGraphAll, // life time is from graph start to graph end + kLifeLongGraphStart, // life time is from graph start to tensor end + kLifeLongGraphEnd // life time is from tensor start to graph end +}; + +using SomasNodePtr = std::shared_ptr; +using SomasStreamPtr = std::shared_ptr; + +class SomasTensor { + public: + using SomasTensorPtr = std::shared_ptr; + + size_t aligned_size_{0}; + LifeLongType lifelong_value_; + + bool ref_overlap_; + bool between_streams_; + + lifetime_t lifetime_; + TensorType type_; + + size_t offset_{0}; + size_t num_constraints_{0}; + + std::set destinations_; + std::set destinationStreams_; + unordered_map max_destination_id_; + + // Constructors/Destructors + explicit SomasTensor(size_t id, SomasNodePtr source_node, SomasStreamPtr source_stream, size_t real_size, + LifeLongType lifelong_value = kLifeLongNone); + SomasTensor(const SomasTensor &) = delete; + SomasTensor &operator=(const SomasTensor &) = delete; + ~SomasTensor() = default; + + // Accessors + const size_t &GetId() { return id_; } + SomasNodePtr GetSourceNode() const { return source_node_; } + SomasStreamPtr GetSourceStream() const { return source_stream_; } + const size_t &GetOriginalSize() { return original_size_; } + const size_t &GetAlignedSize() { return aligned_size_; } + bool IsLifelong() { return lifelong_value_ == kLifeLongGraphAll; } + bool IsWorkspace() { return type_ == kWorkspace; } + bool IsOutputOnly() { return type_ == kOutputOnly; } + size_t GetOffset() { return offset_; } + bool IsBetweenStreams() { return between_streams_; } + bool IsSemiLifelongStart() { return lifelong_value_ == kLifeLongGraphStart; } + bool IsSemiLifelongEnd() { return lifelong_value_ == kLifeLongGraphEnd; } + bool IsRefOverlap() { return ref_overlap_; } + bool IsGap() { return type_ == kGap; } + + // Computing functions + void SetOffset(size_t start_offset = 0) { + if (aligned_size_ != 0 && type_ != kGetNextOutput) { + offset_ = start_offset + solver_tensor_desc_->offset_; + } + } + SomasSolverTensorDescPtr GetSolverTensorDesc(); + void ComputeMaxDestinationId(); + + private: + const size_t id_{0}; + const SomasNodePtr source_node_; + SomasStreamPtr const source_stream_; + const size_t original_size_{0}; + + SomasSolverTensorDescPtr solver_tensor_desc_; +}; +} // namespace somas +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_SOMAS_SOMAS_TENSOR_H_ diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index 4dbda945b9..5bc6ece84c 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -1,3 +1,4 @@ + /** * Copyright 2019 Huawei Technologies Co., Ltd * @@ -15,21 +16,21 @@ */ #include "runtime/device/kernel_runtime.h" -#include -#include -#include #include -#include "utils/ms_utils.h" -#include "common/trans.h" -#include "utils/utils.h" -#include "utils/ms_context.h" -#include "frontend/operator/ops.h" -#include "backend/session/kernel_graph.h" -#include "backend/session/anf_runtime_algorithm.h" +#include +#include +#include #include "backend/optimizer/common/helper.h" +#include "backend/session/anf_runtime_algorithm.h" +#include "backend/session/kernel_graph.h" +#include "common/trans.h" #include "debug/data_dump/dump_json_parser.h" +#include "frontend/operator/ops.h" #include "ir/value.h" +#include "utils/ms_context.h" +#include "utils/ms_utils.h" #include "utils/shape_utils.h" +#include "utils/utils.h" using mindspore::kernel::Address; using mindspore::kernel::AddressPtr; @@ -440,6 +441,9 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(MemType type, const AnfNode if (type == kReuseDynamicMem) { // reuse communication op's all outputs' memory type = kReuseDynamicCommMem; + } + + if (type == kReuseDynamicCommMem || type == kSomasReuseDynamicMem) { bool not_reuse = KernelMemNotReuse(node); if (not_reuse) { type = kDynamicMem; @@ -504,7 +508,7 @@ void KernelRuntime::AssignCommunicationNodeInputMem(MemType type, const AnfNodeP return; } - if (type == kReuseDynamicMem) { + if (type == kReuseDynamicMem || type == kSomasReuseDynamicMem) { bool not_reuse = KernelMemNotReuse(node); if (not_reuse) { type = kDynamicMem; @@ -530,13 +534,13 @@ void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, in if (node->isa()) { bool independent = AnfAlgo::IsIndependentNode(node->cast()); - if (independent && type == kReuseDynamicMem) { - MS_LOG(INFO) << "Independent disable mem_reuse"; + if (independent && (type == kReuseDynamicMem || type == kSomasReuseDynamicMem)) { + MS_LOG(INFO) << "Independent node " << node->fullname_with_scope() << " disable memory reuse"; type = kDynamicMem; } } - if (type == kReuseDynamicMem) { + if (type == kReuseDynamicMem || type == kSomasReuseDynamicMem) { bool not_reuse = KernelMemNotReuse(node); if (not_reuse) { type = kDynamicMem; @@ -671,8 +675,13 @@ void KernelRuntime::AssignDynamicMemory(session::KernelGraph *graph) { if (is_enable_mem_reuse) { MS_LOG(INFO) << "Memory Reuse is enable..."; +#ifdef MEM_REUSE_DEBUG mem_manager_->MallocReusedDynamicMem(graph); mem_type = kReuseDynamicMem; +#else + mem_manager_->MallocSomasDynamicMem(graph); + mem_type = kSomasReuseDynamicMem; +#endif } else { MS_LOG(INFO) << "Memory Reuse is disable..."; } diff --git a/mindspore/ccsrc/runtime/device/memory_manager.cc b/mindspore/ccsrc/runtime/device/memory_manager.cc index 88d6ce8511..102c1c5a2b 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/memory_manager.cc @@ -15,6 +15,7 @@ */ #include "runtime/device/memory_manager.h" +#include #include "backend/session/anf_runtime_algorithm.h" #include "utils/ms_context.h" using mindspore::memreuse::BestFitMemReuse; @@ -47,6 +48,40 @@ void MemoryManager::MallocReusedDynamicMem(const session::KernelGraph *graph) { mem_reuse_util_ptr_->set_mem_base(base_ptr); } +void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + SomasPtr somas_reuse_util_ptr = std::make_shared(); + MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr); + somas_reuse_util_ptr_ = somas_reuse_util_ptr; + + if (!(somas_reuse_util_ptr->Allocate(graph))) { + MS_LOG(EXCEPTION) << "Somas Allocate Failed."; + } + + size_t total_allocated_size = somas_reuse_util_ptr->GetTotalMemSize(); + MS_LOG(INFO) << "Graph " << graph->graph_id() << ": TotalSomasReuseDynamicSize [" << total_allocated_size << "]"; + auto base_ptr = MallocDynamicMem(total_allocated_size, false); + MS_LOG(INFO) << "Somas Reuse Memory Base Address [" << static_cast(base_ptr) << "], End Address [" + << static_cast(base_ptr + total_allocated_size) << "]"; + somas_reuse_util_ptr->set_mem_base_addr(base_ptr); + + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (save_graphs) { + std::string file_path = + save_graphs_path + "/" + "somas_after_allocate_" + std::to_string(graph->graph_id()) + ".ir"; + somas_reuse_util_ptr_->DumpSomasBasicIR(file_path); + + std::string mem_file_path = save_graphs_path + "/" + "somas_mem_info_" + std::to_string(graph->graph_id()) + ".ir"; + somas_reuse_util_ptr_->DumpSomasMemoryIR(mem_file_path); + } +} + uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, MemType type, size_t size, const DeviceAddressPtr &address) { MS_EXCEPTION_IF_NULL(node); @@ -68,6 +103,9 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me } else if (type == kReuseDynamicCommMem) { MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_); ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index); + } else if (type == kSomasReuseDynamicMem) { + MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); + ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index); } else { ptr = MallocDynamicMem(size, communication_mem); } @@ -83,6 +121,9 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me } else if (type == kReuseDynamicMem) { MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_); ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index); + } else if (type == kSomasReuseDynamicMem) { + MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); + ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index); } address->ptr_ = ptr; return ptr; @@ -92,6 +133,9 @@ uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, if (type == kReuseDynamicMem) { MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_); return mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); + } else if (type == kSomasReuseDynamicMem) { + MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); + return somas_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); } return MallocDynamicMem(size, false); } diff --git a/mindspore/ccsrc/runtime/device/memory_manager.h b/mindspore/ccsrc/runtime/device/memory_manager.h index ad2142d1a2..ffde7c8a90 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.h +++ b/mindspore/ccsrc/runtime/device/memory_manager.h @@ -17,16 +17,18 @@ #ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_MEMORY_MANAGER_H_ #define MINDSPORE_CCSRC_RUNTIME_DEVICE_MEMORY_MANAGER_H_ #include -#include #include +#include #include "backend/optimizer/mem_reuse/mem_reuse.h" #include "backend/optimizer/mem_reuse/mem_reuse_allocator.h" +#include "backend/optimizer/somas/somas.h" namespace mindspore { namespace device { -enum MemType { kStaticMem, kDynamicMem, kReuseDynamicMem, kReuseDynamicCommMem }; +enum MemType { kStaticMem, kDynamicMem, kReuseDynamicMem, kSomasReuseDynamicMem, kReuseDynamicCommMem }; const int kGetAllOuts = -1; const uint64_t kMemAlignSize = 512; using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr; +using SomasPtr = mindspore::somas::SomasPtr; class MemoryManager { public: @@ -42,6 +44,7 @@ class MemoryManager { virtual void ClearGlobalIdleMem() {} void MallocReusedDynamicMem(const session::KernelGraph *graph); + void MallocSomasDynamicMem(const session::KernelGraph *graph); uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, MemType type, size_t size, const DeviceAddressPtr &address); uint8_t *MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size); @@ -68,6 +71,7 @@ class MemoryManager { size_t total_static_size_ = 0; size_t total_dynamic_size_ = 0; MemReuseUtilPtr mem_reuse_util_ptr_{nullptr}; + SomasPtr somas_reuse_util_ptr_{nullptr}; }; } // namespace device } // namespace mindspore